From d43f75e4cc62341c7a32355236b2a1f22dea0232 Mon Sep 17 00:00:00 2001
From: Double_V
Date: Tue, 20 Oct 2020 13:39:53 +0800
Subject: [PATCH 001/185] add rois_num for roi_align xpu OP (#28077)

* add stack pool2d roi_align xpu op,test=kunlun
* error message opt, test=kunlun
* add xpu unittest,test=kunlun
* skip check grad,test=kunlun
* fix boostget , test=kunlun
* error message opt for XPU, test=kunlun
* add rois_num for roi_align xpu OP, test=develop
---
 paddle/fluid/operators/roi_align_op_xpu.cc    | 42 +++++++++++++++----
 .../unittests/xpu/test_roi_align_op_xpu.py    | 24 +++++++++++
 2 files changed, 58 insertions(+), 8 deletions(-)

diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc
index 2c3bfdbc16b4d..75bd94142e6b7 100644
--- a/paddle/fluid/operators/roi_align_op_xpu.cc
+++ b/paddle/fluid/operators/roi_align_op_xpu.cc
@@ -39,14 +39,40 @@ class XPUROIAlignOpKernel : public framework::OpKernel<T> {
     int width = in_dims[3];
     int rois_num = rois->dims()[0];
     const T* input_data = in->data<T>();
-    auto rois_lod = rois->lod().back();
-    int rois_batch_size = rois_lod.size() - 1;
-    PADDLE_ENFORCE_EQ(
-        rois_batch_size, batch_size,
-        platform::errors::InvalidArgument(
-            "The rois_batch_size and imgs batch_size of roi_align_xpu OP must "
-            "be the same. But received rois_batch_size %d , batch_size %d",
-            rois_batch_size, batch_size));
+
+    framework::Tensor _roi_batch_list;
+    _roi_batch_list.Resize({rois_num});
+    int* rois_lod = _roi_batch_list.mutable_data<int>(ctx.GetPlace());
+    int rois_batch_size = 1;
+    if (ctx.HasInput("RoisNum")) {
+      auto* rois_num_t = ctx.Input<framework::Tensor>("RoisNum");
+      rois_batch_size = rois_num_t->numel();
+      PADDLE_ENFORCE_EQ(
+          rois_batch_size, batch_size,
+          platform::errors::InvalidArgument(
+              "The batch size of rois and the batch size of images "
+              " must be the same. But received the batch size of rois is %d, "
+              "and the batch size of images is %d",
+              rois_batch_size, batch_size));
+      auto* rois_num_data = rois_num_t->data<int>();
+      rois_lod[0] = 0;
+      for (int n = 0; n < rois_batch_size; ++n) {
+        rois_lod[n + 1] = rois_lod[n] + rois_num_data[n];
+      }
+    } else {
+      auto _rois_lod = rois->lod().back();
+      rois_batch_size = _rois_lod.size() - 1;
+      for (int n = 0; n < _rois_lod.size(); ++n) {
+        rois_lod[n] = _rois_lod[n];
+      }
+      PADDLE_ENFORCE_EQ(
+          rois_batch_size, batch_size,
+          platform::errors::InvalidArgument(
+              "The rois_batch_size and imgs batch_size of roi_align_xpu OP "
+              "must "
+              "be the same. 
But received rois_batch_size %d , batch_size %d", + rois_batch_size, batch_size)); + } int rois_num_with_lod = rois_lod[rois_batch_size]; PADDLE_ENFORCE_EQ( rois_num, rois_num_with_lod, diff --git a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py index 813bbffefcb34..70f03edb6bac6 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_roi_align_op_xpu.py @@ -179,5 +179,29 @@ def test_check_output(self): self.check_output_with_place(place) +class TestROIAlignInLodOp(TestROIAlignOp): + def set_data(self): + self.init_test_case() + self.make_rois() + self.calc_roi_align() + + seq_len = self.rois_lod[0] + + self.inputs = { + 'X': self.x, + 'ROIs': (self.rois[:, 1:5], self.rois_lod), + 'RoisNum': np.asarray(seq_len).astype('int32') + } + + self.attrs = { + 'spatial_scale': self.spatial_scale, + 'pooled_height': self.pooled_height, + 'pooled_width': self.pooled_width, + 'sampling_ratio': self.sampling_ratio + } + + self.outputs = {'Out': self.out_data} + + if __name__ == '__main__': unittest.main() From af709240612919ec7cf5b0e8fac9a6debda990df Mon Sep 17 00:00:00 2001 From: zhang wenhui Date: Tue, 20 Oct 2020 14:27:03 +0800 Subject: [PATCH 002/185] fix test_group_norm_op_v2.py, test=develop (#28104) --- python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py index 47761eb5eaf87..2ba79cc9e4396 100644 --- a/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_group_norm_op_v2.py @@ -91,7 +91,7 @@ def compute_v2(x_np): x = np.random.randn(*shape).astype("float32") y1 = compute_v1(x) y2 = compute_v2(x) - self.assertTrue(np.allclose(y1, y2)) + self.assertTrue(np.allclose(y1, y2, atol=1e-5)) if __name__ == '__main__': From 5a589b2f864e57a890f5305a8be5e85c25cb4d15 Mon Sep 17 00:00:00 2001 From: hong <43953930+phlrain@users.noreply.github.com> Date: Tue, 20 Oct 2020 14:30:53 +0800 Subject: [PATCH 003/185] reduce imperative ocr attention config; test=develop (#28079) --- .../unittests/test_imperative_ocr_attention_model.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index 5400b785d2929..afe50664ef2eb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -29,11 +29,11 @@ class Config(object): config for training ''' # encoder rnn hidden_size - encoder_size = 64 + encoder_size = 16 # decoder size for decoder stage - decoder_size = 64 + decoder_size = 16 # size for word embedding - word_vector_dim = 64 + word_vector_dim = 16 # max length for label padding max_length = 5 # optimizer setting @@ -41,9 +41,9 @@ class Config(object): learning_rate_decay = None # batch size to train - batch_size = 16 + batch_size = 8 # class number to classify - num_classes = 481 + num_classes = 64 use_gpu = False # special label for start and end @@ -376,7 +376,7 @@ def test_while_op(self): seed = 90 epoch_num = 1 if core.is_compiled_with_cuda(): - batch_num = 10 + batch_num = 6 else: batch_num = 4 np.random.seed = seed From 
74fadeb44ad4b1bbba47272f66987da4ddf4de98 Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Tue, 20 Oct 2020 15:11:41 +0800
Subject: [PATCH 004/185] Add Ubuntu18 dockerfile with cuda11 (#28083)

---
 tools/dockerfile/Dockerfile.ubuntu18          | 127 ++++++++++++++++++
 tools/dockerfile/build_scripts/install_trt.sh |  15 +++
 tools/dockerfile/centos6_manylinux.sh         |  25 +++-
 3 files changed, 166 insertions(+), 1 deletion(-)
 create mode 100644 tools/dockerfile/Dockerfile.ubuntu18

diff --git a/tools/dockerfile/Dockerfile.ubuntu18 b/tools/dockerfile/Dockerfile.ubuntu18
new file mode 100644
index 0000000000000..f8b7bbf91fb70
--- /dev/null
+++ b/tools/dockerfile/Dockerfile.ubuntu18
@@ -0,0 +1,127 @@
+# A image for building paddle binaries
+# Use cuda devel base image for both cpu and gpu environment
+# When you modify it, please be aware of cudnn-runtime version
+FROM nvidia/cuda:<baseimg>
+MAINTAINER PaddlePaddle Authors
+
+# ENV variables
+ARG WITH_GPU
+ARG WITH_AVX
+
+ENV WITH_GPU=${WITH_GPU:-ON}
+ENV WITH_AVX=${WITH_AVX:-ON}
+ENV DEBIAN_FRONTEND=noninteractive
+
+ENV HOME /root
+# Add bash enhancements
+COPY paddle/scripts/docker/root/ /root/
+
+RUN apt-get update && \
+    apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && \
+    apt-get update && \
+    apt-get install -y curl wget vim git unzip unrar tar xz-utils bzip2 gzip \
+    coreutils ntp language-pack-zh-hans python-qt4 libsm6 libxext6 libxrender-dev
+
+# Downgrade gcc&&g++
+WORKDIR /usr/bin
+  COPY tools/dockerfile/build_scripts /build_scripts
+  RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts
+  RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++
+  RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc
+  RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++
+  RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc
+  RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++
+  ENV PATH=/usr/local/gcc-8.2/bin:$PATH
+
+RUN apt-get update && \
+    apt-get install -y python2.7 python2.7-dev \
+    python3.5 python3.5-dev \
+    python3.6 python3.6-dev \
+    python3.7 python3.7-dev \
+    python3.8 python3.8-dev && \
+    curl https://bootstrap.pypa.io/ez_setup.py -o - | python2.7 && easy_install pip && \
+    curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.5 && easy_install pip && \
+    curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.6 && easy_install pip && \
+    curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.7 && easy_install pip && \
+    curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.8 && easy_install pip && \
+    rm /usr/bin/python && ln -s /usr/bin/python2.7 /usr/bin/python && \
+    rm /usr/bin/python3 && ln -s /usr/bin/python3.5 /usr/bin/python3 && \
+    rm /usr/local/bin/pip && ln -s /usr/local/bin/pip2.7 /usr/local/bin/pip && \
+    rm /usr/local/bin/pip3 && ln -s /usr/local/bin/pip3.5 /usr/local/bin/pip3
+
+
+# install cmake
+WORKDIR /home
+RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz
+ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH
+
+
+# remove them when apt-get support 2.27 and higher version
+RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \
+    tar -xzf binutils-2.33.1.tar.gz && \
+    cd binutils-2.33.1 && \
+    ./configure && make -j && make install && cd .. 
&& rm -rf binutils-2.33.1 binutils-2.33.1.tar.gz + + +# Install Go and glide +RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +RUN pip3 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip3.6 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3.6 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip3.7 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3.7 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip3.8 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3.8 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip --no-cache-dir install ipykernel==4.6.0 wheel + +#For docstring checker +RUN pip3 --no-cache-dir install pylint pytest astroid isort && \ + pip3.6 --no-cache-dir install pylint pytest astroid isort && \ + pip3.7 --no-cache-dir install pylint pytest astroid isort && \ + pip3.8 --no-cache-dir install pylint pytest astroid isort && \ + pip --no-cache-dir install pylint pytest astroid isort + +COPY ./python/requirements.txt /root/ +RUN pip3 --no-cache-dir install -r /root/requirements.txt && \ + pip3.6 --no-cache-dir install -r /root/requirements.txt && \ + pip3.7 --no-cache-dir install -r /root/requirements.txt && \ + pip3.8 --no-cache-dir install -r /root/requirements.txt && \ + pip --no-cache-dir install -r /root/requirements.txt + + +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# So install a newer version here. +RUN wget -q http://mirrors.kernel.org/ubuntu/pool/universe/p/patchelf/patchelf_0.10-2_amd64.deb && \ + dpkg -i patchelf_0.10-2_amd64.deb + +# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service +#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +#CMD source ~/.bashrc + +# ccache 3.7.9 +RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ + tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ + ./configure -prefix=/usr/local/ccache-3.7.9 && \ + make -j8 && make install && \ + ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + +EXPOSE 22 diff --git a/tools/dockerfile/build_scripts/install_trt.sh b/tools/dockerfile/build_scripts/install_trt.sh index 70297042bc6f4..02441efbe2b7f 100644 --- a/tools/dockerfile/build_scripts/install_trt.sh +++ b/tools/dockerfile/build_scripts/install_trt.sh @@ -1,4 +1,19 @@ #!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //")
 
 if [[ "$VERSION" == "10.1" ]];then
diff --git a/tools/dockerfile/centos6_manylinux.sh b/tools/dockerfile/centos6_manylinux.sh
index ea9c8a7bf36f0..7ea082baf2b4e 100755
--- a/tools/dockerfile/centos6_manylinux.sh
+++ b/tools/dockerfile/centos6_manylinux.sh
@@ -1,4 +1,19 @@
 #!/bin/bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 set -xe
 
 REPO="${REPO:-paddledocker}"
@@ -18,6 +33,11 @@ function make_cuda101cudnn7() {
   sed -i "s#COPY build_scripts /build_scripts#COPY build_scripts /build_scripts \nRUN bash build_scripts/install_gcc.sh gcc82 \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH#g" Dockerfile.tmp
 }
 
+function make_cuda102cudnn7() {
+  sed 's/<baseimg>/10.2-cudnn7-devel-centos6/g' Dockerfile.centos >Dockerfile.tmp
+  sed -i "s#COPY build_scripts /build_scripts#COPY build_scripts /build_scripts \nRUN bash build_scripts/install_gcc.sh gcc82 \nENV PATH=/usr/local/gcc-8.2/bin:\$PATH#g" Dockerfile.tmp
+}
+
 function main() {
   local CMD=$1
 
@@ -31,6 +51,9 @@ function main() {
     cuda101cudnn7)
       make_cuda101cudnn7
      ;;
+    cuda102cudnn7)
+      make_cuda102cudnn7
+      ;;
    *)
      echo "Make dockerfile error, Without this paramet."
      exit 1
@@ -38,4 +61,4 @@
   esac
 }
 
-main $@
+main "$@"

From 6dd64b0a3008d8a879728dfc9db95115490758d0 Mon Sep 17 00:00:00 2001
From: zhupengyang
Date: Tue, 20 Oct 2020 16:14:02 +0800
Subject: [PATCH 005/185] randperm run error in multi-gpus (#27942)

---
 paddle/fluid/operators/randperm_op.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/randperm_op.h b/paddle/fluid/operators/randperm_op.h
index 02aabb9a7b569..3446c4785bd4a 100644
--- a/paddle/fluid/operators/randperm_op.h
+++ b/paddle/fluid/operators/randperm_op.h
@@ -57,7 +57,7 @@ class RandpermKernel : public framework::OpKernel<T> {
       tmp_tensor.Resize(framework::make_ddim({n}));
       T* tmp_data = tmp_tensor.mutable_data<T>(platform::CPUPlace());
       random_permate<T>(tmp_data, n, seed);
-      framework::TensorCopy(tmp_tensor, platform::CUDAPlace(), out_tensor);
+      framework::TensorCopy(tmp_tensor, ctx.GetPlace(), out_tensor);
     }
   }
 };

From 135b62a4ecef1ef4ce8e4dc910d2e801af91abe9 Mon Sep 17 00:00:00 2001
From: Aurelius84
Date: Tue, 20 Oct 2020 16:34:54 +0800
Subject: [PATCH 006/185] [Dy2stat] Refine code of DygraphToStaticAst (#28103)

* refine code of DygraphToStaticAst

* add __init__ function
---
 .../dygraph_to_static/ast_transformer.py      | 89 ++++++-------
 1 file changed, 29 insertions(+), 60 deletions(-)

diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
index 5050067e48a1b..2c59a66f22be2 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py
@@ -47,6 +47,9 @@ class DygraphToStaticAst(gast.NodeTransformer):
     Main class to transform Dygraph to Static Graph
     """
 
+    def __init__(self):
+        self.translator_logger = logging_utils.TranslatorLogger()
+
     def get_static_ast(self, root):
         # save root for some analysis may need global AST
         self.root = root
@@ -57,71 +60,37 @@ def get_static_ast(self, root):
             self.static_analysis_root)
         return self.static_analysis_root
 
+    def _apply(self, transformer, node_wrapper, log_level):
+        transformer(node_wrapper).transform()
+        self.translator_logger.log_transformed_code(log_level, self.root,
+                                                    transformer.__name__)
+
     def transfer_from_node_type(self, node_wrapper):
-        translator_logger = logging_utils.TranslatorLogger()
-        translator_logger.log(
+        self.translator_logger.log(
             1, "Source code: \n{}".format(ast_to_source_code(self.root)))
         # Generic transformation
         self.visit(node_wrapper.node)
 
-        # Transform basic api of dygraph to static graph and get feed_name_to_arg_name
-        BasicApiTransformer(node_wrapper).transform()
-        translator_logger.log_transformed_code(1, self.root,
-                                               "BasicApiTransformer")
-
-        # Transform Tensor.shape into fluid.layers.shape(Tensor)
-        TensorShapeTransformer(node_wrapper).transform()
-        translator_logger.log_transformed_code(2, self.root,
                                               "TensorShapeTransformer")
-
-        # Transform list used in control flow
-        ListTransformer(node_wrapper).transform()
-        translator_logger.log_transformed_code(3, self.root, "ListTransformer")
-
-        # Transform break/continue in loops
-        BreakContinueTransformer(node_wrapper).transform()
-        translator_logger.log_transformed_code(4, self.root,
                                               "BreakContinueTransformer")
-
-        # Transform return in functions
-        ReturnTransformer(node_wrapper).transform()
-        translator_logger.log_transformed_code(5, self.root,
                                               "ReturnTransformer")
-
-        # Transform logical and/or/not
-        LogicalTransformer(node_wrapper).transform()
-        
translator_logger.log_transformed_code(6, self.root, - "LogicalTransformer") - - # Transform for loop and while loop - LoopTransformer(node_wrapper).transform() - translator_logger.log_transformed_code(7, self.root, "LoopTransformer") - - # Transform all if/else statement of Dygraph into Static Graph. - IfElseTransformer(node_wrapper).transform() - translator_logger.log_transformed_code(8, self.root, - "IfElseTransformer") - - # Transform python assert statement - AssertTransformer(node_wrapper).transform() - translator_logger.log_transformed_code(9, self.root, - "AssertTransformer") - - # Transform all python print statement - PrintTransformer(node_wrapper).transform() - translator_logger.log_transformed_code(10, self.root, - "PrintTransformer") - - # Transform call recursively - CallTransformer(node_wrapper).transform() - translator_logger.log_transformed_code(11, self.root, "CallTransformer") - - # Transform python type casting statement - CastTransformer(node_wrapper).transform() - translator_logger.log_transformed_code(12, self.root, "CastTransformer") - - translator_logger.log_transformed_code(logging_utils.LOG_AllTransformer, - self.root, "All Transformers") + transformers = [ + BasicApiTransformer, # Basic Api + TensorShapeTransformer, # Tensor.shape -> layers.shape(Tensor) + ListTransformer, # List used in control flow + BreakContinueTransformer, # break/continue in loops + ReturnTransformer, # return in functions + LogicalTransformer, # logical and/or/not + LoopTransformer, # for/while -> while_op + IfElseTransformer, # if/else -> cond_op + AssertTransformer, # assert statement + PrintTransformer, # print statement + CallTransformer, # transform call recursively + CastTransformer, # type casting statement + ] + + for index, transformer in enumerate(transformers): + self._apply(transformer, node_wrapper, log_level=index + 1) + + self.translator_logger.log_transformed_code( + logging_utils.LOG_AllTransformer, self.root, "All Transformers") def visit_FunctionDef(self, node): if self.decorate_func_name is None: From 74c8a811276a09f6a774dea98904656468ce56bf Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Tue, 20 Oct 2020 17:21:26 +0800 Subject: [PATCH 007/185] Add pil backend for vision transforms (#28035) * add pil backend --- python/paddle/tests/test_callbacks.py | 2 +- python/paddle/tests/test_transforms.py | 297 ++++- python/paddle/vision/__init__.py | 6 +- python/paddle/vision/datasets/folder.py | 21 +- python/paddle/vision/image.py | 162 +++ python/paddle/vision/transforms/functional.py | 747 +++++++---- .../vision/transforms/functional_cv2.py | 503 ++++++++ .../vision/transforms/functional_pil.py | 458 +++++++ .../vision/transforms/functional_tensor.py | 40 + python/paddle/vision/transforms/transforms.py | 1149 +++++++++-------- 10 files changed, 2556 insertions(+), 829 deletions(-) create mode 100644 python/paddle/vision/image.py create mode 100644 python/paddle/vision/transforms/functional_cv2.py create mode 100644 python/paddle/vision/transforms/functional_pil.py create mode 100644 python/paddle/vision/transforms/functional_tensor.py diff --git a/python/paddle/tests/test_callbacks.py b/python/paddle/tests/test_callbacks.py index b9442c46b8fd4..5c349c5f1d35e 100644 --- a/python/paddle/tests/test_callbacks.py +++ b/python/paddle/tests/test_callbacks.py @@ -105,7 +105,7 @@ def test_callback_verbose_2(self): self.run_callback() def test_visualdl_callback(self): - # visualdl not support python3 + # visualdl not support python2 if 
sys.version_info < (3, ): return diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py index 6c2944d1e750f..ac21f8a6192c4 100644 --- a/python/paddle/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -18,14 +18,19 @@ import cv2 import shutil import numpy as np +from PIL import Image +import paddle +from paddle.vision import get_image_backend, set_image_backend, image_load from paddle.vision.datasets import DatasetFolder from paddle.vision.transforms import transforms import paddle.vision.transforms.functional as F -class TestTransforms(unittest.TestCase): +class TestTransformsCV2(unittest.TestCase): def setUp(self): + self.backend = self.get_backend() + set_image_backend(self.backend) self.data_dir = tempfile.mkdtemp() for i in range(2): sub_dir = os.path.join(self.data_dir, 'class_' + str(i)) @@ -40,6 +45,22 @@ def setUp(self): (400, 300, 3)) * 255).astype('uint8') cv2.imwrite(os.path.join(sub_dir, str(j) + '.jpg'), fake_img) + def get_backend(self): + return 'cv2' + + def create_image(self, shape): + if self.backend == 'cv2': + return (np.random.rand(*shape) * 255).astype('uint8') + elif self.backend == 'pil': + return Image.fromarray((np.random.rand(*shape) * 255).astype( + 'uint8')) + + def get_shape(self, img): + if self.backend == 'pil': + return np.array(img).shape + + return img.shape + def tearDown(self): shutil.rmtree(self.data_dir) @@ -51,27 +72,29 @@ def do_transform(self, trans): def test_trans_all(self): normalize = transforms.Normalize( - mean=[123.675, 116.28, 103.53], std=[58.395, 57.120, 57.375]) + mean=[123.675, 116.28, 103.53], + std=[58.395, 57.120, 57.375], ) trans = transforms.Compose([ - transforms.RandomResizedCrop(224), transforms.GaussianNoise(), + transforms.RandomResizedCrop(224), transforms.ColorJitter( - brightness=0.4, contrast=0.4, saturation=0.4, - hue=0.4), transforms.RandomHorizontalFlip(), - transforms.Permute(mode='CHW'), normalize + brightness=0.4, contrast=0.4, saturation=0.4, hue=0.4), + transforms.RandomHorizontalFlip(), + transforms.Transpose(), + normalize, ]) self.do_transform(trans) def test_normalize(self): normalize = transforms.Normalize(mean=0.5, std=0.5) - trans = transforms.Compose([transforms.Permute(mode='CHW'), normalize]) + trans = transforms.Compose([transforms.Transpose(), normalize]) self.do_transform(trans) def test_trans_resize(self): trans = transforms.Compose([ - transforms.Resize(300, [0, 1]), + transforms.Resize(300), transforms.RandomResizedCrop((280, 280)), - transforms.Resize(280, [0, 1]), + transforms.Resize(280), transforms.Resize((256, 200)), transforms.Resize((180, 160)), transforms.CenterCrop(128), @@ -79,13 +102,6 @@ def test_trans_resize(self): ]) self.do_transform(trans) - def test_trans_centerCrop(self): - trans = transforms.Compose([ - transforms.CenterCropResize(224), - transforms.CenterCropResize(128, 160), - ]) - self.do_transform(trans) - def test_flip(self): trans = transforms.Compose([ transforms.RandomHorizontalFlip(1.0), @@ -96,7 +112,7 @@ def test_flip(self): self.do_transform(trans) def test_color_jitter(self): - trans = transforms.BatchCompose([ + trans = transforms.Compose([ transforms.BrightnessTransform(0.0), transforms.HueTransform(0.0), transforms.SaturationTransform(0.0), @@ -106,11 +122,11 @@ def test_color_jitter(self): def test_rotate(self): trans = transforms.Compose([ - transforms.RandomRotate(90), - transforms.RandomRotate([-10, 10]), - transforms.RandomRotate( + transforms.RandomRotation(90), + transforms.RandomRotation([-10, 
10]), + transforms.RandomRotation( 45, expand=True), - transforms.RandomRotate( + transforms.RandomRotation( 10, expand=True, center=(60, 80)), ]) self.do_transform(trans) @@ -119,20 +135,15 @@ def test_pad(self): trans = transforms.Compose([transforms.Pad(2)]) self.do_transform(trans) - fake_img = np.random.rand(200, 150, 3).astype('float32') + fake_img = self.create_image((200, 150, 3)) trans_pad = transforms.Pad(10) fake_img_padded = trans_pad(fake_img) - np.testing.assert_equal(fake_img_padded.shape, (220, 170, 3)) + np.testing.assert_equal(self.get_shape(fake_img_padded), (220, 170, 3)) trans_pad1 = transforms.Pad([1, 2]) trans_pad2 = transforms.Pad([1, 2, 3, 4]) img = trans_pad1(fake_img) img = trans_pad2(img) - def test_erase(self): - trans = transforms.Compose( - [transforms.RandomErasing(), transforms.RandomErasing(value=0.0)]) - self.do_transform(trans) - def test_random_crop(self): trans = transforms.Compose([ transforms.RandomCrop(200), @@ -143,18 +154,19 @@ def test_random_crop(self): trans_random_crop1 = transforms.RandomCrop(224) trans_random_crop2 = transforms.RandomCrop((140, 160)) - fake_img = np.random.rand(500, 400, 3).astype('float32') + fake_img = self.create_image((500, 400, 3)) fake_img_crop1 = trans_random_crop1(fake_img) fake_img_crop2 = trans_random_crop2(fake_img_crop1) - np.testing.assert_equal(fake_img_crop1.shape, (224, 224, 3)) + np.testing.assert_equal(self.get_shape(fake_img_crop1), (224, 224, 3)) - np.testing.assert_equal(fake_img_crop2.shape, (140, 160, 3)) + np.testing.assert_equal(self.get_shape(fake_img_crop2), (140, 160, 3)) trans_random_crop_same = transforms.RandomCrop((140, 160)) img = trans_random_crop_same(fake_img_crop2) - trans_random_crop_bigger = transforms.RandomCrop((180, 200)) + trans_random_crop_bigger = transforms.RandomCrop( + (180, 200), pad_if_needed=True) img = trans_random_crop_bigger(img) trans_random_crop_pad = transforms.RandomCrop((224, 256), 2, True) @@ -165,21 +177,38 @@ def test_grayscale(self): self.do_transform(trans) trans_gray = transforms.Grayscale() - fake_img = np.random.rand(500, 400, 3).astype('float32') + fake_img = self.create_image((500, 400, 3)) fake_img_gray = trans_gray(fake_img) - np.testing.assert_equal(len(fake_img_gray.shape), 3) - np.testing.assert_equal(fake_img_gray.shape[0], 500) - np.testing.assert_equal(fake_img_gray.shape[1], 400) + np.testing.assert_equal(self.get_shape(fake_img_gray)[0], 500) + np.testing.assert_equal(self.get_shape(fake_img_gray)[1], 400) trans_gray3 = transforms.Grayscale(3) - fake_img = np.random.rand(500, 400, 3).astype('float32') + fake_img = self.create_image((500, 400, 3)) fake_img_gray = trans_gray3(fake_img) + def test_tranpose(self): + trans = transforms.Compose([transforms.Transpose()]) + self.do_transform(trans) + + fake_img = self.create_image((50, 100, 3)) + converted_img = trans(fake_img) + + np.testing.assert_equal(self.get_shape(converted_img), (3, 50, 100)) + + def test_to_tensor(self): + trans = transforms.Compose([transforms.ToTensor()]) + fake_img = self.create_image((50, 100, 3)) + + tensor = trans(fake_img) + + assert isinstance(tensor, paddle.Tensor) + np.testing.assert_equal(tensor.shape, (3, 50, 100)) + def test_exception(self): trans = transforms.Compose([transforms.Resize(-1)]) - trans_batch = transforms.BatchCompose([transforms.Resize(-1)]) + trans_batch = transforms.Compose([transforms.Resize(-1)]) with self.assertRaises(Exception): self.do_transform(trans) @@ -203,35 +232,211 @@ def test_exception(self): transforms.Pad([1.0, 2.0, 3.0]) with 
self.assertRaises(TypeError): - fake_img = np.random.rand(100, 120, 3).astype('float32') + fake_img = self.create_image((100, 120, 3)) F.pad(fake_img, '1') with self.assertRaises(TypeError): - fake_img = np.random.rand(100, 120, 3).astype('float32') + fake_img = self.create_image((100, 120, 3)) F.pad(fake_img, 1, {}) with self.assertRaises(TypeError): - fake_img = np.random.rand(100, 120, 3).astype('float32') + fake_img = self.create_image((100, 120, 3)) F.pad(fake_img, 1, padding_mode=-1) with self.assertRaises(ValueError): - fake_img = np.random.rand(100, 120, 3).astype('float32') + fake_img = self.create_image((100, 120, 3)) F.pad(fake_img, [1.0, 2.0, 3.0]) with self.assertRaises(ValueError): - transforms.RandomRotate(-2) + transforms.RandomRotation(-2) with self.assertRaises(ValueError): - transforms.RandomRotate([1, 2, 3]) + transforms.RandomRotation([1, 2, 3]) with self.assertRaises(ValueError): trans_gray = transforms.Grayscale(5) - fake_img = np.random.rand(100, 120, 3).astype('float32') + fake_img = self.create_image((100, 120, 3)) trans_gray(fake_img) + with self.assertRaises(TypeError): + transform = transforms.RandomResizedCrop(64) + transform(1) + + with self.assertRaises(ValueError): + transform = transforms.BrightnessTransform([-0.1, -0.2]) + + with self.assertRaises(TypeError): + transform = transforms.BrightnessTransform('0.1') + + with self.assertRaises(ValueError): + transform = transforms.BrightnessTransform('0.1', keys=1) + + with self.assertRaises(NotImplementedError): + transform = transforms.BrightnessTransform('0.1', keys='a') + def test_info(self): str(transforms.Compose([transforms.Resize((224, 224))])) - str(transforms.BatchCompose([transforms.Resize((224, 224))])) + str(transforms.Compose([transforms.Resize((224, 224))])) + + +class TestTransformsPIL(TestTransformsCV2): + def get_backend(self): + return 'pil' + + +class TestFunctional(unittest.TestCase): + def test_errors(self): + with self.assertRaises(TypeError): + F.to_tensor(1) + + with self.assertRaises(ValueError): + fake_img = Image.fromarray((np.random.rand(28, 28, 3) * 255).astype( + 'uint8')) + F.to_tensor(fake_img, data_format=1) + + with self.assertRaises(TypeError): + fake_img = Image.fromarray((np.random.rand(28, 28, 3) * 255).astype( + 'uint8')) + F.resize(fake_img, '1') + + with self.assertRaises(TypeError): + F.resize(1, 1) + + with self.assertRaises(TypeError): + F.pad(1, 1) + + with self.assertRaises(TypeError): + F.crop(1, 1, 1, 1, 1) + + with self.assertRaises(TypeError): + F.hflip(1) + + with self.assertRaises(TypeError): + F.vflip(1) + + with self.assertRaises(TypeError): + F.adjust_brightness(1, 0.1) + + with self.assertRaises(TypeError): + F.adjust_contrast(1, 0.1) + + with self.assertRaises(TypeError): + F.adjust_hue(1, 0.1) + + with self.assertRaises(TypeError): + F.adjust_saturation(1, 0.1) + + with self.assertRaises(TypeError): + F.rotate(1, 0.1) + + with self.assertRaises(TypeError): + F.to_grayscale(1) + + with self.assertRaises(ValueError): + set_image_backend(1) + + with self.assertRaises(ValueError): + image_load('tmp.jpg', backend=1) + + def test_normalize(self): + np_img = (np.random.rand(28, 24, 3)).astype('uint8') + pil_img = Image.fromarray(np_img) + tensor_img = F.to_tensor(pil_img) + tensor_img_hwc = F.to_tensor(pil_img, data_format='HWC') + + mean = [0.5, 0.5, 0.5] + std = [0.5, 0.5, 0.5] + + normalized_img = F.normalize(tensor_img, mean, std) + normalized_img = F.normalize( + tensor_img_hwc, mean, std, data_format='HWC') + + normalized_img = F.normalize(pil_img, mean, 
std, data_format='HWC') + normalized_img = F.normalize( + np_img, mean, std, data_format='HWC', to_rgb=True) + + def test_center_crop(self): + np_img = (np.random.rand(28, 24, 3)).astype('uint8') + pil_img = Image.fromarray(np_img) + + np_cropped_img = F.center_crop(np_img, 4) + pil_cropped_img = F.center_crop(pil_img, 4) + + np.testing.assert_almost_equal(np_cropped_img, + np.array(pil_cropped_img)) + + def test_pad(self): + np_img = (np.random.rand(28, 24, 3)).astype('uint8') + pil_img = Image.fromarray(np_img) + + np_padded_img = F.pad(np_img, [1, 2], padding_mode='reflect') + pil_padded_img = F.pad(pil_img, [1, 2], padding_mode='reflect') + + np.testing.assert_almost_equal(np_padded_img, np.array(pil_padded_img)) + + pil_p_img = pil_img.convert('P') + pil_padded_img = F.pad(pil_p_img, [1, 2]) + pil_padded_img = F.pad(pil_p_img, [1, 2], padding_mode='reflect') + + def test_resize(self): + np_img = (np.zeros([28, 24, 3])).astype('uint8') + pil_img = Image.fromarray(np_img) + + np_reseized_img = F.resize(np_img, 40) + pil_reseized_img = F.resize(pil_img, 40) + + np.testing.assert_almost_equal(np_reseized_img, + np.array(pil_reseized_img)) + + gray_img = (np.zeros([28, 32])).astype('uint8') + gray_resize_img = F.resize(gray_img, 40) + + def test_to_tensor(self): + np_img = (np.random.rand(28, 28) * 255).astype('uint8') + pil_img = Image.fromarray(np_img) + + np_tensor = F.to_tensor(np_img, data_format='HWC') + pil_tensor = F.to_tensor(pil_img, data_format='HWC') + + np.testing.assert_allclose(np_tensor.numpy(), pil_tensor.numpy()) + + # test float dtype + float_img = np.random.rand(28, 28) + float_tensor = F.to_tensor(float_img) + + pil_img = Image.fromarray(np_img).convert('I') + pil_tensor = F.to_tensor(pil_img) + + pil_img = Image.fromarray(np_img).convert('I;16') + pil_tensor = F.to_tensor(pil_img) + + pil_img = Image.fromarray(np_img).convert('F') + pil_tensor = F.to_tensor(pil_img) + + pil_img = Image.fromarray(np_img).convert('1') + pil_tensor = F.to_tensor(pil_img) + + pil_img = Image.fromarray(np_img).convert('YCbCr') + pil_tensor = F.to_tensor(pil_img) + + def test_image_load(self): + fake_img = Image.fromarray((np.random.random((32, 32, 3)) * 255).astype( + 'uint8')) + + path = 'temp.jpg' + fake_img.save(path) + + set_image_backend('pil') + + pil_img = image_load(path).convert('RGB') + + print(type(pil_img)) + + set_image_backend('cv2') + + np_img = image_load(path) + + os.remove(path) if __name__ == '__main__': diff --git a/python/paddle/vision/__init__.py b/python/paddle/vision/__init__.py index 7d28d567cefa2..db5a94f932934 100644 --- a/python/paddle/vision/__init__.py +++ b/python/paddle/vision/__init__.py @@ -21,6 +21,10 @@ from . import datasets from .datasets import * +from . 
import image +from .image import * + __all__ = models.__all__ \ + transforms.__all__ \ - + datasets.__all__ + + datasets.__all__ \ + + image.__all__ diff --git a/python/paddle/vision/datasets/folder.py b/python/paddle/vision/datasets/folder.py index 19d913504bdf7..d005bc4f19ebb 100644 --- a/python/paddle/vision/datasets/folder.py +++ b/python/paddle/vision/datasets/folder.py @@ -14,6 +14,7 @@ import os import sys +from PIL import Image import paddle from paddle.io import Dataset @@ -136,7 +137,7 @@ def __init__(self, "Found 0 files in subfolders of: " + self.root + "\n" "Supported extensions are: " + ",".join(extensions))) - self.loader = cv2_loader if loader is None else loader + self.loader = default_loader if loader is None else loader self.extensions = extensions self.classes = classes @@ -193,9 +194,23 @@ def __len__(self): '.tiff', '.webp') +def pil_loader(path): + with open(path, 'rb') as f: + img = Image.open(f) + return img.convert('RGB') + + def cv2_loader(path): cv2 = try_import('cv2') - return cv2.imread(path) + return cv2.cvtColor(cv2.imread(path), cv2.COLOR_BGR2RGB) + + +def default_loader(path): + from paddle.vision import get_image_backend + if get_image_backend() == 'cv2': + return cv2_loader(path) + else: + return pil_loader(path) class ImageFolder(Dataset): @@ -280,7 +295,7 @@ def is_valid_file(x): "Found 0 files in subfolders of: " + self.root + "\n" "Supported extensions are: " + ",".join(extensions))) - self.loader = cv2_loader if loader is None else loader + self.loader = default_loader if loader is None else loader self.extensions = extensions self.samples = samples self.transform = transform diff --git a/python/paddle/vision/image.py b/python/paddle/vision/image.py new file mode 100644 index 0000000000000..3d5ea3a73af6c --- /dev/null +++ b/python/paddle/vision/image.py @@ -0,0 +1,162 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from PIL import Image +from paddle.utils import try_import + +__all__ = ['set_image_backend', 'get_image_backend', 'image_load'] + +_image_backend = 'pil' + + +def set_image_backend(backend): + """ + Specifies the backend used to load images in class ``paddle.vision.datasets.ImageFolder`` + and ``paddle.vision.datasets.DatasetFolder`` . Now support backends are pillow and opencv. + If backend not set, will use 'pil' as default. + + Args: + backend (str): Name of the image load backend, should be one of {'pil', 'cv2'}. + + Examples: + + .. 
code-block:: python + + import os + import shutil + import tempfile + import numpy as np + from PIL import Image + + from paddle.vision import DatasetFolder + from paddle.vision import set_image_backend + + set_image_backend('pil') + + def make_fake_dir(): + data_dir = tempfile.mkdtemp() + + for i in range(2): + sub_dir = os.path.join(data_dir, 'class_' + str(i)) + if not os.path.exists(sub_dir): + os.makedirs(sub_dir) + for j in range(2): + fake_img = Image.fromarray((np.random.random((32, 32, 3)) * 255).astype('uint8')) + fake_img.save(os.path.join(sub_dir, str(j) + '.png')) + return data_dir + + temp_dir = make_fake_dir() + + pil_data_folder = DatasetFolder(temp_dir) + + for items in pil_data_folder: + break + + # should get PIL.Image.Image + print(type(items[0])) + + # use opencv as backend + # set_image_backend('cv2') + + # cv2_data_folder = DatasetFolder(temp_dir) + + # for items in cv2_data_folder: + # break + + # should get numpy.ndarray + # print(type(items[0])) + + shutil.rmtree(temp_dir) + """ + global _image_backend + if backend not in ['pil', 'cv2']: + raise ValueError( + "Expected backend are one of ['pil', 'cv2'], but got {}" + .format(backend)) + _image_backend = backend + + +def get_image_backend(): + """ + Gets the name of the package used to load images + + Returns: + str: backend of image load. + + Examples: + + .. code-block:: python + + from paddle.vision import get_image_backend + + backend = get_image_backend() + print(backend) + + """ + return _image_backend + + +def image_load(path, backend=None): + """Load an image. + + Args: + path (str): Path of the image. + backend (str, optional): The image decoding backend type. Options are + `cv2`, `pil`, `None`. If backend is None, the global _imread_backend + specified by ``paddle.vision.set_image_backend`` will be used. Default: None. + + Returns: + PIL.Image or np.array: Loaded image. + + Examples: + + .. code-block:: python + + import numpy as np + from PIL import Image + from paddle.vision import image_load, set_image_backend + + fake_img = Image.fromarray((np.random.random((32, 32, 3)) * 255).astype('uint8')) + + path = 'temp.png' + fake_img.save(path) + + set_image_backend('pil') + + pil_img = image_load(path).convert('RGB') + + # should be PIL.Image.Image + print(type(pil_img)) + + # use opencv as backend + # set_image_backend('cv2') + + # np_img = image_load(path) + # # should get numpy.ndarray + # print(type(np_img)) + + """ + + if backend is None: + backend = _image_backend + if backend not in ['pil', 'cv2']: + raise ValueError( + "Expected backend are one of ['pil', 'cv2'], but got {}" + .format(backend)) + + if backend == 'pil': + return Image.open(path) + else: + cv2 = try_import('cv2') + return cv2.imread(path) diff --git a/python/paddle/vision/transforms/functional.py b/python/paddle/vision/transforms/functional.py index acceb111e6f84..7391ae322e359 100644 --- a/python/paddle/vision/transforms/functional.py +++ b/python/paddle/vision/transforms/functional.py @@ -12,16 +12,18 @@ # See the License for the specific language governing permissions and # limitations under the License. 
+from __future__ import division + import sys -import collections -import random import math -import functools - import numbers -import numpy as np +import warnings +import collections -from paddle.utils import try_import +import numpy as np +from PIL import Image +from numpy import sin, cos, tan +import paddle if sys.version_info < (3, 3): Sequence = collections.Sequence @@ -30,314 +32,623 @@ Sequence = collections.abc.Sequence Iterable = collections.abc.Iterable -__all__ = ['flip', 'resize', 'pad', 'rotate', 'to_grayscale'] +from . import functional_pil as F_pil +from . import functional_cv2 as F_cv2 +from . import functional_tensor as F_t +__all__ = [ + 'to_tensor', 'hflip', 'vflip', 'resize', 'pad', 'rotate', 'to_grayscale', + 'crop', 'center_crop', 'adjust_brightness', 'adjust_contrast', 'adjust_hue', + 'to_grayscale', 'normalize' +] -def keepdims(func): - """Keep the dimension of input images unchanged""" - @functools.wraps(func) - def wrapper(image, *args, **kwargs): - if len(image.shape) != 3: - raise ValueError("Expect image have 3 dims, but got {} dims".format( - len(image.shape))) - ret = func(image, *args, **kwargs) - if len(ret.shape) == 2: - ret = ret[:, :, np.newaxis] - return ret +def _is_pil_image(img): + return isinstance(img, Image.Image) - return wrapper +def _is_tensor_image(img): + return isinstance(img, paddle.Tensor) -@keepdims -def flip(image, code): - """ - Accordding to the code (the type of flip), flip the input image + +def _is_numpy_image(img): + return isinstance(img, np.ndarray) and (img.ndim in {2, 3}) + + +def to_tensor(pic, data_format='CHW'): + """Converts a ``PIL.Image`` or ``numpy.ndarray`` to paddle.Tensor. + + See ``ToTensor`` for more details. Args: - image (np.ndarray): Input image, with (H, W, C) shape - code (int): Code that indicates the type of flip. - -1 : Flip horizontally and vertically - 0 : Flip vertically - 1 : Flip horizontally + pic (PIL.Image|np.ndarray): Image to be converted to tensor. + data_format (str, optional): Data format of input img, should be 'HWC' or + 'CHW'. Default: 'CHW'. + + Returns: + Tensor: Converted image. Data format is same as input img. Examples: .. code-block:: python import numpy as np + from PIL import Image from paddle.vision.transforms import functional as F - fake_img = np.random.rand(224, 224, 3) + fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') - # flip horizontally and vertically - F.flip(fake_img, -1) + fake_img = Image.fromarray(fake_img) - # flip vertically - F.flip(fake_img, 0) + tensor = F.to_tensor(fake_img) + print(tensor.shape) - # flip horizontally - F.flip(fake_img, 1) """ - cv2 = try_import('cv2') - return cv2.flip(image, flipCode=code) + if not (_is_pil_image(pic) or _is_numpy_image(pic)): + raise TypeError('pic should be PIL Image or ndarray. Got {}'.format( + type(pic))) + + if _is_pil_image(pic): + return F_pil.to_tensor(pic, data_format) + else: + return F_cv2.to_tensor(pic, data_format) -@keepdims -def resize(img, size, interpolation=1): +def resize(img, size, interpolation='bilinear'): """ - resize the input data to given size + Resizes the image to given size Args: - input (np.ndarray): Input data, could be image or masks, with (H, W, C) shape + input (PIL.Image|np.ndarray): Image to be resized. size (int|list|tuple): Target size of input data, with (height, width) shape. - interpolation (int, optional): Interpolation method. 
- 0 : cv2.INTER_NEAREST - 1 : cv2.INTER_LINEAR - 2 : cv2.INTER_CUBIC - 3 : cv2.INTER_AREA - 4 : cv2.INTER_LANCZOS4 - 5 : cv2.INTER_LINEAR_EXACT - 7 : cv2.INTER_MAX - 8 : cv2.WARP_FILL_OUTLIERS - 16: cv2.WARP_INVERSE_MAP + interpolation (int|str, optional): Interpolation method. when use pil backend, + support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC, + - "box": Image.BOX, + - "lanczos": Image.LANCZOS, + - "hamming": Image.HAMMING + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "area": cv2.INTER_AREA, + - "bicubic": cv2.INTER_CUBIC, + - "lanczos": cv2.INTER_LANCZOS4 + + Returns: + PIL.Image or np.array: Resized image. Examples: .. code-block:: python import numpy as np + from PIL import Image from paddle.vision.transforms import functional as F - fake_img = np.random.rand(256, 256, 3) + fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') - F.resize(fake_img, 224) + fake_img = Image.fromarray(fake_img) - F.resize(fake_img, (200, 150)) + converted_img = F.resize(fake_img, 224) + print(converted_img.size) + + converted_img = F.resize(fake_img, (200, 150)) + print(converted_img.size) """ - cv2 = try_import('cv2') - if isinstance(interpolation, Sequence): - interpolation = random.choice(interpolation) - - if isinstance(size, int): - h, w = img.shape[:2] - if (w <= h and w == size) or (h <= w and h == size): - return img - if w < h: - ow = size - oh = int(size * h / w) - return cv2.resize(img, (ow, oh), interpolation=interpolation) - else: - oh = size - ow = int(size * w / h) - return cv2.resize(img, (ow, oh), interpolation=interpolation) + if not (_is_pil_image(img) or _is_numpy_image(img)): + raise TypeError( + 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if _is_pil_image(img): + return F_pil.resize(img, size, interpolation) else: - return cv2.resize(img, size[::-1], interpolation=interpolation) + return F_cv2.resize(img, size, interpolation) -@keepdims -def pad(img, padding, fill=(0, 0, 0), padding_mode='constant'): - """Pads the given CV Image on all sides with speficified padding mode and fill value. +def pad(img, padding, fill=0, padding_mode='constant'): + """ + Pads the given PIL.Image or numpy.array on all sides with specified padding mode and fill value. Args: - img (np.ndarray): Image to be padded. - padding (int|tuple): Padding on each border. If a single int is provided this + img (PIL.Image|np.array): Image to be padded. + padding (int|list|tuple): Padding on each border. If a single int is provided this is used to pad all borders. If tuple of length 2 is provided this is the padding on left/right and top/bottom respectively. If a tuple of length 4 is provided this is the padding for the left, top, right and bottom borders respectively. - fill (int|tuple): Pixel fill value for constant fill. Default is 0. If a tuple of + fill (float, optional): Pixel fill value for constant fill. If a tuple of length 3, it is used to fill R, G, B channels respectively. - This value is only used when the padding_mode is constant - padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default is constant. - ``constant`` means padding with a constant value, this value is specified with fill. - ``edge`` means padding with the last value at the edge of the image. 
- ``reflect`` means padding with reflection of image (without repeating the last value on the edge) - padding ``[1, 2, 3, 4]`` with 2 elements on both sides in reflect mode - will result in ``[3, 2, 1, 2, 3, 4, 3, 2]``. - ``symmetric`` menas pads with reflection of image (repeating the last value on the edge) - padding ``[1, 2, 3, 4]`` with 2 elements on both sides in symmetric mode - will result in ``[2, 1, 1, 2, 3, 4, 4, 3]``. + This value is only used when the padding_mode is constant. Default: 0. + padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default: 'constant'. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value on the edge of the image + + - reflect: pads with reflection of image (without repeating the last value on the edge) + + padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image (repeating the last value on the edge) + + padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] Returns: - numpy ndarray: Padded image. + PIL.Image or np.array: Padded image. + + Examples: + .. code-block:: python + + import numpy as np + from PIL import Image + from paddle.vision.transforms import functional as F + + fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') + + fake_img = Image.fromarray(fake_img) + + padded_img = F.pad(fake_img, padding=1) + print(padded_img.size) + + padded_img = F.pad(fake_img, padding=(2, 1)) + print(padded_img.size) + """ + if not (_is_pil_image(img) or _is_numpy_image(img)): + raise TypeError( + 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if _is_pil_image(img): + return F_pil.pad(img, padding, fill, padding_mode) + else: + return F_cv2.pad(img, padding, fill, padding_mode) + + +def crop(img, top, left, height, width): + """Crops the given Image. + + Args: + img (PIL.Image|np.array): Image to be cropped. (0,0) denotes the top left + corner of the image. + top (int): Vertical component of the top left corner of the crop box. + left (int): Horizontal component of the top left corner of the crop box. + height (int): Height of the crop box. + width (int): Width of the crop box. + + Returns: + PIL.Image or np.array: Cropped image. Examples: - .. code-block:: python import numpy as np + from PIL import Image + from paddle.vision.transforms import functional as F - from paddle.vision.transforms.functional import pad + fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') - fake_img = np.random.rand(500, 500, 3).astype('float32') + fake_img = Image.fromarray(fake_img) - fake_img = pad(fake_img, 2) - print(fake_img.shape) + cropped_img = F.crop(fake_img, 56, 150, 200, 100) + print(cropped_img.size) """ + if not (_is_pil_image(img) or _is_numpy_image(img)): + raise TypeError( + 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if _is_pil_image(img): + return F_pil.crop(img, top, left, height, width) + else: + return F_cv2.crop(img, top, left, height, width) + + +def center_crop(img, output_size): + """Crops the given Image and resize it to desired size. + + Args: + img (PIL.Image|np.array): Image to be cropped. (0,0) denotes the top left corner of the image. + output_size (sequence or int): (height, width) of the crop box. If int, + it is used for both directions + + Returns: + PIL.Image or np.array: Cropped image. 
- if not isinstance(padding, (numbers.Number, list, tuple)): - raise TypeError('Got inappropriate padding arg') - if not isinstance(fill, (numbers.Number, str, list, tuple)): - raise TypeError('Got inappropriate fill arg') - if not isinstance(padding_mode, str): - raise TypeError('Got inappropriate padding_mode arg') - - if isinstance(padding, collections.Sequence) and len(padding) not in [2, 4]: - raise ValueError( - "Padding must be an int or a 2, or 4 element tuple, not a " + - "{} element tuple".format(len(padding))) - - assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \ - 'Expected padding mode be either constant, edge, reflect or symmetric, but got {}'.format(padding_mode) - - cv2 = try_import('cv2') - - PAD_MOD = { - 'constant': cv2.BORDER_CONSTANT, - 'edge': cv2.BORDER_REPLICATE, - 'reflect': cv2.BORDER_DEFAULT, - 'symmetric': cv2.BORDER_REFLECT - } - - if isinstance(padding, int): - pad_left = pad_right = pad_top = pad_bottom = padding - if isinstance(padding, collections.Sequence) and len(padding) == 2: - pad_left = pad_right = padding[0] - pad_top = pad_bottom = padding[1] - if isinstance(padding, collections.Sequence) and len(padding) == 4: - pad_left, pad_top, pad_right, pad_bottom = padding - - if isinstance(fill, numbers.Number): - fill = (fill, ) * (2 * len(img.shape) - 3) - - if padding_mode == 'constant': - assert (len(fill) == 3 and len(img.shape) == 3) or (len(fill) == 1 and len(img.shape) == 2), \ - 'channel of image is {} but length of fill is {}'.format(img.shape[-1], len(fill)) - - img = cv2.copyMakeBorder( - src=img, - top=pad_top, - bottom=pad_bottom, - left=pad_left, - right=pad_right, - borderType=PAD_MOD[padding_mode], - value=fill) - - return img - - -@keepdims -def rotate(img, angle, interpolation=1, expand=False, center=None): + Examples: + .. code-block:: python + + import numpy as np + from PIL import Image + from paddle.vision.transforms import functional as F + + fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') + + fake_img = Image.fromarray(fake_img) + + cropped_img = F.center_crop(fake_img, (150, 100)) + print(cropped_img.size) + """ + if not (_is_pil_image(img) or _is_numpy_image(img)): + raise TypeError( + 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if _is_pil_image(img): + return F_pil.center_crop(img, output_size) + else: + return F_cv2.center_crop(img, output_size) + + +def hflip(img, backend='pil'): + """Horizontally flips the given Image or np.array. + + Args: + img (PIL.Image|np.array): Image to be flipped. + backend (str, optional): The image proccess backend type. Options are `pil`, + `cv2`. Default: 'pil'. + + Returns: + PIL.Image or np.array: Horizontall flipped image. + + Examples: + .. code-block:: python + + import numpy as np + from PIL import Image + from paddle.vision.transforms import functional as F + + fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') + + fake_img = Image.fromarray(fake_img) + + flpped_img = F.hflip(fake_img) + print(flpped_img.size) + + """ + if not (_is_pil_image(img) or _is_numpy_image(img)): + raise TypeError( + 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if _is_pil_image(img): + return F_pil.hflip(img) + else: + return F_cv2.hflip(img) + + +def vflip(img): + """Vertically flips the given Image or np.array. + + Args: + img (PIL.Image|np.array): Image to be flipped. + + Returns: + PIL.Image or np.array: Vertically flipped image. + + Examples: + .. 
code-block:: python + + import numpy as np + from PIL import Image + from paddle.vision.transforms import functional as F + + fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') + + fake_img = Image.fromarray(fake_img) + + flpped_img = F.vflip(fake_img) + print(flpped_img.size) + + """ + if not (_is_pil_image(img) or _is_numpy_image(img)): + raise TypeError( + 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if _is_pil_image(img): + return F_pil.vflip(img) + else: + return F_cv2.vflip(img) + + +def adjust_brightness(img, brightness_factor): + """Adjusts brightness of an Image. + + Args: + img (PIL.Image|np.array): Image to be adjusted. + brightness_factor (float): How much to adjust the brightness. Can be + any non negative number. 0 gives a black image, 1 gives the + original image while 2 increases the brightness by a factor of 2. + + Returns: + PIL.Image or np.array: Brightness adjusted image. + + Examples: + .. code-block:: python + + import numpy as np + from PIL import Image + from paddle.vision.transforms import functional as F + + fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') + + fake_img = Image.fromarray(fake_img) + + converted_img = F.adjust_brightness(fake_img, 0.4) + print(converted_img.size) + """ + if not (_is_pil_image(img) or _is_numpy_image(img)): + raise TypeError( + 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if _is_pil_image(img): + return F_pil.adjust_brightness(img, brightness_factor) + else: + return F_cv2.adjust_brightness(img, brightness_factor) + + +def adjust_contrast(img, contrast_factor): + """Adjusts contrast of an Image. + + Args: + img (PIL.Image|np.array): Image to be adjusted. + contrast_factor (float): How much to adjust the contrast. Can be any + non negative number. 0 gives a solid gray image, 1 gives the + original image while 2 increases the contrast by a factor of 2. + + Returns: + PIL.Image or np.array: Contrast adjusted image. + + Examples: + .. code-block:: python + + import numpy as np + from PIL import Image + from paddle.vision.transforms import functional as F + + fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') + + fake_img = Image.fromarray(fake_img) + + converted_img = F.adjust_contrast(fake_img, 0.4) + print(converted_img.size) + """ + if not (_is_pil_image(img) or _is_numpy_image(img)): + raise TypeError( + 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. + format(type(img))) + + if _is_pil_image(img): + return F_pil.adjust_contrast(img, contrast_factor) + else: + return F_cv2.adjust_contrast(img, contrast_factor) + + +def adjust_saturation(img, saturation_factor): + """Adjusts color saturation of an image. + + Args: + img (PIL.Image|np.array): Image to be adjusted. + saturation_factor (float): How much to adjust the saturation. 0 will + give a black and white image, 1 will give the original image while + 2 will enhance the saturation by a factor of 2. + + Returns: + PIL.Image or np.array: Saturation adjusted image. + + Examples: + .. code-block:: python + + import numpy as np + from PIL import Image + from paddle.vision.transforms import functional as F + + fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8') + + fake_img = Image.fromarray(fake_img) + + converted_img = F.adjust_saturation(fake_img, 0.4) + print(converted_img.size) + + """ + if not (_is_pil_image(img) or _is_numpy_image(img)): + raise TypeError( + 'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'. 
+            format(type(img)))
+
+    if _is_pil_image(img):
+        return F_pil.adjust_saturation(img, saturation_factor)
+    else:
+        return F_cv2.adjust_saturation(img, saturation_factor)
+
+
+def adjust_hue(img, hue_factor):
+    """Adjusts hue of an image.
+
+    The image hue is adjusted by converting the image to HSV and
+    cyclically shifting the intensities in the hue channel (H).
+    The image is then converted back to original image mode.
+
+    `hue_factor` is the amount of shift in H channel and must be in the
+    interval `[-0.5, 0.5]`.
+
+    Args:
+        img (PIL.Image|np.array): Image to be adjusted.
+        hue_factor (float): How much to shift the hue channel. Should be in
+            [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in
+            HSV space in positive and negative direction respectively.
+            0 means no shift. Therefore, both -0.5 and 0.5 will give an image
+            with complementary colors while 0 gives the original image.
+
+    Returns:
+        PIL.Image or np.array: Hue adjusted image.
+
+    Examples:
+        .. code-block:: python
+
+            import numpy as np
+            from PIL import Image
+            from paddle.vision.transforms import functional as F
+
+            fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8')
+
+            fake_img = Image.fromarray(fake_img)
+
+            converted_img = F.adjust_hue(fake_img, 0.4)
+            print(converted_img.size)
+
+    """
+    if not (_is_pil_image(img) or _is_numpy_image(img)):
+        raise TypeError(
+            'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'.
+            format(type(img)))
+
+    if _is_pil_image(img):
+        return F_pil.adjust_hue(img, hue_factor)
+    else:
+        return F_cv2.adjust_hue(img, hue_factor)
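A `hue_factor` of 0.5 walks half-way around the HSV hue circle, mapping each colour to (approximately) its complement; a quick check using the dispatcher defined above:

.. code-block:: python

    from PIL import Image
    from paddle.vision.transforms import functional as F

    red = Image.new('RGB', (1, 1), (255, 0, 0))
    # a 0.5 shift lands near the complementary colour (roughly cyan for red);
    # the exact channel values depend on the uint8 hue rounding
    print(F.adjust_hue(red, 0.5).getpixel((0, 0)))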
+
+
+def rotate(img, angle, resample=False, expand=False, center=None, fill=0):
     """Rotates the image by angle.
+
     Args:
-        img (numpy.ndarray): Image to be rotated.
-        angle (float|int): In degrees clockwise order.
-        interpolation (int, optional): Interpolation method. Default: 1.
-            0 : cv2.INTER_NEAREST
-            1 : cv2.INTER_LINEAR
-            2 : cv2.INTER_CUBIC
-            3 : cv2.INTER_AREA
-            4 : cv2.INTER_LANCZOS4
-            5 : cv2.INTER_LINEAR_EXACT
-            7 : cv2.INTER_MAX
-            8 : cv2.WARP_FILL_OUTLIERS
-            16: cv2.WARP_INVERSE_MAP
-        expand (bool|optional): Optional expansion flag.
+        img (PIL.Image|np.array): Image to be rotated.
+        angle (float or int): In degrees counter clockwise order.
+        resample (int|str, optional): An optional resampling filter. If omitted, or if the
+            image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST
+            according to the backend. When using the pil backend, supported methods are as follows:
+            - "nearest": Image.NEAREST,
+            - "bilinear": Image.BILINEAR,
+            - "bicubic": Image.BICUBIC
+            When using the cv2 backend, supported methods are as follows:
+            - "nearest": cv2.INTER_NEAREST,
+            - "bilinear": cv2.INTER_LINEAR,
+            - "bicubic": cv2.INTER_CUBIC
+        expand (bool, optional): Optional expansion flag.
             If true, expands the output image to make it large enough to hold the entire rotated image.
             If false or omitted, make the output image the same size as the input image.
             Note that the expand flag assumes rotation around the center and no translation.
-        center (2-tuple|optional): Optional center of rotation.
+        center (2-tuple, optional): Optional center of rotation.
             Origin is the upper left corner.
             Default is the center of the image.
+        fill (3-tuple or int): RGB pixel fill value for area outside the rotated image.
+            If int, it is used for all channels.
+
     Returns:
-        numpy ndarray: Rotated image.
+        PIL.Image or np.array: Rotated image.
 
     Examples:
-        .. code-block:: python
 
             import numpy as np
+            from PIL import Image
+            from paddle.vision.transforms import functional as F
+
+            fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8')
 
-            from paddle.vision.transforms.functional import rotate
+            fake_img = Image.fromarray(fake_img)
 
-            fake_img = np.random.rand(500, 500, 3).astype('float32')
+            rotated_img = F.rotate(fake_img, 90)
+            print(rotated_img.size)
 
-            fake_img = rotate(fake_img, 10)
-            print(fake_img.shape)
     """
-    cv2 = try_import('cv2')
-
-    dtype = img.dtype
-    h, w, _ = img.shape
-    point = center or (w / 2, h / 2)
-    M = cv2.getRotationMatrix2D(point, angle=-angle, scale=1)
-
-    if expand:
-        if center is None:
-            cos = np.abs(M[0, 0])
-            sin = np.abs(M[0, 1])
-
-            nW = int((h * sin) + (w * cos))
-            nH = int((h * cos) + (w * sin))
-
-            M[0, 2] += (nW / 2) - point[0]
-            M[1, 2] += (nH / 2) - point[1]
-
-            dst = cv2.warpAffine(img, M, (nW, nH))
-        else:
-            xx = []
-            yy = []
-            for point in (np.array([0, 0, 1]), np.array([w - 1, 0, 1]),
-                          np.array([w - 1, h - 1, 1]), np.array([0, h - 1, 1])):
-                target = np.dot(M, point)
-                xx.append(target[0])
-                yy.append(target[1])
-            nh = int(math.ceil(max(yy)) - math.floor(min(yy)))
-            nw = int(math.ceil(max(xx)) - math.floor(min(xx)))
-
-            M[0, 2] += (nw - w) / 2
-            M[1, 2] += (nh - h) / 2
-            dst = cv2.warpAffine(img, M, (nw, nh), flags=interpolation)
+    if not (_is_pil_image(img) or _is_numpy_image(img)):
+        raise TypeError(
+            'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'.
+            format(type(img)))
+
+    if _is_pil_image(img):
+        return F_pil.rotate(img, angle, resample, expand, center, fill)
     else:
-        dst = cv2.warpAffine(img, M, (w, h), flags=interpolation)
-    return dst.astype(dtype)
+        return F_cv2.rotate(img, angle, resample, expand, center, fill)
 
 
-@keepdims
 def to_grayscale(img, num_output_channels=1):
     """Converts image to grayscale version of image.
 
     Args:
-        img (numpy.ndarray): Image to be converted to grayscale.
+        img (PIL.Image|np.array): Image to be converted to grayscale.
 
     Returns:
-        numpy.ndarray: Grayscale version of the image.
-            if num_output_channels == 1, returned image is single channel
-            if num_output_channels == 3, returned image is 3 channel with r == g == b
+        PIL.Image or np.array: Grayscale version of the image.
+            if num_output_channels = 1 : returned image is single channel
+
+            if num_output_channels = 3 : returned image is 3 channel with r = g = b
 
     Examples:
+        .. code-block:: python
+
+            import numpy as np
+            from PIL import Image
+            from paddle.vision.transforms import functional as F
+
+            fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8')
+
+            fake_img = Image.fromarray(fake_img)
+
+            gray_img = F.to_grayscale(fake_img)
+            print(gray_img.size)
+
+    """
+    if not (_is_pil_image(img) or _is_numpy_image(img)):
+        raise TypeError(
+            'img should be PIL Image or ndarray with dim=[2 or 3]. Got {}'.
+            format(type(img)))
+
+    if _is_pil_image(img):
+        return F_pil.to_grayscale(img, num_output_channels)
+    else:
+        return F_cv2.to_grayscale(img, num_output_channels)
+
+
+def normalize(img, mean, std, data_format='CHW', to_rgb=False):
+    """Normalizes a tensor or image with mean and standard deviation.
+
+    Args:
+        img (PIL.Image|np.array|paddle.Tensor): input data to be normalized.
+        mean (list|tuple): Sequence of means for each channel.
+        std (list|tuple): Sequence of standard deviations for each channel.
+        data_format (str, optional): Data format of input img, should be 'HWC' or
+            'CHW'. Default: 'CHW'.
+        to_rgb (bool, optional): Whether to convert to rgb. If input is a tensor,
+            this option will be ignored. Default: False.
+
+    Returns:
+        Tensor: Normalized image. Data format is same as input img.
+
     Examples:
         .. code-block:: python
 
             import numpy as np
+            from PIL import Image
+            from paddle.vision.transforms import functional as F
+
+            fake_img = (np.random.rand(256, 300, 3) * 255.).astype('uint8')
+
+            fake_img = Image.fromarray(fake_img)
 
-            from paddle.vision.transforms.functional import to_grayscale
+            mean = [127.5, 127.5, 127.5]
+            std = [127.5, 127.5, 127.5]
 
-            fake_img = np.random.rand(500, 500, 3).astype('float32')
+            normalized_img = F.normalize(fake_img, mean, std, data_format='HWC')
+            print(normalized_img.max(), normalized_img.min())
 
-            fake_img = to_grayscale(fake_img)
-            print(fake_img.shape)
     """
-    cv2 = try_import('cv2')
-    if num_output_channels == 1:
-        img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
-    elif num_output_channels == 3:
-        img = cv2.cvtColor(
-            cv2.cvtColor(img, cv2.COLOR_RGB2GRAY), cv2.COLOR_GRAY2RGB)
+    if _is_tensor_image(img):
+        return F_t.normalize(img, mean, std, data_format)
     else:
-        raise ValueError('num_output_channels should be either 1 or 3')
+        if _is_pil_image(img):
+            img = np.array(img).astype(np.float32)
 
-    return img
+        return F_cv2.normalize(img, mean, std, data_format, to_rgb)
diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py
new file mode 100644
index 0000000000000..5c2e8d61bc527
--- /dev/null
+++ b/python/paddle/vision/transforms/functional_cv2.py
@@ -0,0 +1,503 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import sys
+import numbers
+import warnings
+import collections
+
+import numpy as np
+from numpy import sin, cos, tan
+
+import paddle
+from paddle.utils import try_import
+
+if sys.version_info < (3, 3):
+    Sequence = collections.Sequence
+    Iterable = collections.Iterable
+else:
+    Sequence = collections.abc.Sequence
+    Iterable = collections.abc.Iterable
+
+
+def to_tensor(pic, data_format='CHW'):
+    """Converts a ``numpy.ndarray`` to paddle.Tensor.
+
+    See ``ToTensor`` for more details.
+
+    Args:
+        pic (np.ndarray): Image to be converted to tensor.
+        data_format (str, optional): Data format of img, should be 'HWC' or
+            'CHW'. Default: 'CHW'.
+
+    Returns:
+        Tensor: Converted image.
+
+    """
+
+    if not data_format in ['CHW', 'HWC']:
+        raise ValueError('data_format should be CHW or HWC. Got {}'.format(
+            data_format))
+
+    if pic.ndim == 2:
+        pic = pic[:, :, None]
+
+    if data_format == 'CHW':
+        img = paddle.to_tensor(pic.transpose((2, 0, 1)))
+    else:
+        img = paddle.to_tensor(pic)
+
+    if paddle.fluid.data_feeder.convert_dtype(img.dtype) == 'uint8':
+        return paddle.cast(img, np.float32) / 255.
+    else:
+        return img
+
+
+def resize(img, size, interpolation='bilinear'):
+    """
+    Resizes the image to the given size.
+
+    Args:
+        img (np.ndarray): Image to be resized.
+        size (int|list|tuple): Target size of input data, with (height, width) shape.
+        interpolation (int|str, optional): Interpolation method. When using the cv2 backend,
+            supported methods are as follows:
+            - "nearest": cv2.INTER_NEAREST,
+            - "bilinear": cv2.INTER_LINEAR,
+            - "area": cv2.INTER_AREA,
+            - "bicubic": cv2.INTER_CUBIC,
+            - "lanczos": cv2.INTER_LANCZOS4
+
+    Returns:
+        np.array: Resized image.
+
+    """
+    cv2 = try_import('cv2')
+    _cv2_interp_from_str = {
+        'nearest': cv2.INTER_NEAREST,
+        'bilinear': cv2.INTER_LINEAR,
+        'area': cv2.INTER_AREA,
+        'bicubic': cv2.INTER_CUBIC,
+        'lanczos': cv2.INTER_LANCZOS4
+    }
+
+    if not (isinstance(size, int) or
+            (isinstance(size, Iterable) and len(size) == 2)):
+        raise TypeError('Got inappropriate size arg: {}'.format(size))
+
+    h, w = img.shape[:2]
+
+    if isinstance(size, int):
+        if (w <= h and w == size) or (h <= w and h == size):
+            return img
+        if w < h:
+            ow = size
+            oh = int(size * h / w)
+            output = cv2.resize(
+                img,
+                dsize=(ow, oh),
+                interpolation=_cv2_interp_from_str[interpolation])
+        else:
+            oh = size
+            ow = int(size * w / h)
+            output = cv2.resize(
+                img,
+                dsize=(ow, oh),
+                interpolation=_cv2_interp_from_str[interpolation])
+    else:
+        output = cv2.resize(
+            img,
+            dsize=(size[1], size[0]),
+            interpolation=_cv2_interp_from_str[interpolation])
+    if len(img.shape) == 3 and img.shape[2] == 1:
+        return output[:, :, np.newaxis]
+    else:
+        return output
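When `size` is an int, the shorter edge is matched to it and the aspect ratio is preserved; the target-size arithmetic from the branch above in isolation (values are illustrative, OpenCV assumed installed):

.. code-block:: python

    import numpy as np
    from paddle.utils import try_import

    cv2 = try_import('cv2')

    img = (np.random.rand(100, 200, 3) * 255.).astype('uint8')  # h=100, w=200
    size = 50                                # int size: match the shorter edge
    h, w = img.shape[:2]
    oh, ow = (size, int(size * w / h)) if h <= w else (int(size * h / w), size)
    out = cv2.resize(img, dsize=(ow, oh), interpolation=cv2.INTER_LINEAR)
    print(out.shape)  # (50, 100, 3)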
+
+
+def pad(img, padding, fill=0, padding_mode='constant'):
+    """
+    Pads the given numpy.array on all sides with specified padding mode and fill value.
+
+    Args:
+        img (np.array): Image to be padded.
+        padding (int|list|tuple): Padding on each border. If a single int is provided this
+            is used to pad all borders. If tuple of length 2 is provided this is the padding
+            on left/right and top/bottom respectively. If a tuple of length 4 is provided
+            this is the padding for the left, top, right and bottom borders
+            respectively.
+        fill (float, optional): Pixel fill value for constant fill. If a tuple of
+            length 3, it is used to fill R, G, B channels respectively.
+            This value is only used when the padding_mode is constant. Default: 0.
+        padding_mode (str, optional): Type of padding. Should be: constant, edge, reflect or symmetric. Default: 'constant'.
+
+            - constant: pads with a constant value, this value is specified with fill
+
+            - edge: pads with the last value on the edge of the image
+
+            - reflect: pads with reflection of image (without repeating the last value on the edge)
+
+              padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode
+              will result in [3, 2, 1, 2, 3, 4, 3, 2]
+
+            - symmetric: pads with reflection of image (repeating the last value on the edge)
+
+              padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode
+              will result in [2, 1, 1, 2, 3, 4, 4, 3]
+
+    Returns:
+        np.array: Padded image.
+
+    """
+    cv2 = try_import('cv2')
+    _cv2_pad_from_str = {
+        'constant': cv2.BORDER_CONSTANT,
+        'edge': cv2.BORDER_REPLICATE,
+        'reflect': cv2.BORDER_REFLECT_101,
+        'symmetric': cv2.BORDER_REFLECT
+    }
+
+    if not isinstance(padding, (numbers.Number, list, tuple)):
+        raise TypeError('Got inappropriate padding arg')
+    if not isinstance(fill, (numbers.Number, str, list, tuple)):
+        raise TypeError('Got inappropriate fill arg')
+    if not isinstance(padding_mode, str):
+        raise TypeError('Got inappropriate padding_mode arg')
+
+    if isinstance(padding, Sequence) and len(padding) not in [2, 4]:
+        raise ValueError(
+            "Padding must be an int or a 2, or 4 element tuple, not a " +
+            "{} element tuple".format(len(padding)))
+
+    assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \
+        'Padding mode should be either constant, edge, reflect or symmetric'
+
+    if isinstance(padding, list):
+        padding = tuple(padding)
+    if isinstance(padding, int):
+        pad_left = pad_right = pad_top = pad_bottom = padding
+    if isinstance(padding, Sequence) and len(padding) == 2:
+        pad_left = pad_right = padding[0]
+        pad_top = pad_bottom = padding[1]
+    if isinstance(padding, Sequence) and len(padding) == 4:
+        pad_left = padding[0]
+        pad_top = padding[1]
+        pad_right = padding[2]
+        pad_bottom = padding[3]
+
+    if len(img.shape) == 3 and img.shape[2] == 1:
+        return cv2.copyMakeBorder(
+            img,
+            top=pad_top,
+            bottom=pad_bottom,
+            left=pad_left,
+            right=pad_right,
+            borderType=_cv2_pad_from_str[padding_mode],
+            value=fill)[:, :, np.newaxis]
+    else:
+        return cv2.copyMakeBorder(
+            img,
+            top=pad_top,
+            bottom=pad_bottom,
+            left=pad_left,
+            right=pad_right,
+            borderType=_cv2_pad_from_str[padding_mode],
+            value=fill)
+
+
+def crop(img, top, left, height, width):
+    """Crops the given image.
+
+    Args:
+        img (np.array): Image to be cropped. (0,0) denotes the top left
+            corner of the image.
+        top (int): Vertical component of the top left corner of the crop box.
+        left (int): Horizontal component of the top left corner of the crop box.
+        height (int): Height of the crop box.
+        width (int): Width of the crop box.
+
+    Returns:
+        np.array: Cropped image.
+
+    """
+
+    return img[top:top + height, left:left + width, :]
+
+
+def center_crop(img, output_size):
+    """Crops the given image at the center to the desired output size.
+
+    Args:
+        img (np.array): Image to be cropped. (0,0) denotes the top left corner of the image.
+        output_size (sequence or int): (height, width) of the crop box. If int,
+            it is used for both directions.
+
+    Returns:
+        np.array: Cropped image.
+
+    """
+
+    if isinstance(output_size, numbers.Number):
+        output_size = (int(output_size), int(output_size))
+
+    h, w = img.shape[0:2]
+    th, tw = output_size
+    i = int(round((h - th) / 2.))
+    j = int(round((w - tw) / 2.))
+    return crop(img, i, j, th, tw)
+
+
+def hflip(img):
+    """Horizontally flips the given image.
+
+    Args:
+        img (np.array): Image to be flipped.
+
+    Returns:
+        np.array: Horizontally flipped image.
+
+    """
+    cv2 = try_import('cv2')
+
+    return cv2.flip(img, 1)
+
+
+def vflip(img):
+    """Vertically flips the given np.array.
+
+    Args:
+        img (np.array): Image to be flipped.
+
+    Returns:
+        np.array: Vertically flipped image.
+
+    """
+    cv2 = try_import('cv2')
+
+    if len(img.shape) == 3 and img.shape[2] == 1:
+        return cv2.flip(img, 0)[:, :, np.newaxis]
+    else:
+        return cv2.flip(img, 0)
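`adjust_brightness` below is implemented as a 256-entry lookup table applied with `cv2.LUT`; the table itself is easy to inspect standalone (plain NumPy, no OpenCV required):

.. code-block:: python

    import numpy as np

    brightness_factor = 0.5
    table = np.array([i * brightness_factor
                      for i in range(0, 256)]).clip(0, 255).astype('uint8')
    print(table[[0, 128, 255]])  # [  0  64 127]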
+
+
+def adjust_brightness(img, brightness_factor):
+    """Adjusts brightness of an image.
+
+    Args:
+        img (np.array): Image to be adjusted.
+        brightness_factor (float): How much to adjust the brightness. Can be
+            any non negative number. 0 gives a black image, 1 gives the
+            original image while 2 increases the brightness by a factor of 2.
+
+    Returns:
+        np.array: Brightness adjusted image.
+
+    """
+    cv2 = try_import('cv2')
+
+    table = np.array([i * brightness_factor
+                      for i in range(0, 256)]).clip(0, 255).astype('uint8')
+
+    if len(img.shape) == 3 and img.shape[2] == 1:
+        return cv2.LUT(img, table)[:, :, np.newaxis]
+    else:
+        return cv2.LUT(img, table)
+
+
+def adjust_contrast(img, contrast_factor):
+    """Adjusts contrast of an image.
+
+    Args:
+        img (np.array): Image to be adjusted.
+        contrast_factor (float): How much to adjust the contrast. Can be any
+            non negative number. 0 gives a solid gray image, 1 gives the
+            original image while 2 increases the contrast by a factor of 2.
+
+    Returns:
+        np.array: Contrast adjusted image.
+
+    """
+    cv2 = try_import('cv2')
+
+    table = np.array([(i - 74) * contrast_factor + 74
+                      for i in range(0, 256)]).clip(0, 255).astype('uint8')
+    if len(img.shape) == 3 and img.shape[2] == 1:
+        return cv2.LUT(img, table)[:, :, np.newaxis]
+    else:
+        return cv2.LUT(img, table)
+
+
+def adjust_saturation(img, saturation_factor):
+    """Adjusts color saturation of an image.
+
+    Args:
+        img (np.array): Image to be adjusted.
+        saturation_factor (float): How much to adjust the saturation. 0 will
+            give a black and white image, 1 will give the original image while
+            2 will enhance the saturation by a factor of 2.
+
+    Returns:
+        np.array: Saturation adjusted image.
+
+    """
+    cv2 = try_import('cv2')
+
+    dtype = img.dtype
+    img = img.astype(np.float32)
+    # blend between the grayscale image (factor 0) and the original (factor 1)
+    alpha = saturation_factor
+    gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
+    gray_img = gray_img[..., np.newaxis]
+    img = img * alpha + gray_img * (1 - alpha)
+    return img.clip(0, 255).astype(dtype)
+
+
+def adjust_hue(img, hue_factor):
+    """Adjusts hue of an image.
+
+    The image hue is adjusted by converting the image to HSV and
+    cyclically shifting the intensities in the hue channel (H).
+    The image is then converted back to original image mode.
+
+    `hue_factor` is the amount of shift in H channel and must be in the
+    interval `[-0.5, 0.5]`.
+
+    Args:
+        img (np.array): Image to be adjusted.
+        hue_factor (float): How much to shift the hue channel. Should be in
+            [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in
+            HSV space in positive and negative direction respectively.
+            0 means no shift. Therefore, both -0.5 and 0.5 will give an image
+            with complementary colors while 0 gives the original image.
+
+    Returns:
+        np.array: Hue adjusted image.
+
+    """
+    cv2 = try_import('cv2')
+
+    if not (-0.5 <= hue_factor <= 0.5):
+        raise ValueError('hue_factor:{} is not in [-0.5, 0.5].'.format(
+            hue_factor))
+
+    dtype = img.dtype
+    img = img.astype(np.uint8)
+    hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL)
+    h, s, v = cv2.split(hsv_img)
+
+    h = h.astype(np.uint8)
+    # uint8 addition takes care of rotation across boundaries
+    with np.errstate(over="ignore"):
+        h += np.uint8(hue_factor * 255)
+    hsv_img = cv2.merge([h, s, v])
+    return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype)
+
+
+def rotate(img, angle, resample=False, expand=False, center=None, fill=0):
+    """Rotates the image by angle.
+
+    Args:
+        img (np.array): Image to be rotated.
+        angle (float or int): In degrees counter clockwise order.
+        resample (int|str, optional): An optional resampling filter. If omitted, or if the
+            image has only one channel, it is set to cv2.INTER_NEAREST.
+            When using the cv2 backend, supported methods are as follows:
+            - "nearest": cv2.INTER_NEAREST,
+            - "bilinear": cv2.INTER_LINEAR,
+            - "bicubic": cv2.INTER_CUBIC
+        expand (bool, optional): Optional expansion flag.
+            If true, expands the output image to make it large enough to hold the entire rotated image.
+            If false or omitted, make the output image the same size as the input image.
+            Note that the expand flag assumes rotation around the center and no translation.
+        center (2-tuple, optional): Optional center of rotation.
+            Origin is the upper left corner.
+            Default is the center of the image.
+        fill (3-tuple or int): RGB pixel fill value for area outside the rotated image.
+            If int, it is used for all channels.
+
+    Returns:
+        np.array: Rotated image.
+
+    """
+    cv2 = try_import('cv2')
+    _cv2_interp_from_str = {
+        'nearest': cv2.INTER_NEAREST,
+        'bilinear': cv2.INTER_LINEAR,
+        'bicubic': cv2.INTER_CUBIC
+    }
+    # resolve the documented resample/fill arguments; fall back to nearest
+    flags = _cv2_interp_from_str[resample] if isinstance(
+        resample, str) else cv2.INTER_NEAREST
+    if isinstance(fill, int):
+        fill = tuple([fill] * 3)
+
+    # NOTE: expand is accepted for API symmetry but not applied by this backend yet
+    rows, cols = img.shape[0:2]
+    if center is None:
+        center = (cols / 2, rows / 2)
+    M = cv2.getRotationMatrix2D(center, angle, 1)
+    if len(img.shape) == 3 and img.shape[2] == 1:
+        return cv2.warpAffine(
+            img, M, (cols, rows), flags=flags,
+            borderValue=fill)[:, :, np.newaxis]
+    else:
+        return cv2.warpAffine(img, M, (cols, rows), flags=flags, borderValue=fill)
+
+
+def to_grayscale(img, num_output_channels=1):
+    """Converts image to grayscale version of image.
+
+    Args:
+        img (np.array): Image to be converted to grayscale.
+
+    Returns:
+        np.array: Grayscale version of the image.
+            if num_output_channels = 1 : returned image is single channel
+
+            if num_output_channels = 3 : returned image is 3 channel with r = g = b
+
+    """
+    cv2 = try_import('cv2')
+
+    if num_output_channels == 1:
+        img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)[:, :, np.newaxis]
+    elif num_output_channels == 3:
+        # much faster than doing cvtColor to go back to gray
+        img = np.broadcast_to(
+            cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)[:, :, np.newaxis], img.shape)
+    else:
+        raise ValueError('num_output_channels should be either 1 or 3')
+
+    return img
+
+
+def normalize(img, mean, std, data_format='CHW', to_rgb=False):
+    """Normalizes an ndarray image with mean and standard deviation.
+
+    Args:
+        img (np.array): input data to be normalized.
+        mean (list|tuple): Sequence of means for each channel.
+        std (list|tuple): Sequence of standard deviations for each channel.
+        data_format (str, optional): Data format of img, should be 'HWC' or
+            'CHW'. Default: 'CHW'.
+        to_rgb (bool, optional): Whether to convert to rgb. Default: False.
+
+    Returns:
+        np.array: Normalized image.
+
+    """
+
+    if data_format == 'CHW':
+        mean = np.float32(np.array(mean).reshape(-1, 1, 1))
+        std = np.float32(np.array(std).reshape(-1, 1, 1))
+    else:
+        mean = np.float32(np.array(mean).reshape(1, 1, -1))
+        std = np.float32(np.array(std).reshape(1, 1, -1))
+    if to_rgb:
+        cv2 = try_import('cv2')
+        # inplace
+        cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img)
+
+    img = (img - mean) / std
+    return img
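The mean/std reshape in `normalize` above is what makes the per-channel broadcast work for both layouts; a small HWC sanity check in plain NumPy (values are illustrative):

.. code-block:: python

    import numpy as np

    img = np.full((2, 2, 3), 127.5, dtype='float32')               # HWC image
    mean = np.float32(np.array([127.5, 127.5, 127.5]).reshape(1, 1, -1))
    std = np.float32(np.array([127.5, 127.5, 127.5]).reshape(1, 1, -1))
    out = (img - mean) / std
    print(out.min(), out.max())  # 0.0 0.0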
diff --git a/python/paddle/vision/transforms/functional_pil.py b/python/paddle/vision/transforms/functional_pil.py
new file mode 100644
index 0000000000000..49b02fc049e2c
--- /dev/null
+++ b/python/paddle/vision/transforms/functional_pil.py
@@ -0,0 +1,458 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import sys
+import math
+import numbers
+import warnings
+import collections
+from PIL import Image, ImageOps, ImageEnhance
+
+import numpy as np
+from numpy import sin, cos, tan
+import paddle
+
+if sys.version_info < (3, 3):
+    Sequence = collections.Sequence
+    Iterable = collections.Iterable
+else:
+    Sequence = collections.abc.Sequence
+    Iterable = collections.abc.Iterable
+
+_pil_interp_from_str = {
+    'nearest': Image.NEAREST,
+    'bilinear': Image.BILINEAR,
+    'bicubic': Image.BICUBIC,
+    'box': Image.BOX,
+    'lanczos': Image.LANCZOS,
+    'hamming': Image.HAMMING
+}
+
+
+def to_tensor(pic, data_format='CHW'):
+    """Converts a ``PIL.Image`` to paddle.Tensor.
+
+    See ``ToTensor`` for more details.
+
+    Args:
+        pic (PIL.Image): Image to be converted to tensor.
+        data_format (str, optional): Data format of img, should be 'HWC' or
+            'CHW'. Default: 'CHW'.
+
+    Returns:
+        Tensor: Converted image.
+
+    """
+
+    if not data_format in ['CHW', 'HWC']:
+        raise ValueError('data_format should be CHW or HWC. Got {}'.format(
+            data_format))
+
+    # PIL Image
+    if pic.mode == 'I':
+        img = paddle.to_tensor(np.array(pic, np.int32, copy=False))
+    elif pic.mode == 'I;16':
+        # cast and reshape do not support int16
+        img = paddle.to_tensor(np.array(pic, np.int32, copy=False))
+    elif pic.mode == 'F':
+        img = paddle.to_tensor(np.array(pic, np.float32, copy=False))
+    elif pic.mode == '1':
+        img = 255 * paddle.to_tensor(np.array(pic, np.uint8, copy=False))
+    else:
+        img = paddle.to_tensor(np.array(pic, copy=False))
+
+    if pic.mode == 'YCbCr':
+        nchannel = 3
+    elif pic.mode == 'I;16':
+        nchannel = 1
+    else:
+        nchannel = len(pic.mode)
+
+    dtype = paddle.fluid.data_feeder.convert_dtype(img.dtype)
+    if dtype == 'uint8':
+        img = paddle.cast(img, np.float32) / 255.
+
+    img = img.reshape([pic.size[1], pic.size[0], nchannel])
+
+    if data_format == 'CHW':
+        img = img.transpose([2, 0, 1])
+
+    return img
+
+
+def resize(img, size, interpolation='bilinear'):
+    """
+    Resizes the image to the given size.
+
+    Args:
+        img (PIL.Image): Image to be resized.
+        size (int|list|tuple): Target size of input data, with (height, width) shape.
+        interpolation (int|str, optional): Interpolation method. When using the pil backend,
+            supported methods are as follows:
+            - "nearest": Image.NEAREST,
+            - "bilinear": Image.BILINEAR,
+            - "bicubic": Image.BICUBIC,
+            - "box": Image.BOX,
+            - "lanczos": Image.LANCZOS,
+            - "hamming": Image.HAMMING
+
+    Returns:
+        PIL.Image: Resized image.
+ + """ + + if not (isinstance(size, int) or + (isinstance(size, Iterable) and len(size) == 2)): + raise TypeError('Got inappropriate size arg: {}'.format(size)) + + if isinstance(size, int): + w, h = img.size + if (w <= h and w == size) or (h <= w and h == size): + return img + if w < h: + ow = size + oh = int(size * h / w) + return img.resize((ow, oh), _pil_interp_from_str[interpolation]) + else: + oh = size + ow = int(size * w / h) + return img.resize((ow, oh), _pil_interp_from_str[interpolation]) + else: + return img.resize(size[::-1], _pil_interp_from_str[interpolation]) + + +def pad(img, padding, fill=0, padding_mode='constant'): + """ + Pads the given PIL.Image on all sides with specified padding mode and fill value. + + Args: + img (PIL.Image): Image to be padded. + padding (int|list|tuple): Padding on each border. If a single int is provided this + is used to pad all borders. If tuple of length 2 is provided this is the padding + on left/right and top/bottom respectively. If a tuple of length 4 is provided + this is the padding for the left, top, right and bottom borders + respectively. + fill (float, optional): Pixel fill value for constant fill. If a tuple of + length 3, it is used to fill R, G, B channels respectively. + This value is only used when the padding_mode is constant. Default: 0. + padding_mode: Type of padding. Should be: constant, edge, reflect or symmetric. Default: 'constant'. + + - constant: pads with a constant value, this value is specified with fill + + - edge: pads with the last value on the edge of the image + + - reflect: pads with reflection of image (without repeating the last value on the edge) + + padding [1, 2, 3, 4] with 2 elements on both sides in reflect mode + will result in [3, 2, 1, 2, 3, 4, 3, 2] + + - symmetric: pads with reflection of image (repeating the last value on the edge) + + padding [1, 2, 3, 4] with 2 elements on both sides in symmetric mode + will result in [2, 1, 1, 2, 3, 4, 4, 3] + + Returns: + PIL.Image: Padded image. 
+
+    """
+
+    if not isinstance(padding, (numbers.Number, list, tuple)):
+        raise TypeError('Got inappropriate padding arg')
+    if not isinstance(fill, (numbers.Number, str, list, tuple)):
+        raise TypeError('Got inappropriate fill arg')
+    if not isinstance(padding_mode, str):
+        raise TypeError('Got inappropriate padding_mode arg')
+
+    if isinstance(padding, Sequence) and len(padding) not in [2, 4]:
+        raise ValueError(
+            "Padding must be an int or a 2, or 4 element tuple, not a " +
+            "{} element tuple".format(len(padding)))
+
+    assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'], \
+        'Padding mode should be either constant, edge, reflect or symmetric'
+
+    if isinstance(padding, list):
+        padding = tuple(padding)
+    if isinstance(padding, int):
+        pad_left = pad_right = pad_top = pad_bottom = padding
+    if isinstance(padding, Sequence) and len(padding) == 2:
+        pad_left = pad_right = padding[0]
+        pad_top = pad_bottom = padding[1]
+    if isinstance(padding, Sequence) and len(padding) == 4:
+        pad_left = padding[0]
+        pad_top = padding[1]
+        pad_right = padding[2]
+        pad_bottom = padding[3]
+
+    if padding_mode == 'constant':
+        if img.mode == 'P':
+            palette = img.getpalette()
+            image = ImageOps.expand(img, border=padding, fill=fill)
+            image.putpalette(palette)
+            return image
+
+        return ImageOps.expand(img, border=padding, fill=fill)
+    else:
+        if img.mode == 'P':
+            palette = img.getpalette()
+            img = np.asarray(img)
+            img = np.pad(img, ((pad_top, pad_bottom), (pad_left, pad_right)),
+                         padding_mode)
+            img = Image.fromarray(img)
+            img.putpalette(palette)
+            return img
+
+        img = np.asarray(img)
+        # RGB image
+        if len(img.shape) == 3:
+            img = np.pad(img, ((pad_top, pad_bottom), (pad_left, pad_right),
+                               (0, 0)), padding_mode)
+        # Grayscale image
+        if len(img.shape) == 2:
+            img = np.pad(img, ((pad_top, pad_bottom), (pad_left, pad_right)),
+                         padding_mode)
+
+        return Image.fromarray(img)
+
+
+def crop(img, top, left, height, width):
+    """Crops the given PIL Image.
+
+    Args:
+        img (PIL.Image): Image to be cropped. (0,0) denotes the top left
+            corner of the image.
+        top (int): Vertical component of the top left corner of the crop box.
+        left (int): Horizontal component of the top left corner of the crop box.
+        height (int): Height of the crop box.
+        width (int): Width of the crop box.
+
+    Returns:
+        PIL.Image: Cropped image.
+
+    """
+    return img.crop((left, top, left + width, top + height))
+
+
+def center_crop(img, output_size):
+    """Crops the given PIL Image at the center to the desired output size.
+
+    Args:
+        img (PIL.Image): Image to be cropped. (0,0) denotes the top left corner of the image.
+        output_size (sequence or int): (height, width) of the crop box. If int,
+            it is used for both directions.
+
+    Returns:
+        PIL.Image: Cropped image.
+
+    """
+
+    if isinstance(output_size, numbers.Number):
+        output_size = (int(output_size), int(output_size))
+
+    image_width, image_height = img.size
+    crop_height, crop_width = output_size
+    crop_top = int(round((image_height - crop_height) / 2.))
+    crop_left = int(round((image_width - crop_width) / 2.))
+    return crop(img, crop_top, crop_left, crop_height, crop_width)
+
+
+def hflip(img):
+    """Horizontally flips the given PIL Image.
+
+    Args:
+        img (PIL.Image): Image to be flipped.
+
+    Returns:
+        PIL.Image: Horizontally flipped image.
+
+    """
+
+    return img.transpose(Image.FLIP_LEFT_RIGHT)
+
+
+def vflip(img):
+    """Vertically flips the given PIL Image.
+
+    Args:
+        img (PIL.Image): Image to be flipped.
+
+    Returns:
+        PIL.Image: Vertically flipped image.
+
+    """
+
+    return img.transpose(Image.FLIP_TOP_BOTTOM)
+
+
+def adjust_brightness(img, brightness_factor):
+    """Adjusts brightness of an Image.
+
+    Args:
+        img (PIL.Image): PIL Image to be adjusted.
+        brightness_factor (float): How much to adjust the brightness. Can be
+            any non negative number. 0 gives a black image, 1 gives the
+            original image while 2 increases the brightness by a factor of 2.
+
+    Returns:
+        PIL.Image: Brightness adjusted image.
+
+    """
+
+    enhancer = ImageEnhance.Brightness(img)
+    img = enhancer.enhance(brightness_factor)
+    return img
+
+
+def adjust_contrast(img, contrast_factor):
+    """Adjusts contrast of an Image.
+
+    Args:
+        img (PIL.Image): PIL Image to be adjusted.
+        contrast_factor (float): How much to adjust the contrast. Can be any
+            non negative number. 0 gives a solid gray image, 1 gives the
+            original image while 2 increases the contrast by a factor of 2.
+
+    Returns:
+        PIL.Image: Contrast adjusted image.
+
+    """
+
+    enhancer = ImageEnhance.Contrast(img)
+    img = enhancer.enhance(contrast_factor)
+    return img
+
+
+def adjust_saturation(img, saturation_factor):
+    """Adjusts color saturation of an image.
+
+    Args:
+        img (PIL.Image): PIL Image to be adjusted.
+        saturation_factor (float): How much to adjust the saturation. 0 will
+            give a black and white image, 1 will give the original image while
+            2 will enhance the saturation by a factor of 2.
+
+    Returns:
+        PIL.Image: Saturation adjusted image.
+
+    """
+
+    enhancer = ImageEnhance.Color(img)
+    img = enhancer.enhance(saturation_factor)
+    return img
+
+
+def adjust_hue(img, hue_factor):
+    """Adjusts hue of an image.
+
+    The image hue is adjusted by converting the image to HSV and
+    cyclically shifting the intensities in the hue channel (H).
+    The image is then converted back to original image mode.
+
+    `hue_factor` is the amount of shift in H channel and must be in the
+    interval `[-0.5, 0.5]`.
+
+    Args:
+        img (PIL.Image): PIL Image to be adjusted.
+        hue_factor (float): How much to shift the hue channel. Should be in
+            [-0.5, 0.5]. 0.5 and -0.5 give complete reversal of hue channel in
+            HSV space in positive and negative direction respectively.
+            0 means no shift. Therefore, both -0.5 and 0.5 will give an image
+            with complementary colors while 0 gives the original image.
+
+    Returns:
+        PIL.Image: Hue adjusted image.
+
+    """
+    if not (-0.5 <= hue_factor <= 0.5):
+        raise ValueError('hue_factor:{} is not in [-0.5, 0.5].'.format(
+            hue_factor))
+
+    input_mode = img.mode
+    if input_mode in {'L', '1', 'I', 'F'}:
+        return img
+
+    h, s, v = img.convert('HSV').split()
+
+    np_h = np.array(h, dtype=np.uint8)
+    # uint8 addition takes care of rotation across boundaries
+    with np.errstate(over='ignore'):
+        np_h += np.uint8(hue_factor * 255)
+    h = Image.fromarray(np_h, 'L')
+
+    img = Image.merge('HSV', (h, s, v)).convert(input_mode)
+    return img
+
+
+def rotate(img, angle, resample=False, expand=False, center=None, fill=0):
+    """Rotates the image by angle.
+
+    Args:
+        img (PIL.Image): Image to be rotated.
+        angle (float or int): In degrees counter clockwise order.
+        resample (int|str, optional): An optional resampling filter. If omitted, or if the
+            image has only one channel, it is set to PIL.Image.NEAREST. When using the pil
+            backend, supported methods are as follows:
+            - "nearest": Image.NEAREST,
+            - "bilinear": Image.BILINEAR,
+            - "bicubic": Image.BICUBIC
+        expand (bool, optional): Optional expansion flag.
+            If true, expands the output image to make it large enough to hold the entire rotated image.
+            If false or omitted, make the output image the same size as the input image.
+            Note that the expand flag assumes rotation around the center and no translation.
+        center (2-tuple, optional): Optional center of rotation.
+            Origin is the upper left corner.
+            Default is the center of the image.
+        fill (3-tuple or int): RGB pixel fill value for area outside the rotated image.
+            If int, it is used for all channels.
+
+    Returns:
+        PIL.Image: Rotated image.
+
+    """
+
+    if isinstance(resample, str):
+        # map method names such as "bilinear" to PIL resampling filters
+        resample = _pil_interp_from_str[resample]
+
+    if isinstance(fill, int):
+        fill = tuple([fill] * 3)
+
+    return img.rotate(angle, resample, expand, center, fillcolor=fill)
+
+
+def to_grayscale(img, num_output_channels=1):
+    """Converts image to grayscale version of image.
+
+    Args:
+        img (PIL.Image): Image to be converted to grayscale.
+
+    Returns:
+        PIL.Image: Grayscale version of the image.
+            if num_output_channels = 1 : returned image is single channel
+
+            if num_output_channels = 3 : returned image is 3 channel with r = g = b
+
+    """
+
+    if num_output_channels == 1:
+        img = img.convert('L')
+    elif num_output_channels == 3:
+        img = img.convert('L')
+        np_img = np.array(img, dtype=np.uint8)
+        np_img = np.dstack([np_img, np_img, np_img])
+        img = Image.fromarray(np_img, 'RGB')
+    else:
+        raise ValueError('num_output_channels should be either 1 or 3')
+
+    return img
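PIL's 'L' mode used by `to_grayscale` above applies the ITU-R 601 luma transform, L = R*299/1000 + G*587/1000 + B*114/1000 (truncated to an integer); a one-pixel check:

.. code-block:: python

    import numpy as np
    from PIL import Image

    px = Image.fromarray(np.array([[[100, 150, 200]]], dtype=np.uint8))
    # int(0.299*100 + 0.587*150 + 0.114*200) == 140
    print(px.convert('L').getpixel((0, 0)))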
diff --git a/python/paddle/vision/transforms/functional_tensor.py b/python/paddle/vision/transforms/functional_tensor.py
new file mode 100644
index 0000000000000..e8b70820dd9af
--- /dev/null
+++ b/python/paddle/vision/transforms/functional_tensor.py
@@ -0,0 +1,40 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import division
+
+import paddle
+
+
+def normalize(img, mean, std, data_format='CHW'):
+    """Normalizes a tensor image with mean and standard deviation.
+
+    Args:
+        img (paddle.Tensor): input data to be normalized.
+        mean (list|tuple): Sequence of means for each channel.
+        std (list|tuple): Sequence of standard deviations for each channel.
+        data_format (str, optional): Data format of img, should be 'HWC' or
+            'CHW'. Default: 'CHW'.
+
+    Returns:
+        Tensor: Normalized image.
+
+    """
+    if data_format == 'CHW':
+        mean = paddle.to_tensor(mean).reshape([-1, 1, 1])
+        std = paddle.to_tensor(std).reshape([-1, 1, 1])
+    else:
+        mean = paddle.to_tensor(mean)
+        std = paddle.to_tensor(std)
+    return (img - mean) / std
diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py
index 9ea828271765c..9079f91aac9fa 100644
--- a/python/paddle/vision/transforms/transforms.py
+++ b/python/paddle/vision/transforms/transforms.py
@@ -36,30 +36,50 @@
     Iterable = collections.abc.Iterable
 
 __all__ = [
-    "Compose",
-    "BatchCompose",
-    "Resize",
-    "RandomResizedCrop",
-    "CenterCropResize",
-    "CenterCrop",
-    "RandomHorizontalFlip",
-    "RandomVerticalFlip",
-    "Permute",
-    "Normalize",
-    "GaussianNoise",
-    "BrightnessTransform",
-    "SaturationTransform",
-    "ContrastTransform",
-    "HueTransform",
-    "ColorJitter",
-    "RandomCrop",
-    "RandomErasing",
-    "Pad",
-    "RandomRotate",
-    "Grayscale",
+    "BaseTransform", "Compose", "Resize", "RandomResizedCrop", "CenterCrop",
+    "RandomHorizontalFlip", "RandomVerticalFlip", "Transpose", "Normalize",
+    "BrightnessTransform", "SaturationTransform", "ContrastTransform",
+    "HueTransform", "ColorJitter", "RandomCrop", "Pad", "RandomRotation",
+    "Grayscale", "ToTensor"
 ]
 
 
+def _get_image_size(img):
+    if F._is_pil_image(img):
+        return img.size
+    elif F._is_numpy_image(img):
+        return img.shape[:2][::-1]
+    else:
+        raise TypeError("Unexpected type {}".format(type(img)))
+
+
+def _check_input(value,
+                 name,
+                 center=1,
+                 bound=(0, float('inf')),
+                 clip_first_on_zero=True):
+    if isinstance(value, numbers.Number):
+        if value < 0:
+            raise ValueError(
+                "If {} is a single number, it must be non negative.".format(
+                    name))
+        value = [center - value, center + value]
+        if clip_first_on_zero:
+            value[0] = max(value[0], 0)
+    elif isinstance(value, (tuple, list)) and len(value) == 2:
+        if not bound[0] <= value[0] <= value[1] <= bound[1]:
+            raise ValueError("{} values should be between {}".format(name,
+                                                                     bound))
+    else:
+        raise TypeError(
+            "{} should be a single number or a list/tuple with length 2.".
+            format(name))
+
+    if value[0] == value[1] == center:
+        value = None
+    return value
+
+
 class Compose(object):
     """
     Composes several transforms together, used for composing a list of transforms
@@ -91,15 +111,10 @@ class Compose(object):
     def __init__(self, transforms):
         self.transforms = transforms
 
-    def __call__(self, *data):
+    def __call__(self, data):
         for f in self.transforms:
             try:
-                # multi-fileds in a sample
-                if isinstance(data, Sequence):
-                    data = f(*data)
-                # single field in a sample, call transform directly
-                else:
-                    data = f(data)
+                data = f(data)
             except Exception as e:
                 stack_info = traceback.format_exc()
                 print("fail to perform transform [{}] with error: "
@@ -116,96 +131,217 @@ def __repr__(self):
         return format_string
 
 
-class BatchCompose(object):
-    """Composes several batch transforms together
+class BaseTransform(object):
+    """
+    Base class of all transforms used in computer vision.
 
-    Args:
-        transforms (list): List of transforms to compose.
-            these transforms perform on batch data.
+    calling logic:
+
+        if keys is None:
+            _get_params -> _apply_image()
+        else:
+            _get_params -> _apply_*() for * in keys
+
+    If you want to implement a self-defined transform method for image,
+    rewrite _apply_* method in subclass.
 
+    Args:
+        keys (list[str]|tuple[str], optional): Input type. Input is a tuple that contains different structures;
+            each key is used to specify the type of input. For example, if your input
+            is image type, then the key can be None or ("image"). If your input
+            is (image, image) type, then the keys should be ("image", "image").
+            If your input is (image, boxes), then the keys should be ("image", "boxes").
+
+            Currently available strings & data types are described below:
+
+            - "image": input image, with shape of (H, W, C)
+            - "coords": coordinates, with shape of (N, 2)
+            - "boxes": bounding boxes, with shape of (N, 4), "xyxy" format,
+
+                the 1st "xy" represents top left point of a box,
+                the 2nd "xy" represents right bottom point.
+
+            - "mask": map used for segmentation, with shape of (H, W, 1)
+
+            You can also customize your data types only if you implement the corresponding
+            _apply_*() methods, otherwise ``NotImplementedError`` will be raised.
+
     Examples:
 
         .. code-block:: python
 
             import numpy as np
-            from paddle.io import DataLoader
+            from PIL import Image
+            import paddle.vision.transforms.functional as F
+            from paddle.vision.transforms import BaseTransform
+
+            def _get_image_size(img):
+                if F._is_pil_image(img):
+                    return img.size
+                elif F._is_numpy_image(img):
+                    return img.shape[:2][::-1]
+                else:
+                    raise TypeError("Unexpected type {}".format(type(img)))
+
+            class CustomRandomFlip(BaseTransform):
+                def __init__(self, prob=0.5, keys=None):
+                    super(CustomRandomFlip, self).__init__(keys)
+                    self.prob = prob
+
+                def _get_params(self, inputs):
+                    image = inputs[self.keys.index('image')]
+                    params = {}
+                    params['flip'] = np.random.random() < self.prob
+                    params['size'] = _get_image_size(image)
+                    return params
+
+                def _apply_image(self, image):
+                    if self.params['flip']:
+                        return F.hflip(image)
+                    return image
+
+                # if you only want to transform image, do not need to rewrite this function
+                def _apply_coords(self, coords):
+                    if self.params['flip']:
+                        w = self.params['size'][0]
+                        coords[:, 0] = w - coords[:, 0]
+                    return coords
+
+                # if you only want to transform image, do not need to rewrite this function
+                def _apply_boxes(self, boxes):
+                    idxs = np.array([(0, 1), (2, 1), (0, 3), (2, 3)]).flatten()
+                    coords = np.asarray(boxes).reshape(-1, 4)[:, idxs].reshape(-1, 2)
+                    coords = self._apply_coords(coords).reshape((-1, 4, 2))
+                    minxy = coords.min(axis=1)
+                    maxxy = coords.max(axis=1)
+                    trans_boxes = np.concatenate((minxy, maxxy), axis=1)
+                    return trans_boxes
+
+                # if you only want to transform image, do not need to rewrite this function
+                def _apply_mask(self, mask):
+                    if self.params['flip']:
+                        return F.hflip(mask)
+                    return mask
+
+            # create fake inputs
+            fake_img = Image.fromarray((np.random.rand(400, 500, 3) * 255.).astype('uint8'))
+            fake_boxes = np.array([[2, 3, 200, 300], [50, 60, 80, 100]])
+            fake_mask = fake_img.convert('L')
+
+            # only transform for image:
+            flip_transform = CustomRandomFlip(1.0)
+            converted_img = flip_transform(fake_img)
+
+            # transform for image, boxes and mask
+            flip_transform = CustomRandomFlip(1.0, keys=('image', 'boxes', 'mask'))
+            (converted_img, converted_boxes, converted_mask) = flip_transform((fake_img, fake_boxes, fake_mask))
+            print('converted boxes', converted_boxes)
 
-            from paddle import set_device
-            from paddle.vision.datasets import Flowers
-            from paddle.vision.transforms import Compose, BatchCompose, Resize
-
-            class NormalizeBatch(object):
-                def __init__(self,
-                             mean=[0.485, 0.456, 0.406],
-                             std=[0.229, 0.224, 0.225],
-                             scale=True,
-                             channel_first=True):
-
-                    self.mean = mean
-                    self.std = std
-                    self.scale = scale
-                    self.channel_first = channel_first
-                    if not (isinstance(self.mean, list) and isinstance(self.std, list) and
-                            isinstance(self.scale, bool)):
-                        raise TypeError("{}: input type is invalid.".format(self))
-                    from functools import reduce
-                    if reduce(lambda x, y: x * y, self.std) == 0:
-                        raise ValueError('{}: std is invalid!'.format(self))
-
-                def __call__(self, samples):
-                    for i in range(len(samples)):
-                        samples[i] = list(samples[i])
-                        im = samples[i][0]
-                        im = im.astype(np.float32, copy=False)
-                        mean = np.array(self.mean)[np.newaxis, np.newaxis, :]
-                        std = np.array(self.std)[np.newaxis, np.newaxis, :]
-                        if self.scale:
-                            im = im / 255.0
-                        im -= mean
-                        im /= std
-                        if self.channel_first:
-                            im = im.transpose((2, 0, 1))
-                        samples[i][0] = im
-                    return samples
-
-            transform = Compose([Resize((500, 500))])
-            flowers_dataset = Flowers(mode='test', transform=transform)
-
-            device = set_device('cpu')
-
-            collate_fn = BatchCompose([NormalizeBatch()])
-            loader = DataLoader(
-                        flowers_dataset,
-                        batch_size=4,
-                        places=device,
-                        return_list=True,
-                        collate_fn=collate_fn)
-
-            for data in loader:
-                # do something
-                break
     """
 
-    def __init__(self, transforms=[]):
-        self.transforms = transforms
+    def __init__(self, keys=None):
+        if keys is None:
+            keys = ("image", )
+        elif not isinstance(keys, Sequence):
+            raise ValueError(
+                "keys should be a sequence, but got keys={}".format(keys))
+        for k in keys:
+            if self._get_apply(k) is None:
+                raise NotImplementedError(
+                    "{} is unsupported data structure".format(k))
+        self.keys = keys
+
+        # stores the params obtained from _get_params()
+        self.params = None
+
+    def _get_params(self, inputs):
+        pass
+
+    def __call__(self, inputs):
+        """Apply transform on single input data"""
+        if not isinstance(inputs, tuple):
+            inputs = (inputs, )
+
+        self.params = self._get_params(inputs)
+
+        outputs = []
+        for i in range(min(len(inputs), len(self.keys))):
+            apply_func = self._get_apply(self.keys[i])
+            if apply_func is None:
+                outputs.append(inputs[i])
+            else:
+                outputs.append(apply_func(inputs[i]))
+        if len(inputs) > len(self.keys):
+            outputs.extend(inputs[len(self.keys):])
+
+        if len(outputs) == 1:
+            outputs = outputs[0]
+        else:
+            outputs = tuple(outputs)
+        return outputs
 
-    def __call__(self, data):
-        for f in self.transforms:
-            try:
-                data = f(data)
-            except Exception as e:
-                stack_info = traceback.format_exc()
-                print("fail to perform batch transform [{}] with error: "
-                      "{} and stack:\n{}".format(f, e, str(stack_info)))
-                raise e
+    def _get_apply(self, key):
+        return getattr(self, "_apply_{}".format(key), None)
 
-        # sample list to batch data
-        batch = list(zip(*data))
+    def _apply_image(self, image):
+        raise NotImplementedError
 
-        return batch
+    def _apply_boxes(self, boxes):
+        raise NotImplementedError
 
+    def _apply_mask(self, mask):
+        raise NotImplementedError
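The `__call__` logic above pairs the i-th element of `inputs` with the i-th key and looks up the matching `_apply_<key>` method via `getattr`; a stripped-down illustration of that pairing, independent of Paddle (`Tag` is a hypothetical stand-in class):

.. code-block:: python

    class Tag:
        # minimal stand-in for BaseTransform's key -> _apply_* lookup
        keys = ('image', 'boxes')

        def _apply_image(self, x):
            return 'img:' + x

        def _apply_boxes(self, x):
            return 'box:' + x

        def __call__(self, inputs):
            return tuple(
                getattr(self, '_apply_' + k)(v)
                for k, v in zip(self.keys, inputs))

    print(Tag()(('a', 'b')))  # ('img:a', 'box:b')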
 
 
-class Resize(object):
+class ToTensor(BaseTransform):
+    """Convert a ``PIL.Image`` or ``numpy.ndarray`` to ``paddle.Tensor``.
+
+    Converts a PIL.Image or numpy.ndarray (H x W x C) in the range
+    [0, 255] to a paddle.Tensor of shape (C x H x W) in the range [0.0, 1.0]
+    if the PIL Image belongs to one of the modes (L, LA, P, I, F, RGB, YCbCr, RGBA, CMYK, 1)
+    or if the numpy.ndarray has dtype = np.uint8
+
+    In the other cases, tensors are returned without scaling.
+
+    Args:
+        data_format (str, optional): Data format of input img, should be 'HWC' or
+            'CHW'. Default: 'CHW'.
+        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.
+
+    Examples:
+
+        .. code-block:: python
+
+            import numpy as np
+            from PIL import Image
+
+            import paddle.vision.transforms as T
+            import paddle.vision.transforms.functional as F
+
+            fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8))
+
+            transform = T.ToTensor()
+
+            tensor = transform(fake_img)
+
+    """
+
+    def __init__(self, data_format='CHW', keys=None):
+        super(ToTensor, self).__init__(keys)
+        self.data_format = data_format
+
+    def _apply_image(self, img):
+        """
+        Args:
+            img (PIL.Image|np.ndarray): Image to be converted to tensor.
+
+        Returns:
+            Tensor: Converted image.
+        """
+        return F.to_tensor(img, self.data_format)
+
+
+class Resize(BaseTransform):
     """Resize the input Image to the given size.
 
     Args:
@@ -214,97 +350,111 @@ class Resize(object):
             smaller edge of the image will be matched to this number.
             i.e., if height > width, then image will be rescaled to
            (size * height / width, size)
-        interpolation (int, optional): Interpolation mode of resize. Default: 1.
-            0 : cv2.INTER_NEAREST
-            1 : cv2.INTER_LINEAR
-            2 : cv2.INTER_CUBIC
-            3 : cv2.INTER_AREA
-            4 : cv2.INTER_LANCZOS4
-            5 : cv2.INTER_LINEAR_EXACT
-            7 : cv2.INTER_MAX
-            8 : cv2.WARP_FILL_OUTLIERS
-            16: cv2.WARP_INVERSE_MAP
+        interpolation (int|str, optional): Interpolation method. Default: 'bilinear'.
+            When using the pil backend, supported methods are as follows:
+            - "nearest": Image.NEAREST,
+            - "bilinear": Image.BILINEAR,
+            - "bicubic": Image.BICUBIC,
+            - "box": Image.BOX,
+            - "lanczos": Image.LANCZOS,
+            - "hamming": Image.HAMMING
+            When using the cv2 backend, supported methods are as follows:
+            - "nearest": cv2.INTER_NEAREST,
+            - "bilinear": cv2.INTER_LINEAR,
+            - "area": cv2.INTER_AREA,
+            - "bicubic": cv2.INTER_CUBIC,
+            - "lanczos": cv2.INTER_LANCZOS4
+        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.
 
     Examples:
     
         .. code-block:: python
 
             import numpy as np
-
+            from PIL import Image
             from paddle.vision.transforms import Resize
 
             transform = Resize(size=224)
 
-            fake_img = np.random.rand(500, 500, 3).astype('float32')
+            fake_img = Image.fromarray((np.random.rand(100, 120, 3) * 255.).astype(np.uint8))
 
             fake_img = transform(fake_img)
-            print(fake_img.shape)
+            print(fake_img.size)
     """
 
-    def __init__(self, size, interpolation=1):
+    def __init__(self, size, interpolation='bilinear', keys=None):
+        super(Resize, self).__init__(keys)
         assert isinstance(size, int) or (isinstance(size, Iterable) and
                                          len(size) == 2)
         self.size = size
         self.interpolation = interpolation
 
-    def __call__(self, img):
+    def _apply_image(self, img):
         return F.resize(img, self.size, self.interpolation)
 
 
-class RandomResizedCrop(object):
+class RandomResizedCrop(BaseTransform):
    """Crop the input data to random size and aspect ratio.
    A crop of random size (default: of 0.08 to 1.0) of the original size and a random
    aspect ratio (default: of 3/4 to 1.33) of the original aspect ratio is made.
    After applying the crop transform, the input data will be resized to the given size.
 
    Args:
-        output_size (int|list|tuple): Target size of output image, with (height, width) shape.
+        size (int|list|tuple): Target size of output image, with (height, width) shape.
        scale (list|tuple): Range of size of the origin size cropped. Default: (0.08, 1.0)
        ratio (list|tuple): Range of aspect ratio of the origin aspect ratio cropped. Default: (0.75, 1.33)
-        interpolation (int, optional): Interpolation mode of resize. Default: 1.
-            0 : cv2.INTER_NEAREST
-            1 : cv2.INTER_LINEAR
-            2 : cv2.INTER_CUBIC
-            3 : cv2.INTER_AREA
-            4 : cv2.INTER_LANCZOS4
-            5 : cv2.INTER_LINEAR_EXACT
-            7 : cv2.INTER_MAX
-            8 : cv2.WARP_FILL_OUTLIERS
-            16: cv2.WARP_INVERSE_MAP
+        interpolation (int|str, optional): Interpolation method. Default: 'bilinear'.
+            When using the pil backend, supported methods are as follows:
+            - "nearest": Image.NEAREST,
+            - "bilinear": Image.BILINEAR,
+            - "bicubic": Image.BICUBIC,
+            - "box": Image.BOX,
+            - "lanczos": Image.LANCZOS,
+            - "hamming": Image.HAMMING
+            When using the cv2 backend, supported methods are as follows:
+            - "nearest": cv2.INTER_NEAREST,
+            - "bilinear": cv2.INTER_LINEAR,
+            - "area": cv2.INTER_AREA,
+            - "bicubic": cv2.INTER_CUBIC,
+            - "lanczos": cv2.INTER_LANCZOS4
+        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.
 
     Examples:
     
         .. code-block:: python
 
             import numpy as np
-
+            from PIL import Image
             from paddle.vision.transforms import RandomResizedCrop
 
             transform = RandomResizedCrop(224)
 
-            fake_img = np.random.rand(500, 500, 3).astype('float32')
+            fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8))
 
             fake_img = transform(fake_img)
-            print(fake_img.shape)
+            print(fake_img.size)
+
     """
 
     def __init__(self,
-                 output_size,
+                 size,
                  scale=(0.08, 1.0),
                  ratio=(3. / 4, 4. / 3),
-                 interpolation=1):
-        if isinstance(output_size, int):
-            self.output_size = (output_size, output_size)
+                 interpolation='bilinear',
+                 keys=None):
+        super(RandomResizedCrop, self).__init__(keys)
+        if isinstance(size, int):
+            self.size = (size, size)
         else:
-            self.output_size = output_size
+            self.size = size
        assert (scale[0] <= scale[1]), "scale should be of kind (min, max)"
        assert (ratio[0] <= ratio[1]), "ratio should be of kind (min, max)"
        self.scale = scale
        self.ratio = ratio
        self.interpolation = interpolation
 
-    def _get_params(self, image, attempts=10):
-        height, width, _ = image.shape
+    def _get_param(self, image, attempts=10):
+        width, height = _get_image_size(image)
        area = height * width
 
        for _ in range(attempts):
@@ -316,9 +466,9 @@ def _get_params(self, image, attempts=10):
            h = int(round(math.sqrt(target_area / aspect_ratio)))
 
            if 0 < w <= width and 0 < h <= height:
-                x = np.random.randint(0, width - w + 1)
-                y = np.random.randint(0, height - h + 1)
-                return x, y, w, h
+                i = random.randint(0, height - h)
+                j = random.randint(0, width - w)
+                return i, j, h, w
 
        # Fallback to central crop
        in_ratio = float(width) / float(height)
@@ -328,179 +478,123 @@ def _get_params(self, image, attempts=10):
        elif in_ratio > max(self.ratio):
            h = height
            w = int(round(h * max(self.ratio)))
-        else:  # whole image
+        else:
+            # return whole image
            w = width
            h = height
-        x = (width - w) // 2
-        y = (height - h) // 2
-        return x, y, w, h
-
-    def __call__(self, img):
-        x, y, w, h = self._get_params(img)
-        cropped_img = img[y:y + h, x:x + w]
-        return F.resize(cropped_img, self.output_size, self.interpolation)
-
-
-class CenterCropResize(object):
-    """Crops to center of image with padding then scales size.
-
-    Args:
-        size (int|list|tuple): Target size of output image, with (height, width) shape.
-        crop_padding (int): Center crop with the padding. Default: 32.
-        interpolation (int, optional): Interpolation mode of resize. Default: 1.
-            0 : cv2.INTER_NEAREST
-            1 : cv2.INTER_LINEAR
-            2 : cv2.INTER_CUBIC
-            3 : cv2.INTER_AREA
-            4 : cv2.INTER_LANCZOS4
-            5 : cv2.INTER_LINEAR_EXACT
-            7 : cv2.INTER_MAX
-            8 : cv2.WARP_FILL_OUTLIERS
-            16: cv2.WARP_INVERSE_MAP
-
-    Examples:
-
-        .. code-block:: python
-
-            import numpy as np
-
-            from paddle.vision.transforms import CenterCropResize
-
-            transform = CenterCropResize(224)
-
-            fake_img = np.random.rand(500, 500, 3).astype('float32')
-
-            fake_img = transform(fake_img)
-            print(fake_img.shape)
-    """
-
-    def __init__(self, size, crop_padding=32, interpolation=1):
-        if isinstance(size, int):
-            self.size = (size, size)
-        else:
-            self.size = size
-        self.crop_padding = crop_padding
-        self.interpolation = interpolation
-
-    def _get_params(self, img):
-        h, w = img.shape[:2]
-        size = min(self.size)
-        c = int(size / (size + self.crop_padding) * min((h, w)))
-        x = (h + 1 - c) // 2
-        y = (w + 1 - c) // 2
-        return c, x, y
-
-    def __call__(self, img):
-        c, x, y = self._get_params(img)
-        cropped_img = img[x:x + c, y:y + c, :]
+        i = (height - h) // 2
+        j = (width - w) // 2
+        return i, j, h, w
+
+    def _apply_image(self, img):
+        i, j, h, w = self._get_param(img)
+
+        cropped_img = F.crop(img, i, j, h, w)
         return F.resize(cropped_img, self.size, self.interpolation)
 
 
-class CenterCrop(object):
+class CenterCrop(BaseTransform):
     """Crops the given input data at the center.
 
     Args:
-        output_size: Target size of output image, with (height, width) shape.
-
+        size (int|list|tuple): Target size of output image, with (height, width) shape.
+        keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None.
+        
     Examples:
     
         .. code-block:: python
 
             import numpy as np
-
+            from PIL import Image
             from paddle.vision.transforms import CenterCrop
 
             transform = CenterCrop(224)
 
-            fake_img = np.random.rand(500, 500, 3).astype('float32')
+            fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8))
 
             fake_img = transform(fake_img)
-            print(fake_img.shape)
+            print(fake_img.size)
     """
 
-    def __init__(self, output_size):
-        if isinstance(output_size, int):
-            self.output_size = (output_size, output_size)
+    def __init__(self, size, keys=None):
+        super(CenterCrop, self).__init__(keys)
+        if isinstance(size, numbers.Number):
+            self.size = (int(size), int(size))
         else:
-            self.output_size = output_size
-
-    def _get_params(self, img):
-        th, tw = self.output_size
-        h, w, _ = img.shape
-        assert th <= h and tw <= w, "output size is bigger than image size"
-        x = int(round((w - tw) / 2.0))
-        y = int(round((h - th) / 2.0))
-        return x, y
+            self.size = size
 
-    def __call__(self, img):
-        x, y = self._get_params(img)
-        th, tw = self.output_size
-        return img[y:y + th, x:x + tw]
+    def _apply_image(self, img):
+        return F.center_crop(img, self.size)
 
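The crop parameters sampled by `RandomResizedCrop._get_param` above draw the aspect ratio log-uniformly, so that a ratio r and its inverse 1/r are equally likely; that sampling step in isolation (values are illustrative):

.. code-block:: python

    import math
    import numpy as np

    scale = (0.08, 1.0)
    ratio = (3. / 4, 4. / 3)
    area = 300 * 320                                  # height * width
    target_area = np.random.uniform(*scale) * area
    log_ratio = tuple(math.log(r) for r in ratio)
    aspect_ratio = math.exp(np.random.uniform(*log_ratio))
    w = int(round(math.sqrt(target_area * aspect_ratio)))
    h = int(round(math.sqrt(target_area / aspect_ratio)))
    print(w, h)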
code-block:: python import numpy as np - + from PIL import Image from paddle.vision.transforms import RandomHorizontalFlip transform = RandomHorizontalFlip(224) - fake_img = np.random.rand(500, 500, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - print(fake_img.shape) + print(fake_img.size) """ - def __init__(self, prob=0.5): + def __init__(self, prob=0.5, keys=None): + super(RandomHorizontalFlip, self).__init__(keys) self.prob = prob - def __call__(self, img): - if np.random.random() < self.prob: - return F.flip(img, code=1) + def _apply_image(self, img): + if random.random() < self.prob: + return F.hflip(img) return img -class RandomVerticalFlip(object): +class RandomVerticalFlip(BaseTransform): """Vertically flip the input data randomly with a given probability. Args: - prob (float): Probability of the input data being flipped. Default: 0.5 + prob (float, optional): Probability of the input data being flipped. Default: 0.5 + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. Examples: .. code-block:: python import numpy as np - + from PIL import Image from paddle.vision.transforms import RandomVerticalFlip transform = RandomVerticalFlip(224) - fake_img = np.random.rand(500, 500, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - print(fake_img.shape) + print(fake_img.size) + """ - def __init__(self, prob=0.5): + def __init__(self, prob=0.5, keys=None): + super(RandomVerticalFlip, self).__init__(keys) self.prob = prob - def __call__(self, img): - if np.random.random() < self.prob: - return F.flip(img, code=0) + def _apply_image(self, img): + if random.random() < self.prob: + return F.vflip(img) return img -class Normalize(object): +class Normalize(BaseTransform): """Normalize the input data with mean and standard deviation. Given mean: ``(M1,...,Mn)`` and std: ``(S1,..,Sn)`` for ``n`` channels, this transform will normalize each channel of the input data. @@ -509,286 +603,240 @@ class Normalize(object): Args: mean (int|float|list): Sequence of means for each channel. std (int|float|list): Sequence of standard deviations for each channel. - + data_format (str, optional): Data format of img, should be 'HWC' or + 'CHW'. Default: 'CHW'. + to_rgb (bool, optional): Whether to convert to rgb. Default: False. + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + Examples: .. 
code-block:: python import numpy as np - + from PIL import Image from paddle.vision.transforms import Normalize - normalize = Normalize(mean=[0.5, 0.5, 0.5], - std=[0.5, 0.5, 0.5]) + normalize = Normalize(mean=[127.5, 127.5, 127.5], + std=[127.5, 127.5, 127.5], + data_format='HWC') - fake_img = np.random.rand(3, 500, 500).astype('float32') + fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8)) fake_img = normalize(fake_img) print(fake_img.shape) + print(fake_img.max, fake_img.max) """ - def __init__(self, mean=0.0, std=1.0): + def __init__(self, + mean=0.0, + std=1.0, + data_format='CHW', + to_rgb=False, + keys=None): + super(Normalize, self).__init__(keys) if isinstance(mean, numbers.Number): mean = [mean, mean, mean] if isinstance(std, numbers.Number): std = [std, std, std] - self.mean = np.array(mean, dtype=np.float32).reshape(len(mean), 1, 1) - self.std = np.array(std, dtype=np.float32).reshape(len(std), 1, 1) + self.mean = mean + self.std = std + self.data_format = data_format + self.to_rgb = to_rgb - def __call__(self, img): - return (img - self.mean) / self.std + def _apply_image(self, img): + return F.normalize(img, self.mean, self.std, self.data_format, + self.to_rgb) -class Permute(object): - """Change input data to a target mode. +class Transpose(BaseTransform): + """Transpose input data to a target format. For example, most transforms use HWC mode image, while the Neural Network might use CHW mode input tensor. - Input image should be HWC mode and an instance of numpy.ndarray. + output image will be an instance of numpy.ndarray. Args: - mode (str): Output mode of input. Default: "CHW". - to_rgb (bool): Convert 'bgr' image to 'rgb'. Default: True. - + order (list|tuple, optional): Target order of input data. Default: (2, 0, 1). + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + Examples: .. code-block:: python import numpy as np + from PIL import Image + from paddle.vision.transforms import Transpose - from paddle.vision.transforms import Permute + transform = Transpose() - transform = Permute() - - fake_img = np.random.rand(500, 500, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(300, 320, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) print(fake_img.shape) - """ - - def __init__(self, mode="CHW", to_rgb=True): - assert mode in [ - "CHW" - ], "Only support 'CHW' mode, but received mode: {}".format(mode) - self.mode = mode - self.to_rgb = to_rgb - - def __call__(self, img): - if self.to_rgb: - img = img[..., ::-1] - if self.mode == "CHW": - return img.transpose((2, 0, 1)) - return img - - -class GaussianNoise(object): - """Add random gaussian noise to the input data. - Gaussian noise is generated with given mean and std. - - Args: - mean (float): Gaussian mean used to generate noise. - std (float): Gaussian standard deviation used to generate noise. - - Examples: - .. 
code-block:: python - - import numpy as np - - from paddle.vision.transforms import GaussianNoise - - transform = GaussianNoise() - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) """ - def __init__(self, mean=0.0, std=1.0): - self.mean = np.array(mean, dtype=np.float32) - self.std = np.array(std, dtype=np.float32) + def __init__(self, order=(2, 0, 1), keys=None): + super(Transpose, self).__init__(keys) + self.order = order + + def _apply_image(self, img): + if F._is_pil_image(img): + img = np.asarray(img) - def __call__(self, img): - dtype = img.dtype - noise = np.random.normal(self.mean, self.std, img.shape) * 255 - img = img + noise.astype(np.float32) - return np.clip(img, 0, 255).astype(dtype) + return img.transpose(self.order) -class BrightnessTransform(object): +class BrightnessTransform(BaseTransform): """Adjust brightness of the image. Args: value (float): How much to adjust the brightness. Can be any non negative number. 0 gives the original image + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. Examples: .. code-block:: python import numpy as np - + from PIL import Image from paddle.vision.transforms import BrightnessTransform transform = BrightnessTransform(0.4) - fake_img = np.random.rand(500, 500, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - print(fake_img.shape) + """ - def __init__(self, value): - if value < 0: - raise ValueError("brightness value should be non-negative") - self.value = value + def __init__(self, value, keys=None): + super(BrightnessTransform, self).__init__(keys) + self.value = _check_input(value, 'brightness') - def __call__(self, img): - if self.value == 0: + def _apply_image(self, img): + if self.value is None: return img - dtype = img.dtype - img = img.astype(np.float32) - alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) - img = img * alpha - return img.clip(0, 255).astype(dtype) + brightness_factor = random.uniform(self.value[0], self.value[1]) + return F.adjust_brightness(img, brightness_factor) -class ContrastTransform(object): +class ContrastTransform(BaseTransform): """Adjust contrast of the image. Args: value (float): How much to adjust the contrast. Can be any non negative number. 0 gives the original image + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. Examples: .. 
code-block:: python import numpy as np - + from PIL import Image from paddle.vision.transforms import ContrastTransform transform = ContrastTransform(0.4) - fake_img = np.random.rand(500, 500, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - print(fake_img.shape) + """ - def __init__(self, value): + def __init__(self, value, keys=None): + super(ContrastTransform, self).__init__(keys) if value < 0: raise ValueError("contrast value should be non-negative") - self.value = value + self.value = _check_input(value, 'contrast') - def __call__(self, img): - if self.value == 0: + def _apply_image(self, img): + if self.value is None: return img - cv2 = try_import('cv2') - dtype = img.dtype - img = img.astype(np.float32) - alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) - img = img * alpha + cv2.cvtColor(img, cv2.COLOR_BGR2GRAY).mean() * ( - 1 - alpha) - return img.clip(0, 255).astype(dtype) + contrast_factor = random.uniform(self.value[0], self.value[1]) + return F.adjust_contrast(img, contrast_factor) -class SaturationTransform(object): +class SaturationTransform(BaseTransform): """Adjust saturation of the image. Args: value (float): How much to adjust the saturation. Can be any non negative number. 0 gives the original image + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. Examples: .. code-block:: python import numpy as np - + from PIL import Image from paddle.vision.transforms import SaturationTransform transform = SaturationTransform(0.4) - fake_img = np.random.rand(500, 500, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - print(fake_img.shape) + """ - def __init__(self, value): - if value < 0: - raise ValueError("saturation value should be non-negative") - self.value = value + def __init__(self, value, keys=None): + super(SaturationTransform, self).__init__(keys) + self.value = _check_input(value, 'saturation') - def __call__(self, img): - if self.value == 0: + def _apply_image(self, img): + if self.value is None: return img - cv2 = try_import('cv2') + saturation_factor = random.uniform(self.value[0], self.value[1]) + return F.adjust_saturation(img, saturation_factor) - dtype = img.dtype - img = img.astype(np.float32) - alpha = np.random.uniform(max(0, 1 - self.value), 1 + self.value) - gray_img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) - gray_img = gray_img[..., np.newaxis] - img = img * alpha + gray_img * (1 - alpha) - return img.clip(0, 255).astype(dtype) - -class HueTransform(object): +class HueTransform(BaseTransform): """Adjust hue of the image. Args: value (float): How much to adjust the hue. Can be any number between 0 and 0.5, 0 gives the original image + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. Examples: .. 
code-block:: python import numpy as np - + from PIL import Image from paddle.vision.transforms import HueTransform transform = HueTransform(0.4) - fake_img = np.random.rand(500, 500, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - print(fake_img.shape) + """ - def __init__(self, value): - if value < 0 or value > 0.5: - raise ValueError("hue value should be in [0.0, 0.5]") - self.value = value + def __init__(self, value, keys=None): + super(HueTransform, self).__init__(keys) + self.value = _check_input( + value, 'hue', center=0, bound=(-0.5, 0.5), clip_first_on_zero=False) - def __call__(self, img): - if self.value == 0: + def _apply_image(self, img): + if self.value is None: return img - cv2 = try_import('cv2') - dtype = img.dtype - img = img.astype(np.uint8) - hsv_img = cv2.cvtColor(img, cv2.COLOR_BGR2HSV_FULL) - h, s, v = cv2.split(hsv_img) - - alpha = np.random.uniform(-self.value, self.value) - h = h.astype(np.uint8) - # uint8 addition take cares of rotation across boundaries - with np.errstate(over="ignore"): - h += np.uint8(alpha * 255) - hsv_img = cv2.merge([h, s, v]) - return cv2.cvtColor(hsv_img, cv2.COLOR_HSV2BGR_FULL).astype(dtype) + hue_factor = random.uniform(self.value[0], self.value[1]) + return F.adjust_hue(img, hue_factor) -class ColorJitter(object): +class ColorJitter(BaseTransform): """Randomly change the brightness, contrast, saturation and hue of an image. Args: @@ -800,42 +848,74 @@ class ColorJitter(object): Chosen uniformly from [max(0, 1 - saturation), 1 + saturation]. Should be non negative numbers. hue: How much to jitter hue. Chosen uniformly from [-hue, hue]. Should have 0<= hue <= 0.5. + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. Examples: .. code-block:: python import numpy as np - + from PIL import Image from paddle.vision.transforms import ColorJitter - transform = ColorJitter(0.4) + transform = ColorJitter(0.4, 0.4, 0.4, 0.4) - fake_img = np.random.rand(500, 500, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - print(fake_img.shape) + """ - def __init__(self, brightness=0, contrast=0, saturation=0, hue=0): + def __init__(self, brightness=0, contrast=0, saturation=0, hue=0, + keys=None): + super(ColorJitter, self).__init__(keys) + self.brightness = brightness + self.contrast = contrast + self.saturation = saturation + self.hue = hue + + def _get_param(self, brightness, contrast, saturation, hue): + """Get a randomized transform to be applied on image. + + Arguments are same as that of __init__. + + Returns: + Transform which randomly adjusts brightness, contrast and + saturation in a random order. 
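+            Hue is adjusted as well when a non-zero hue range is given.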
+ """ transforms = [] - if brightness != 0: - transforms.append(BrightnessTransform(brightness)) - if contrast != 0: - transforms.append(ContrastTransform(contrast)) - if saturation != 0: - transforms.append(SaturationTransform(saturation)) - if hue != 0: - transforms.append(HueTransform(hue)) + + if brightness is not None: + transforms.append(BrightnessTransform(brightness, self.keys)) + + if contrast is not None: + transforms.append(ContrastTransform(contrast, self.keys)) + + if saturation is not None: + transforms.append(SaturationTransform(saturation, self.keys)) + + if hue is not None: + transforms.append(HueTransform(hue, self.keys)) random.shuffle(transforms) - self.transforms = Compose(transforms) + transform = Compose(transforms) - def __call__(self, img): - return self.transforms(img) + return transform + def _apply_image(self, img): + """ + Args: + img (PIL Image): Input image. -class RandomCrop(object): + Returns: + PIL Image: Color jittered image. + """ + transform = self._get_param(self.brightness, self.contrast, + self.saturation, self.hue) + return transform(img) + + +class RandomCrop(BaseTransform): """Crops the given CV Image at a random location. Args: @@ -847,159 +927,88 @@ class RandomCrop(object): top, right, bottom borders respectively. Default: 0. pad_if_needed (boolean|optional): It will pad the image if smaller than the desired size to avoid raising an exception. Default: False. - + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + Examples: .. code-block:: python import numpy as np - + from PIL import Image from paddle.vision.transforms import RandomCrop transform = RandomCrop(224) - fake_img = np.random.rand(500, 500, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(324, 300, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - print(fake_img.shape) + print(fake_img.size) """ - def __init__(self, size, padding=0, pad_if_needed=False): + def __init__(self, + size, + padding=None, + pad_if_needed=False, + fill=0, + padding_mode='constant', + keys=None): + super(RandomCrop, self).__init__(keys) if isinstance(size, numbers.Number): self.size = (int(size), int(size)) else: self.size = size self.padding = padding self.pad_if_needed = pad_if_needed + self.fill = fill + self.padding_mode = padding_mode - def _get_params(self, img, output_size): + def _get_param(self, img, output_size): """Get parameters for ``crop`` for a random crop. Args: - img (numpy.ndarray): Image to be cropped. + img (PIL Image): Image to be cropped. output_size (tuple): Expected output size of the crop. Returns: tuple: params (i, j, h, w) to be passed to ``crop`` for random crop. - """ - h, w, _ = img.shape + w, h = _get_image_size(img) th, tw = output_size if w == tw and h == th: return 0, 0, h, w - try: - i = random.randint(0, h - th) - except ValueError: - i = random.randint(h - th, 0) - try: - j = random.randint(0, w - tw) - except ValueError: - j = random.randint(w - tw, 0) + i = random.randint(0, h - th) + j = random.randint(0, w - tw) return i, j, th, tw - def __call__(self, img): + def _apply_image(self, img): """ - Args: - img (numpy.ndarray): Image to be cropped. - Returns: - numpy.ndarray: Cropped image. + img (PIL Image): Image to be cropped. + Returns: + PIL Image: Cropped image. 
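+            When ``pad_if_needed`` is True, the image is first padded up
+            to the target size before the crop position is drawn.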
""" - if self.padding > 0: - img = F.pad(img, self.padding) + if self.padding is not None: + img = F.pad(img, self.padding, self.fill, self.padding_mode) + + w, h = _get_image_size(img) # pad the width if needed - if self.pad_if_needed and img.shape[1] < self.size[1]: - img = F.pad(img, (int((1 + self.size[1] - img.shape[1]) / 2), 0)) + if self.pad_if_needed and w < self.size[1]: + img = F.pad(img, (self.size[1] - w, 0), self.fill, + self.padding_mode) # pad the height if needed - if self.pad_if_needed and img.shape[0] < self.size[0]: - img = F.pad(img, (0, int((1 + self.size[0] - img.shape[0]) / 2))) - - i, j, h, w = self._get_params(img, self.size) - - return img[i:i + h, j:j + w] - - -class RandomErasing(object): - """Randomly selects a rectangle region in an image and erases its pixels. - ``Random Erasing Data Augmentation`` by Zhong et al. - See https://arxiv.org/pdf/1708.04896.pdf - - Args: - prob (float): probability that the random erasing operation will be performed. - scale (tuple): range of proportion of erased area against input image. Should be (min, max). - ratio (float): range of aspect ratio of erased area. - value (float|list|tuple): erasing value. If a single int, it is used to - erase all pixels. If a tuple of length 3, it is used to erase - R, G, B channels respectively. Default: 0. - - Examples: - - .. code-block:: python - - import numpy as np - - from paddle.vision.transforms import RandomCrop - - transform = RandomCrop(224) - - fake_img = np.random.rand(500, 500, 3).astype('float32') - - fake_img = transform(fake_img) - print(fake_img.shape) - """ - - def __init__(self, - prob=0.5, - scale=(0.02, 0.4), - ratio=0.3, - value=[0., 0., 0.]): - assert isinstance(value, ( - float, Sequence - )), "Expected type of value in [float, list, tupue], but got {}".format( - type(value)) - assert scale[0] <= scale[1], "scale range should be of kind (min, max)!" - - if isinstance(value, float): - self.value = [value, value, value] - else: - self.value = value - - self.p = prob - self.scale = scale - self.ratio = ratio - - def __call__(self, img): - if random.uniform(0, 1) > self.p: - return img - - for _ in range(100): - area = img.shape[0] * img.shape[1] - - target_area = random.uniform(self.scale[0], self.scale[1]) * area - aspect_ratio = random.uniform(self.ratio, 1 / self.ratio) - - h = int(round(math.sqrt(target_area * aspect_ratio))) - w = int(round(math.sqrt(target_area / aspect_ratio))) + if self.pad_if_needed and h < self.size[0]: + img = F.pad(img, (0, self.size[0] - h), self.fill, + self.padding_mode) - if w < img.shape[1] and h < img.shape[0]: - x1 = random.randint(0, img.shape[0] - h) - y1 = random.randint(0, img.shape[1] - w) + i, j, h, w = self._get_param(img, self.size) - if len(img.shape) == 3 and img.shape[2] == 3: - img[x1:x1 + h, y1:y1 + w, 0] = self.value[0] - img[x1:x1 + h, y1:y1 + w, 1] = self.value[1] - img[x1:x1 + h, y1:y1 + w, 2] = self.value[2] - else: - img[x1:x1 + h, y1:y1 + w] = self.value[1] - return img - - return img + return F.crop(img, i, j, h, w) -class Pad(object): +class Pad(BaseTransform): """Pads the given CV Image on all sides with the given "pad" value. Args: @@ -1020,64 +1029,73 @@ class Pad(object): ``symmetric`` menas pads with reflection of image (repeating the last value on the edge) padding ``[1, 2, 3, 4]`` with 2 elements on both sides in symmetric mode will result in ``[2, 1, 1, 2, 3, 4, 4, 3]``. - + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + Examples: .. 
code-block:: python import numpy as np - + from PIL import Image from paddle.vision.transforms import Pad transform = Pad(2) - fake_img = np.random.rand(500, 500, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - print(fake_img.shape) + print(fake_img.size) """ - def __init__(self, padding, fill=0, padding_mode='constant'): + def __init__(self, padding, fill=0, padding_mode='constant', keys=None): assert isinstance(padding, (numbers.Number, list, tuple)) assert isinstance(fill, (numbers.Number, str, list, tuple)) assert padding_mode in ['constant', 'edge', 'reflect', 'symmetric'] - if isinstance(padding, - collections.Sequence) and len(padding) not in [2, 4]: + + if isinstance(padding, list): + padding = tuple(padding) + if isinstance(fill, list): + fill = tuple(fill) + + if isinstance(padding, Sequence) and len(padding) not in [2, 4]: raise ValueError( "Padding must be an int or a 2, or 4 element tuple, not a " + "{} element tuple".format(len(padding))) + super(Pad, self).__init__(keys) self.padding = padding self.fill = fill self.padding_mode = padding_mode - def __call__(self, img): + def _apply_image(self, img): """ Args: - img (numpy.ndarray): Image to be padded. + img (PIL Image): Image to be padded. + Returns: - numpy.ndarray: Padded image. + PIL Image: Padded image. """ return F.pad(img, self.padding, self.fill, self.padding_mode) -class RandomRotate(object): +class RandomRotation(BaseTransform): """Rotates the image by angle. Args: degrees (sequence or float or int): Range of degrees to select from. If degrees is a number instead of sequence like (min, max), the range of degrees will be (-degrees, +degrees) clockwise order. - interpolation (int, optional): Interpolation mode of resize. Default: 1. - 0 : cv2.INTER_NEAREST - 1 : cv2.INTER_LINEAR - 2 : cv2.INTER_CUBIC - 3 : cv2.INTER_AREA - 4 : cv2.INTER_LANCZOS4 - 5 : cv2.INTER_LINEAR_EXACT - 7 : cv2.INTER_MAX - 8 : cv2.WARP_FILL_OUTLIERS - 16: cv2.WARP_INVERSE_MAP + interpolation (int|str, optional): Interpolation method. Default: 'bilinear'. + resample (int|str, optional): An optional resampling filter. If omitted, or if the + image has only one channel, it is set to PIL.Image.NEAREST or cv2.INTER_NEAREST + according the backend. when use pil backend, support method are as following: + - "nearest": Image.NEAREST, + - "bilinear": Image.BILINEAR, + - "bicubic": Image.BICUBIC + when use cv2 backend, support method are as following: + - "nearest": cv2.INTER_NEAREST, + - "bilinear": cv2.INTER_LINEAR, + - "bicubic": cv2.INTER_CUBIC expand (bool|optional): Optional expansion flag. Default: False. If true, expands the output to make it large enough to hold the entire rotated image. If false or omitted, make the output image the same size as the input image. @@ -1085,24 +1103,31 @@ class RandomRotate(object): center (2-tuple|optional): Optional center of rotation. Origin is the upper left corner. Default is the center of the image. - + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + Examples: .. 
code-block:: python import numpy as np + from PIL import Image + from paddle.vision.transforms import RandomRotation - from paddle.vision.transforms import RandomRotate - - transform = RandomRotate(90) + transform = RandomRotation(90) - fake_img = np.random.rand(500, 400, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(200, 150, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - print(fake_img.shape) + print(fake_img.size) """ - def __init__(self, degrees, interpolation=1, expand=False, center=None): + def __init__(self, + degrees, + resample=False, + expand=False, + center=None, + fill=0, + keys=None): if isinstance(degrees, numbers.Number): if degrees < 0: raise ValueError( @@ -1114,37 +1139,39 @@ def __init__(self, degrees, interpolation=1, expand=False, center=None): "If degrees is a sequence, it must be of len 2.") self.degrees = degrees - self.interpolation = interpolation + super(RandomRotation, self).__init__(keys) + self.resample = resample self.expand = expand self.center = center + self.fill = fill - def _get_params(self, degrees): - """Get parameters for ``rotate`` for a random rotation. - Returns: - sequence: params to be passed to ``rotate`` for random rotation. - """ + def _get_param(self, degrees): angle = random.uniform(degrees[0], degrees[1]) return angle - def __call__(self, img): + def _apply_image(self, img): """ - img (np.ndarray): Image to be rotated. + Args: + img (PIL.Image|np.array): Image to be rotated. + Returns: - np.ndarray: Rotated image. + PIL.Image or np.array: Rotated image. """ - angle = self._get_params(self.degrees) + angle = self._get_param(self.degrees) - return F.rotate(img, angle, self.interpolation, self.expand, - self.center) + return F.rotate(img, angle, self.resample, self.expand, self.center, + self.fill) -class Grayscale(object): +class Grayscale(BaseTransform): """Converts image to grayscale. Args: - output_channels (int): (1 or 3) number of channels desired for output image + num_output_channels (int): (1 or 3) number of channels desired for output image + keys (list[str]|tuple[str], optional): Same as ``BaseTransform``. Default: None. + Returns: CV Image: Grayscale version of the input. - If output_channels == 1 : returned image is single channel @@ -1155,25 +1182,27 @@ class Grayscale(object): .. code-block:: python import numpy as np - + from PIL import Image from paddle.vision.transforms import Grayscale transform = Grayscale() - fake_img = np.random.rand(500, 400, 3).astype('float32') + fake_img = Image.fromarray((np.random.rand(224, 224, 3) * 255.).astype(np.uint8)) fake_img = transform(fake_img) - print(fake_img.shape) + print(np.array(fake_img).shape) """ - def __init__(self, output_channels=1): - self.output_channels = output_channels + def __init__(self, num_output_channels=1, keys=None): + super(Grayscale, self).__init__(keys) + self.num_output_channels = num_output_channels - def __call__(self, img): + def _apply_image(self, img): """ Args: - img (numpy.ndarray): Image to be converted to grayscale. + img (PIL Image): Image to be converted to grayscale. + Returns: - numpy.ndarray: Randomly grayscaled image. + PIL Image: Randomly grayscaled image. 
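+            With ``num_output_channels=3`` the gray value is replicated
+            across the three channels, so r == g == b.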
""" - return F.to_grayscale(img, num_output_channels=self.output_channels) + return F.to_grayscale(img, self.num_output_channels) From a911c19eb03755431e6416c4da3423ebd9c7e716 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 20 Oct 2020 04:48:56 -0500 Subject: [PATCH 008/185] fill_constant op supports NaN and Inf (#28109) * fill_constant supports nan and inf * add ut --- paddle/fluid/operators/fill_constant_op.h | 25 +++++++++++++------ .../tests/unittests/test_fill_constant_op.py | 10 ++++++++ 2 files changed, 27 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 41fcf3750878e..239083f88d9c6 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -14,9 +14,11 @@ limitations under the License. */ #pragma once +#include #include #include #include + #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -45,15 +47,22 @@ class FillConstantKernel : public framework::OpKernel { if (str_value.empty()) { value = static_cast(float_value); } else { - std::stringstream convert_stream(str_value); - if (std::is_same::value) { - int64_t tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); + // handle NaN/Inf first, which cannot be read from stream. + if (str_value == "inf") { + value = static_cast(std::numeric_limits::infinity()); + } else if (str_value == "nan") { + value = static_cast(std::numeric_limits::quiet_NaN()); } else { - double tmp_value; - convert_stream >> tmp_value; - value = static_cast(tmp_value); + std::stringstream convert_stream(str_value); + if (std::is_same::value) { + int64_t tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } else { + double tmp_value; + convert_stream >> tmp_value; + value = static_cast(tmp_value); + } } } if (ctx.HasInput("ValueTensor")) { diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index 43069470680c7..babfcdb9040df 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -330,6 +330,16 @@ def test_api(self): res4.numpy(), np.full( [1, 2], 88, dtype="int32")) + def test_nan(self): + with fluid.dygraph.guard(): + res = fluid.layers.fill_constant([1], 'float32', np.nan) + self.assertTrue(np.isnan(res.numpy().item(0))) + + def test_inf(self): + with fluid.dygraph.guard(): + res = fluid.layers.fill_constant([1], 'float32', np.inf) + self.assertTrue(np.isinf(res.numpy().item(0))) + class TestFillConstantOpError(unittest.TestCase): def test_errors(self): From 80c5d23ad056213286bdf8c953b481bb3ec54f15 Mon Sep 17 00:00:00 2001 From: LoveAn Date: Tue, 20 Oct 2020 18:34:13 +0800 Subject: [PATCH 009/185] Fix the name error and exit caused by judgment failed, test=document_fix (#28118) --- tools/windows/build_compile_environment.bat | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/tools/windows/build_compile_environment.bat b/tools/windows/build_compile_environment.bat index 889ea005259cc..736a19ddf52f4 100644 --- a/tools/windows/build_compile_environment.bat +++ b/tools/windows/build_compile_environment.bat @@ -30,6 +30,7 @@ :: Echo command is not required. @echo off +cd /d %~dp0% :: ===== start step 0: wget tool ===== :: Download wget for windows when there is not wget tool. 
@@ -145,7 +146,7 @@ echo Install Visual Studio 2015 ... :: /norestart [no restart] :: /NoRefresh [no refresh] :: /InstallSelectableItems NativeLanguageSupport_Group [select Visual C++ for installing] -start /wait visual_installer.exe /passive /norestart /NoRefresh /InstallSelectableItems NativeLanguageSupport_Group +start /wait vs_installer.exe /passive /norestart /NoRefresh /InstallSelectableItems NativeLanguageSupport_Group if %errorlevel% == 0 ( echo Install Visual Studio 2015 success! ) else ( @@ -158,7 +159,7 @@ goto :eof :: ===== start step 5: CUDA 10 ===== :cuda10 echo ">>>>>>>> step [5/7]: CUDA 10.2" -nvcc --version | findstr /C:"10.2" > nul 2> nul || call :install_cuda +cmd /C nvcc --version 2> nul | findstr /C:"10.2" > nul 2> nul || call :install_cuda goto java-jre :install_cuda @@ -178,9 +179,9 @@ del cuda_installer.exe echo Download cudnn from "https://paddle-ci.gz.bcebos.com/window_requirement/cudnn-10.2-windows10-x64-v7.6.5.32.zip" wget -O cudnn-10.2-windows10-x64-v7.6.5.32.zip "https://paddle-ci.gz.bcebos.com/window_requirement/cudnn-10.2-windows10-x64-v7.6.5.32.zip" tar xf cudnn-10.2-windows10-x64-v7.6.5.32.zip -xcopy "cuda\bin\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin" -xcopy "cuda\include\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\include" -xcopy "cuda\lib\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\lib" +xcopy /E /Y /R "cuda\bin\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\bin" +xcopy /E /Y /R "cuda\include\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\include" +xcopy /E /Y /R "cuda\lib\*" "C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.2\lib" rd /s /q cuda del cudnn-10.2-windows10-x64-v7.6.5.32.zip goto :eof @@ -189,7 +190,7 @@ goto :eof :: ===== start step 6: java jre ===== :java-jre echo ">>>>>>>> step [6/7]: java jre" -java > nul 2> nul || call :install_java +cmd /C java -version > nul 2> nul || call :install_java goto xly-agent :install_java @@ -212,5 +213,6 @@ goto :eof :xly-agent echo ">>>>>>>> step [7/7]: xly agent" wget -O agent.jar "https://paddle-ci.gz.bcebos.com/window_requirement/agent.jar" -goto :eof -:: ===== end step 8: xly agent ===== \ No newline at end of file +:: ===== end step 8: xly agent ===== + +pause From afe68cb9928ca02568c573c30a702cb1caf3e7ac Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Tue, 20 Oct 2020 18:57:29 +0800 Subject: [PATCH 010/185] unset proxy for block file in bos (#28119) --- tools/check_file_diff_approvals.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 8b0be9d8a6a66..66b0bf67d7097 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -304,8 +304,8 @@ fi # Get the list of PR authors with unresolved unit test issues pip install PyGithub # For getting PR related data -wget https://sys-p0.bj.bcebos.com/blk/block.txt --no-check-certificate -wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate +wget https://sys-p0.bj.bcebos.com/blk/block.txt --no-check-certificate --no-proxy +wget https://sys-p0.bj.bcebos.com/bk-ci/bk.txt --no-check-certificate --no-proxy HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true` if [ "${HASUTFIXED}" != "" ]; then echo_line="${HASUTFIXED} You must have one RD (chalsliu (Recommend) or kolinwei) approval.\n" From d1e1f174829fca3f98aad7a22718ac84788a7565 Mon Sep 17 00:00:00 2001 From: 
wangguanzhong
Date: Tue, 20 Oct 2020 19:01:00 +0800
Subject: [PATCH 011/185] fix generate_proposal_labels in cascade-rcnn series
 model, test=develop (#27892)

* fix generate_proposal_labels in cascade-rcnn series model, test=develop

* fix example code & unittest, test=develop

* update code from review comments, test=develop
---
 paddle/fluid/operators/detection/bbox_util.h  |  15 ++
 .../detection/generate_proposal_labels_op.cc  | 176 +++++++++++++-----
 python/paddle/fluid/layers/detection.py       |  43 +++--
 python/paddle/fluid/tests/test_detection.py   |  94 +++++-----
 .../test_generate_proposal_labels_op.py       | 122 +++++++-----
 5 files changed, 303 insertions(+), 147 deletions(-)

diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h
index afc39c1db9fba..6c9fea1fd4419 100644
--- a/paddle/fluid/operators/detection/bbox_util.h
+++ b/paddle/fluid/operators/detection/bbox_util.h
@@ -149,5 +149,20 @@ void ClipTiledBoxes(const platform::DeviceContext& ctx,
   }
 }
 
+// Calculate max IoU between each box and ground-truth and
+// each row represents one box
+template <typename T>
+void MaxIoU(const framework::Tensor& iou, framework::Tensor* max_iou) {
+  const T* iou_data = iou.data<T>();
+  int row = iou.dims()[0];
+  int col = iou.dims()[1];
+  T* max_iou_data = max_iou->data<T>();
+  for (int i = 0; i < row; ++i) {
+    const T* v = iou_data + i * col;
+    T max_v = *std::max_element(v, v + col);
+    max_iou_data[i] = max_v;
+  }
+}
+
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index 884aa1f6f4e99..0b8fcbb74277d 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -33,6 +33,28 @@ void AppendRois(LoDTensor* out, int64_t offset, Tensor* to_add) {
   memcpy(out_data + offset, to_add_data, to_add->numel() * sizeof(T));
 }
 
+// Filter the ground-truth in RoIs and the RoIs with non-positive area.
+// The ground-truth has max overlap with itself so the max_overlap is 1
+// and the corresponding RoI will be removed.
+template <typename T>
+void FilterRoIs(const platform::DeviceContext& ctx, const Tensor& rpn_rois,
+                const Tensor& max_overlap, Tensor* keep) {
+  const T* rpn_rois_dt = rpn_rois.data<T>();
+  const T* max_overlap_dt = max_overlap.data<T>();
+  int rois_num = max_overlap.numel();
+  keep->Resize({rois_num});
+  int* keep_data = keep->mutable_data<int>(ctx.GetPlace());
+  int keep_len = 0;
+  for (int i = 0; i < rois_num; ++i) {
+    if ((rpn_rois_dt[i * 4 + 2] - rpn_rois_dt[i * 4 + 0] + 1) > 0 &&
+        (rpn_rois_dt[i * 4 + 3] - rpn_rois_dt[i * 4 + 1] + 1) > 0 &&
+        max_overlap_dt[i] < 1.)
{
+      keep_data[keep_len++] = i;
+    }
+  }
+  keep->Resize({keep_len});
+}
+
 class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
@@ -98,12 +120,21 @@ class GenerateProposalLabelsOp : public framework::OperatorWithKernel {
                           im_info_dims.size(), im_info_dims));
 
     int class_nums = ctx->Attrs().Get<int>("class_nums");
+    bool is_cascade_rcnn = ctx->Attrs().Get<bool>("is_cascade_rcnn");
+    if (is_cascade_rcnn) {
+      PADDLE_ENFORCE_EQ(
+          ctx->HasInput("MaxOverlap"), true,
+          platform::errors::NotFound(
+              "Input(MaxOverlap) of GenerateProposalLabelsOp "
+              "should not be null when is_cascade_rcnn is True."));
+    }
 
     ctx->SetOutputDim("Rois", {-1, 4});
     ctx->SetOutputDim("LabelsInt32", {-1, 1});
     ctx->SetOutputDim("BboxTargets", {-1, 4 * class_nums});
     ctx->SetOutputDim("BboxInsideWeights", {-1, 4 * class_nums});
     ctx->SetOutputDim("BboxOutsideWeights", {-1, 4 * class_nums});
+    ctx->SetOutputDim("MaxOverlapWithGT", {-1});
   }
 
  protected:
@@ -142,7 +173,6 @@ std::vector<std::vector<int>> SampleFgBgGt(
   int64_t row = iou->dims()[0];
   int64_t col = iou->dims()[1];
   float epsilon = 0.00001;
-  const T* rpn_rois_dt = rpn_rois.data<T>();
   // Follow the Faster RCNN's implementation
   for (int64_t i = 0; i < row; ++i) {
     const T* v = proposal_to_gt_overlaps + i * col;
@@ -151,11 +181,6 @@ std::vector<std::vector<int>> SampleFgBgGt(
     if ((i < gt_num) && (crowd_data[i])) {
       max_overlap = -1.0;
     }
-    if (is_cascade_rcnn &&
-        ((rpn_rois_dt[i * 4 + 2] - rpn_rois_dt[i * 4 + 0] + 1) <= 0 ||
-         (rpn_rois_dt[i * 4 + 3] - rpn_rois_dt[i * 4 + 1] + 1) <= 0)) {
-      continue;
-    }
     if (max_overlap >= fg_thresh) {
       // fg mapped gt label index
      for (int64_t j = 0; j < col; ++j) {
@@ -232,12 +257,13 @@ std::vector<std::vector<int>> SampleFgBgGt(
 
 template <typename T>
 void GatherBoxesLabels(const platform::CPUDeviceContext& context,
-                       const Tensor& boxes, const Tensor& gt_boxes,
-                       const Tensor& gt_classes,
+                       const Tensor& boxes, const Tensor& max_overlap,
+                       const Tensor& gt_boxes, const Tensor& gt_classes,
                        const std::vector<int>& fg_inds,
                        const std::vector<int>& bg_inds,
                        const std::vector<int>& gt_inds, Tensor* sampled_boxes,
-                       Tensor* sampled_labels, Tensor* sampled_gts) {
+                       Tensor* sampled_labels, Tensor* sampled_gts,
+                       Tensor* sampled_max_overlap) {
   int fg_num = fg_inds.size();
   int bg_num = bg_inds.size();
   Tensor fg_inds_t, bg_inds_t, gt_box_inds_t, gt_label_inds_t;
@@ -264,6 +290,13 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context,
   bg_labels.mutable_data<int>({bg_num}, context.GetPlace());
   math::set_constant(context, &bg_labels, 0);
   Concat<int>(context, fg_labels, bg_labels, sampled_labels);
+
+  Tensor fg_max_overlap, bg_max_overlap;
+  fg_max_overlap.mutable_data<T>({fg_num}, context.GetPlace());
+  CPUGather<T>(context, max_overlap, fg_inds_t, &fg_max_overlap);
+  bg_max_overlap.mutable_data<T>({bg_num}, context.GetPlace());
+  CPUGather<T>(context, max_overlap, bg_inds_t, &bg_max_overlap);
+  Concat<T>(context, fg_max_overlap, bg_max_overlap, sampled_max_overlap);
 }
 
 template <typename T>
@@ -274,43 +307,58 @@ std::vector<Tensor> SampleRoisForOneImage(
     const float fg_thresh, const float bg_thresh_hi, const float bg_thresh_lo,
     const std::vector<float>& bbox_reg_weights, const int class_nums,
     std::minstd_rand engine, bool use_random, bool is_cascade_rcnn,
-    bool is_cls_agnostic) {
+    bool is_cls_agnostic, const Tensor& max_overlap) {
   // 1.1 map to original image
   auto im_scale = im_info.data<T>()[2];
-
   Tensor rpn_rois;
   rpn_rois.mutable_data<T>(rpn_rois_in.dims(), context.GetPlace());
   const T* rpn_rois_in_dt = rpn_rois_in.data<T>();
   T* rpn_rois_dt = rpn_rois.data<T>();
-  int gt_num = gt_boxes.dims()[0] * 4;
+
   for (int i = 
0; i < rpn_rois.numel(); ++i) { - if (i < gt_num && is_cascade_rcnn) { - rpn_rois_dt[i] = rpn_rois_in_dt[i]; + rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale; + } + + int proposals_num = 1; + + if (is_cascade_rcnn) { + Tensor keep; + FilterRoIs(context, rpn_rois, max_overlap, &keep); + Tensor roi_filter; + // Tensor box_filter; + if (keep.numel() == 0) { + math::SetConstant set_zero; + roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); + set_zero(context, &roi_filter, static_cast(0)); } else { - rpn_rois_dt[i] = rpn_rois_in_dt[i] / im_scale; + proposals_num = keep.numel(); + roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); + CPUGather(context, rpn_rois, keep, &roi_filter); } + T* roi_filter_dt = roi_filter.data(); + memcpy(rpn_rois_dt, roi_filter_dt, roi_filter.numel() * sizeof(T)); + rpn_rois.Resize(roi_filter.dims()); + } else { + proposals_num = rpn_rois.dims()[0]; } - // 1.2 compute overlaps - int proposals_num = rpn_rois.dims()[0]; - if (!is_cascade_rcnn) { - proposals_num += gt_boxes.dims()[0]; - } + proposals_num += gt_boxes.dims()[0]; + Tensor proposal_to_gt_overlaps; proposal_to_gt_overlaps.mutable_data({proposals_num, gt_boxes.dims()[0]}, context.GetPlace()); Tensor boxes; boxes.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); - if (!is_cascade_rcnn) { - Concat(context, gt_boxes, rpn_rois, &boxes); - } else { - T* boxes_dt = boxes.data(); - for (int i = 0; i < boxes.numel(); ++i) { - boxes_dt[i] = rpn_rois_dt[i]; - } - } + Concat(context, gt_boxes, rpn_rois, &boxes); BboxOverlaps(boxes, gt_boxes, &proposal_to_gt_overlaps); + + Tensor proposal_with_max_overlap; + proposal_with_max_overlap.mutable_data({proposals_num}, + context.GetPlace()); + + MaxIoU(proposal_to_gt_overlaps, &proposal_with_max_overlap); + // Generate proposal index std::vector> fg_bg_gt = SampleFgBgGt(context, &proposal_to_gt_overlaps, is_crowd, @@ -321,7 +369,7 @@ std::vector SampleRoisForOneImage( std::vector mapped_gt_inds = fg_bg_gt[2]; // mapped_gt_labels // Gather boxes and labels - Tensor sampled_boxes, sampled_labels, sampled_gts; + Tensor sampled_boxes, sampled_labels, sampled_gts, sampled_max_overlap; int fg_num = fg_inds.size(); int bg_num = bg_inds.size(); int boxes_num = fg_num + bg_num; @@ -329,9 +377,11 @@ std::vector SampleRoisForOneImage( sampled_boxes.mutable_data(bbox_dim, context.GetPlace()); sampled_labels.mutable_data({boxes_num}, context.GetPlace()); sampled_gts.mutable_data({fg_num, kBoxDim}, context.GetPlace()); - GatherBoxesLabels(context, boxes, gt_boxes, gt_classes, fg_inds, bg_inds, - mapped_gt_inds, &sampled_boxes, &sampled_labels, - &sampled_gts); + sampled_max_overlap.mutable_data({boxes_num}, context.GetPlace()); + GatherBoxesLabels(context, boxes, proposal_with_max_overlap, gt_boxes, + gt_classes, fg_inds, bg_inds, mapped_gt_inds, + &sampled_boxes, &sampled_labels, &sampled_gts, + &sampled_max_overlap); // Compute targets Tensor bbox_targets_single; @@ -390,6 +440,7 @@ std::vector SampleRoisForOneImage( res.emplace_back(bbox_targets); res.emplace_back(bbox_inside_weights); res.emplace_back(bbox_outside_weights); + res.emplace_back(sampled_max_overlap); return res; } @@ -409,6 +460,7 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { auto* bbox_inside_weights = context.Output("BboxInsideWeights"); auto* bbox_outside_weights = context.Output("BboxOutsideWeights"); + auto* max_overlap_with_gt = context.Output("MaxOverlapWithGT"); int batch_size_per_im = context.Attr("batch_size_per_im"); float fg_fraction = 
context.Attr("fg_fraction"); @@ -446,16 +498,21 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { "received level of LoD is [%d], LoD is [%s].", gt_boxes->lod().size(), gt_boxes->lod())); int64_t n = static_cast(rpn_rois->lod().back().size() - 1); - - rois->mutable_data({n * batch_size_per_im, kBoxDim}, context.GetPlace()); - labels_int32->mutable_data({n * batch_size_per_im, 1}, - context.GetPlace()); - bbox_targets->mutable_data({n * batch_size_per_im, kBoxDim * class_nums}, + int64_t rois_num = rpn_rois->dims()[0]; + int64_t gts_num = gt_boxes->dims()[0]; + int64_t init_num = + is_cascade_rcnn ? rois_num + gts_num : n * batch_size_per_im; + + rois->mutable_data({init_num, kBoxDim}, context.GetPlace()); + labels_int32->mutable_data({init_num, 1}, context.GetPlace()); + bbox_targets->mutable_data({init_num, kBoxDim * class_nums}, context.GetPlace()); - bbox_inside_weights->mutable_data( - {n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); - bbox_outside_weights->mutable_data( - {n * batch_size_per_im, kBoxDim * class_nums}, context.GetPlace()); + bbox_inside_weights->mutable_data({init_num, kBoxDim * class_nums}, + context.GetPlace()); + bbox_outside_weights->mutable_data({init_num, kBoxDim * class_nums}, + context.GetPlace()); + max_overlap_with_gt->Resize({init_num}); + max_overlap_with_gt->mutable_data(context.GetPlace()); std::random_device rnd; std::minstd_rand engine; @@ -486,25 +543,36 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { Tensor gt_boxes_slice = gt_boxes->Slice(gt_boxes_lod[i], gt_boxes_lod[i + 1]); Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor max_overlap_slice; + if (is_cascade_rcnn) { + auto* max_overlap = context.Input("MaxOverlap"); + max_overlap_slice = + max_overlap->Slice(rpn_rois_lod[i], rpn_rois_lod[i + 1]); + } else { + max_overlap_slice.mutable_data({rpn_rois_slice.dims()[0]}, + context.GetPlace()); + } std::vector tensor_output = SampleRoisForOneImage( dev_ctx, rpn_rois_slice, gt_classes_slice, is_crowd_slice, gt_boxes_slice, im_info_slice, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, - engine, use_random, is_cascade_rcnn, is_cls_agnostic); + engine, use_random, is_cascade_rcnn, is_cls_agnostic, + max_overlap_slice); Tensor sampled_rois = tensor_output[0]; Tensor sampled_labels_int32 = tensor_output[1]; Tensor sampled_bbox_targets = tensor_output[2]; Tensor sampled_bbox_inside_weights = tensor_output[3]; Tensor sampled_bbox_outside_weights = tensor_output[4]; + Tensor sampled_max_overlap = tensor_output[5]; AppendRois(rois, kBoxDim * num_rois, &sampled_rois); AppendRois(labels_int32, num_rois, &sampled_labels_int32); - AppendRois(bbox_targets, kBoxDim * num_rois * class_nums, - &sampled_bbox_targets); - AppendRois(bbox_inside_weights, kBoxDim * num_rois * class_nums, - &sampled_bbox_inside_weights); - AppendRois(bbox_outside_weights, kBoxDim * num_rois * class_nums, + int64_t offset = kBoxDim * num_rois * class_nums; + AppendRois(bbox_targets, offset, &sampled_bbox_targets); + AppendRois(bbox_inside_weights, offset, &sampled_bbox_inside_weights); + AppendRois(bbox_outside_weights, offset, &sampled_bbox_outside_weights); + AppendRois(max_overlap_with_gt, num_rois, &sampled_max_overlap); num_rois += sampled_rois.dims()[0]; lod0.emplace_back(num_rois); @@ -521,6 +589,8 @@ class GenerateProposalLabelsKernel : public framework::OpKernel { bbox_targets->Resize({num_rois, kBoxDim * class_nums}); bbox_inside_weights->Resize({num_rois, kBoxDim 
* class_nums}); bbox_outside_weights->Resize({num_rois, kBoxDim * class_nums}); + max_overlap_with_gt->Resize({num_rois}); + max_overlap_with_gt->set_lod(lod); } }; @@ -550,6 +620,12 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "(Tensor), This input is a 2D Tensor with shape [B, 3]. " "B is the number of input images, " "each element consists of im_height, im_width, im_scale."); + AddInput("MaxOverlap", + "(LoDTensor), This input is a 1D LoDTensor with shape [N]." + "N is the number of Input(RpnRois), " + "each element is the maximum overlap between " + "the proposal RoI and ground-truth.") + .AsDispensable(); AddOutput( "Rois", @@ -573,6 +649,12 @@ class GenerateProposalLabelsOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor), This output is a 2D LoDTensor with shape [P, 4 * " "class_nums], " "each element indicates whether a box should contribute to loss."); + AddOutput("MaxOverlapWithGT", + "(LoDTensor), This output is a 1D LoDTensor with shape [P], " + "each element indicates the maxoverlap " + "between output RoIs and ground-truth. " + "The output RoIs may include ground-truth " + "and the output maxoverlap may contain 1."); AddAttr("batch_size_per_im", "Batch size of rois per images."); AddAttr("fg_fraction", diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 951817db015d5..f7e79f79f8bfd 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -2601,7 +2601,9 @@ def generate_proposal_labels(rpn_rois, class_nums=None, use_random=True, is_cls_agnostic=False, - is_cascade_rcnn=False): + is_cascade_rcnn=False, + max_overlap=None, + return_max_overlap=False): """ **Generate Proposal Labels of Faster-RCNN** @@ -2638,25 +2640,29 @@ def generate_proposal_labels(rpn_rois, use_random(bool): Use random sampling to choose foreground and background boxes. is_cls_agnostic(bool): bbox regression use class agnostic simply which only represent fg and bg boxes. is_cascade_rcnn(bool): it will filter some bbox crossing the image's boundary when setting True. + max_overlap(Variable): Maximum overlap between each proposal box and ground-truth. + return_max_overlap(bool): Whether return the maximum overlap between each sampled RoI and ground-truth. Returns: tuple: - A tuple with format``(rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights)``. + A tuple with format``(rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, max_overlap)``. - **rois**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4]``. The data type is the same as ``rpn_rois``. - **labels_int32**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 1]``. The data type must be int32. - **bbox_targets**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4 * class_num]``. The regression targets of all RoIs. The data type is the same as ``rpn_rois``. - **bbox_inside_weights**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4 * class_num]``. The weights of foreground boxes' regression loss. The data type is the same as ``rpn_rois``. - **bbox_outside_weights**: 2-D LoDTensor with shape ``[batch_size_per_im * batch_size, 4 * class_num]``. The weights of regression loss. The data type is the same as ``rpn_rois``. - + - **max_overlap**: 1-D LoDTensor with shape ``[P]``. P is the number of output ``rois``. The maximum overlap between each sampled RoI and ground-truth. Examples: .. 
code-block:: python + import paddle import paddle.fluid as fluid + paddle.enable_static() rpn_rois = fluid.data(name='rpn_rois', shape=[None, 4], dtype='float32') - gt_classes = fluid.data(name='gt_classes', shape=[None, 1], dtype='float32') - is_crowd = fluid.data(name='is_crowd', shape=[None, 1], dtype='float32') + gt_classes = fluid.data(name='gt_classes', shape=[None, 1], dtype='int32') + is_crowd = fluid.data(name='is_crowd', shape=[None, 1], dtype='int32') gt_boxes = fluid.data(name='gt_boxes', shape=[None, 4], dtype='float32') im_info = fluid.data(name='im_info', shape=[None, 3], dtype='float32') rois, labels, bbox, inside_weights, outside_weights = fluid.layers.generate_proposal_labels( @@ -2673,6 +2679,8 @@ def generate_proposal_labels(rpn_rois, 'generate_proposal_labels') check_variable_and_dtype(is_crowd, 'is_crowd', ['int32'], 'generate_proposal_labels') + if is_cascade_rcnn: + assert max_overlap is not None, "Input max_overlap of generate_proposal_labels should not be None if is_cascade_rcnn is True" rois = helper.create_variable_for_type_inference(dtype=rpn_rois.dtype) labels_int32 = helper.create_variable_for_type_inference( @@ -2683,22 +2691,28 @@ def generate_proposal_labels(rpn_rois, dtype=rpn_rois.dtype) bbox_outside_weights = helper.create_variable_for_type_inference( dtype=rpn_rois.dtype) + max_overlap_with_gt = helper.create_variable_for_type_inference( + dtype=rpn_rois.dtype) + inputs = { + 'RpnRois': rpn_rois, + 'GtClasses': gt_classes, + 'IsCrowd': is_crowd, + 'GtBoxes': gt_boxes, + 'ImInfo': im_info, + } + if max_overlap is not None: + inputs['MaxOverlap'] = max_overlap helper.append_op( type="generate_proposal_labels", - inputs={ - 'RpnRois': rpn_rois, - 'GtClasses': gt_classes, - 'IsCrowd': is_crowd, - 'GtBoxes': gt_boxes, - 'ImInfo': im_info - }, + inputs=inputs, outputs={ 'Rois': rois, 'LabelsInt32': labels_int32, 'BboxTargets': bbox_targets, 'BboxInsideWeights': bbox_inside_weights, - 'BboxOutsideWeights': bbox_outside_weights + 'BboxOutsideWeights': bbox_outside_weights, + 'MaxOverlapWithGT': max_overlap_with_gt }, attrs={ 'batch_size_per_im': batch_size_per_im, @@ -2718,7 +2732,10 @@ def generate_proposal_labels(rpn_rois, bbox_targets.stop_gradient = True bbox_inside_weights.stop_gradient = True bbox_outside_weights.stop_gradient = True + max_overlap_with_gt.stop_gradient = True + if return_max_overlap: + return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, max_overlap_with_gt return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 05b9067ec400f..9348b0b50a1c0 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -289,40 +289,39 @@ def test_anchor_generator(self): class TestGenerateProposalLabels(unittest.TestCase): + def check_out(self, outs): + rois = outs[0] + labels_int32 = outs[1] + bbox_targets = outs[2] + bbox_inside_weights = outs[3] + bbox_outside_weights = outs[4] + assert rois.shape[1] == 4 + assert rois.shape[0] == labels_int32.shape[0] + assert rois.shape[0] == bbox_targets.shape[0] + assert rois.shape[0] == bbox_inside_weights.shape[0] + assert rois.shape[0] == bbox_outside_weights.shape[0] + assert bbox_targets.shape[1] == 4 * self.class_nums + assert bbox_inside_weights.shape[1] == 4 * self.class_nums + assert bbox_outside_weights.shape[1] == 4 * self.class_nums + if len(outs) == 6: + max_overlap_with_gt = outs[5] + assert 
max_overlap_with_gt.shape[0] == rois.shape[0] + def test_generate_proposal_labels(self): program = Program() with program_guard(program): - rpn_rois = layers.data( - name='rpn_rois', - shape=[4, 4], - dtype='float32', - lod_level=1, - append_batch_size=False) - gt_classes = layers.data( - name='gt_classes', - shape=[6], - dtype='int32', - lod_level=1, - append_batch_size=False) - is_crowd = layers.data( - name='is_crowd', - shape=[6], - dtype='int32', - lod_level=1, - append_batch_size=False) - gt_boxes = layers.data( - name='gt_boxes', - shape=[6, 4], - dtype='float32', - lod_level=1, - append_batch_size=False) - im_info = layers.data( - name='im_info', - shape=[1, 3], - dtype='float32', - lod_level=1, - append_batch_size=False) - class_nums = 5 + rpn_rois = fluid.data( + name='rpn_rois', shape=[4, 4], dtype='float32', lod_level=1) + gt_classes = fluid.data( + name='gt_classes', shape=[6], dtype='int32', lod_level=1) + is_crowd = fluid.data( + name='is_crowd', shape=[6], dtype='int32', lod_level=1) + gt_boxes = fluid.data( + name='gt_boxes', shape=[6, 4], dtype='float32', lod_level=1) + im_info = fluid.data(name='im_info', shape=[1, 3], dtype='float32') + max_overlap = fluid.data( + name='max_overlap', shape=[4], dtype='float32', lod_level=1) + self.class_nums = 5 outs = fluid.layers.generate_proposal_labels( rpn_rois=rpn_rois, gt_classes=gt_classes, @@ -335,20 +334,27 @@ def test_generate_proposal_labels(self): bg_thresh_hi=0.5, bg_thresh_lo=0.0, bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], - class_nums=class_nums) + class_nums=self.class_nums) + outs_1 = fluid.layers.generate_proposal_labels( + rpn_rois=rpn_rois, + gt_classes=gt_classes, + is_crowd=is_crowd, + gt_boxes=gt_boxes, + im_info=im_info, + batch_size_per_im=2, + fg_fraction=0.5, + fg_thresh=0.5, + bg_thresh_hi=0.5, + bg_thresh_lo=0.0, + bbox_reg_weights=[0.1, 0.1, 0.2, 0.2], + class_nums=self.class_nums, + is_cascade_rcnn=True, + max_overlap=max_overlap, + return_max_overlap=True) + + self.check_out(outs) + self.check_out(outs_1) rois = outs[0] - labels_int32 = outs[1] - bbox_targets = outs[2] - bbox_inside_weights = outs[3] - bbox_outside_weights = outs[4] - assert rois.shape[1] == 4 - assert rois.shape[0] == labels_int32.shape[0] - assert rois.shape[0] == bbox_targets.shape[0] - assert rois.shape[0] == bbox_inside_weights.shape[0] - assert rois.shape[0] == bbox_outside_weights.shape[0] - assert bbox_targets.shape[1] == 4 * class_nums - assert bbox_inside_weights.shape[1] == 4 * class_nums - assert bbox_outside_weights.shape[1] == 4 * class_nums class TestGenerateMaskLabels(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py index 5054256ca7247..2e9a5229e2ee0 100644 --- a/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py +++ b/python/paddle/fluid/tests/unittests/test_generate_proposal_labels_op.py @@ -22,66 +22,91 @@ from op_test import OpTest -def generate_proposal_labels_in_python( - rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, batch_size_per_im, - fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, - class_nums, use_random, is_cls_agnostic, is_cascade_rcnn): +def generate_proposal_labels_in_python(rpn_rois, + gt_classes, + is_crowd, + gt_boxes, + im_info, + batch_size_per_im, + fg_fraction, + fg_thresh, + bg_thresh_hi, + bg_thresh_lo, + bbox_reg_weights, + class_nums, + use_random, + is_cls_agnostic, + is_cascade_rcnn, + max_overlaps=None): rois = [] 
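     # These accumulators are filled once per image; 'lod' records how many
     # RoIs are sampled from each image so the LoD boundaries can be rebuilt.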
labels_int32 = [] bbox_targets = [] bbox_inside_weights = [] bbox_outside_weights = [] + max_overlap_with_gt = [] lod = [] assert len(rpn_rois) == len( im_info), 'batch size of rpn_rois and ground_truth is not matched' for im_i in range(len(im_info)): + max_overlap = max_overlaps[im_i] if is_cascade_rcnn else None frcn_blobs = _sample_rois( rpn_rois[im_i], gt_classes[im_i], is_crowd[im_i], gt_boxes[im_i], im_info[im_i], batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, - use_random, is_cls_agnostic, is_cascade_rcnn) + use_random, is_cls_agnostic, is_cascade_rcnn, max_overlap) lod.append(frcn_blobs['rois'].shape[0]) rois.append(frcn_blobs['rois']) labels_int32.append(frcn_blobs['labels_int32']) bbox_targets.append(frcn_blobs['bbox_targets']) bbox_inside_weights.append(frcn_blobs['bbox_inside_weights']) bbox_outside_weights.append(frcn_blobs['bbox_outside_weights']) + max_overlap_with_gt.append(frcn_blobs['max_overlap']) - return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, lod + return rois, labels_int32, bbox_targets, bbox_inside_weights, bbox_outside_weights, max_overlap_with_gt, lod + + +def filter_roi(rois, max_overlap): + ws = rois[:, 2] - rois[:, 0] + 1 + hs = rois[:, 3] - rois[:, 1] + 1 + keep = np.where((ws > 0) & (hs > 0) & (max_overlap < 1.0))[0] + if len(keep) > 0: + return rois[keep, :] + return np.zeros((1, 4)).astype('float32') def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, batch_size_per_im, fg_fraction, fg_thresh, bg_thresh_hi, bg_thresh_lo, bbox_reg_weights, class_nums, use_random, - is_cls_agnostic, is_cascade_rcnn): + is_cls_agnostic, is_cascade_rcnn, max_overlap): rois_per_image = int(batch_size_per_im) fg_rois_per_im = int(np.round(fg_fraction * rois_per_image)) # Roidb im_scale = im_info[2] inv_im_scale = 1. 
/ im_scale - if is_cascade_rcnn: - rpn_rois = rpn_rois[len(gt_boxes):, :] rpn_rois = rpn_rois * inv_im_scale + + if is_cascade_rcnn: + rpn_rois = filter_roi(rpn_rois, max_overlap) + boxes = np.vstack([gt_boxes, rpn_rois]) gt_overlaps = np.zeros((boxes.shape[0], class_nums)) box_to_gt_ind_map = np.zeros((boxes.shape[0]), dtype=np.int32) - if len(gt_boxes) > 0: - proposal_to_gt_overlaps = _bbox_overlaps(boxes, gt_boxes) - - overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1) - overlaps_max = proposal_to_gt_overlaps.max(axis=1) - # Boxes which with non-zero overlap with gt boxes - overlapped_boxes_ind = np.where(overlaps_max > 0)[0] - overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[ - overlapped_boxes_ind]] - gt_overlaps[overlapped_boxes_ind, - overlapped_boxes_gt_classes] = overlaps_max[ - overlapped_boxes_ind] - box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[ - overlapped_boxes_ind] + proposal_to_gt_overlaps = _bbox_overlaps(boxes, gt_boxes) + + overlaps_argmax = proposal_to_gt_overlaps.argmax(axis=1) + overlaps_max = proposal_to_gt_overlaps.max(axis=1) + # Boxes which with non-zero overlap with gt boxes + overlapped_boxes_ind = np.where(overlaps_max > 0)[0] + overlapped_boxes_gt_classes = gt_classes[overlaps_argmax[ + overlapped_boxes_ind]] + gt_overlaps[overlapped_boxes_ind, + overlapped_boxes_gt_classes] = overlaps_max[ + overlapped_boxes_ind] + box_to_gt_ind_map[overlapped_boxes_ind] = overlaps_argmax[ + overlapped_boxes_ind] crowd_ind = np.where(is_crowd)[0] gt_overlaps[crowd_ind] = -1.0 @@ -90,11 +115,6 @@ def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, if is_cascade_rcnn: # Cascade RCNN Decode Filter - ws = boxes[:, 2] - boxes[:, 0] + 1 - hs = boxes[:, 3] - boxes[:, 1] + 1 - keep = np.where((ws > 0) & (hs > 0))[0] - boxes = boxes[keep] - max_overlaps = max_overlaps[keep] fg_inds = np.where(max_overlaps >= fg_thresh)[0] bg_inds = np.where((max_overlaps < bg_thresh_hi) & (max_overlaps >= bg_thresh_lo))[0] @@ -125,6 +145,7 @@ def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, sampled_labels = max_classes[keep_inds] sampled_labels[fg_rois_per_this_image:] = 0 sampled_boxes = boxes[keep_inds] + sampled_max_overlap = max_overlaps[keep_inds] sampled_gts = gt_boxes[box_to_gt_ind_map[keep_inds]] sampled_gts[fg_rois_per_this_image:, :] = gt_boxes[0] bbox_label_targets = _compute_targets(sampled_boxes, sampled_gts, @@ -142,7 +163,8 @@ def _sample_rois(rpn_rois, gt_classes, is_crowd, gt_boxes, im_info, labels_int32=sampled_labels, bbox_targets=bbox_targets, bbox_inside_weights=bbox_inside_weights, - bbox_outside_weights=bbox_outside_weights) + bbox_outside_weights=bbox_outside_weights, + max_overlap=sampled_max_overlap) return frcn_blobs @@ -226,9 +248,9 @@ class TestGenerateProposalLabelsOp(OpTest): def set_data(self): #self.use_random = False self.init_use_random() - self.init_test_cascade() self.init_test_params() self.init_test_input() + self.init_test_cascade() self.init_test_output() self.inputs = { @@ -236,8 +258,12 @@ def set_data(self): 'GtClasses': (self.gt_classes[0], self.gts_lod), 'IsCrowd': (self.is_crowd[0], self.gts_lod), 'GtBoxes': (self.gt_boxes[0], self.gts_lod), - 'ImInfo': self.im_info + 'ImInfo': self.im_info, } + if self.max_overlaps is not None: + self.inputs['MaxOverlap'] = (self.max_overlaps[0], + self.rpn_rois_lod) + self.attrs = { 'batch_size_per_im': self.batch_size_per_im, 'fg_fraction': self.fg_fraction, @@ -256,6 +282,7 @@ def set_data(self): 'BboxTargets': (self.bbox_targets, [self.lod]), 
'BboxInsideWeights': (self.bbox_inside_weights, [self.lod]), 'BboxOutsideWeights': (self.bbox_outside_weights, [self.lod]), + 'MaxOverlapWithGT': (self.max_overlap_with_gt, [self.lod]), } def test_check_output(self): @@ -267,12 +294,13 @@ def setUp(self): def init_test_cascade(self, ): self.is_cascade_rcnn = False + self.max_overlaps = None def init_use_random(self): self.use_random = False def init_test_params(self): - self.batch_size_per_im = 512 + self.batch_size_per_im = 100 self.fg_fraction = 0.25 self.fg_thresh = 0.5 self.bg_thresh_hi = 0.5 @@ -284,7 +312,7 @@ def init_test_params(self): def init_test_input(self): np.random.seed(0) gt_nums = 6 # Keep same with batch_size_per_im for unittest - proposal_nums = 2000 if not self.is_cascade_rcnn else 512 #self.batch_size_per_im - gt_nums + proposal_nums = 200 images_shape = [[64, 64]] self.im_info = np.ones((len(images_shape), 3)).astype(np.float32) for i in range(len(images_shape)): @@ -301,24 +329,16 @@ def init_test_input(self): self.gt_boxes = [gt['boxes'] for gt in ground_truth] self.is_crowd = [gt['is_crowd'] for gt in ground_truth] - if self.is_cascade_rcnn: - rpn_rois_new = [] - for im_i in range(len(self.im_info)): - gt_boxes = self.gt_boxes[im_i] - rpn_rois = np.vstack( - [gt_boxes, self.rpn_rois[im_i][len(gt_boxes):, :]]) - rpn_rois_new.append(rpn_rois) - self.rpn_rois = rpn_rois_new - def init_test_output(self): self.rois, self.labels_int32, self.bbox_targets, \ self.bbox_inside_weights, self.bbox_outside_weights, \ + self.max_overlap_with_gt, \ self.lod = generate_proposal_labels_in_python( self.rpn_rois, self.gt_classes, self.is_crowd, self.gt_boxes, self.im_info, self.batch_size_per_im, self.fg_fraction, self.fg_thresh, self.bg_thresh_hi, self.bg_thresh_lo, self.bbox_reg_weights, self.class_nums, self.use_random, - self.is_cls_agnostic, self.is_cascade_rcnn + self.is_cls_agnostic, self.is_cascade_rcnn, self.max_overlaps ) self.rois = np.vstack(self.rois) self.labels_int32 = np.hstack(self.labels_int32) @@ -326,11 +346,18 @@ def init_test_output(self): self.bbox_targets = np.vstack(self.bbox_targets) self.bbox_inside_weights = np.vstack(self.bbox_inside_weights) self.bbox_outside_weights = np.vstack(self.bbox_outside_weights) + self.max_overlap_with_gt = np.vstack(self.max_overlap_with_gt) class TestCascade(TestGenerateProposalLabelsOp): def init_test_cascade(self): self.is_cascade_rcnn = True + roi_num = len(self.rpn_rois[0]) + self.max_overlaps = [] + max_overlap = np.random.rand(roi_num).astype('float32') + # Make GT samples with overlap = 1 + max_overlap[max_overlap > 0.9] = 1. 
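`TestCascade` above pins random overlaps greater than 0.9 to exactly 1.0 so that those RoIs are treated as ground-truth boxes; `filter_roi` (defined earlier in this test file) then drops them, together with degenerate boxes, before cascade sampling. A standalone NumPy sketch of that filtering rule, with made-up sample data:

.. code-block:: python

    import numpy as np

    def filter_roi(rois, max_overlap):
        # Keep RoIs with positive width/height whose best overlap with
        # ground truth is still below 1.0 (i.e. not a GT box itself).
        ws = rois[:, 2] - rois[:, 0] + 1
        hs = rois[:, 3] - rois[:, 1] + 1
        keep = np.where((ws > 0) & (hs > 0) & (max_overlap < 1.0))[0]
        if len(keep) > 0:
            return rois[keep, :]
        return np.zeros((1, 4)).astype('float32')

    rois = np.array([[0., 0., 9., 9.], [2., 2., 5., 5.]], dtype='float32')
    overlaps = np.array([1.0, 0.6], dtype='float32')
    print(filter_roi(rois, overlaps))  # only the second RoI survives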
+ self.max_overlaps.append(max_overlap) class TestUseRandom(TestGenerateProposalLabelsOp): @@ -389,6 +416,15 @@ def init_test_input(self): self.rpn_rois_lod = self.gts_lod +class TestOnlyGT2(TestCascade): + def init_test_cascade(self): + self.is_cascade_rcnn = True + roi_num = len(self.rpn_rois[0]) + self.max_overlaps = [] + max_overlap = np.ones(roi_num).astype('float32') + self.max_overlaps.append(max_overlap) + + def _generate_proposals(images_shape, proposal_nums): rpn_rois = [] rpn_rois_lod = [] From d87d286707fb674dae12a6993130d1741ecda024 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 20 Oct 2020 19:21:55 +0800 Subject: [PATCH 012/185] Add build paddle inference (#28131) * Add build paddle inference;test=document_fix * Add build paddle inference;test=document_fix --- paddle/scripts/paddle_build.sh | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 913af90de5ad8..e4ad4f80b3887 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1668,6 +1668,10 @@ function main() { tar_fluid_lib test_fluid_lib ;; + build_inference_lib) + cmake_gen ${PYTHON_ABI:-""} + gen_fluid_lib ${parallel_number} + ;; check_style) check_style ;; From cd372447b9a1e0785e7fce79694fcb0ad4746714 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Tue, 20 Oct 2020 22:44:49 +0800 Subject: [PATCH 013/185] disable test_dist_mnist_hallreduce, test=develop (#28129) --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index ba4f5ecf90323..2fa03f205ba78 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -427,6 +427,9 @@ if(WITH_DISTRIBUTE) list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transformer") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_transpiler") + # TODO(sandyhouse): fix and add the ut back + list(REMOVE_ITEM DIST_TEST_OPS "test_dist_mnist_hallreduce") + #not need list(REMOVE_ITEM DIST_TEST_OPS "test_dist_base") list(REMOVE_ITEM DIST_TEST_OPS "test_dist_fleet_base") From 446d184e11fa6336a44f18bf3f4e5c3fe306d8ce Mon Sep 17 00:00:00 2001 From: zhulei <563755780@qq.com> Date: Wed, 21 Oct 2020 10:02:47 +0800 Subject: [PATCH 014/185] Add new api: is_tensor (#28111) * Add new api: is_tensor * Add new api: is_tensor * Add new api: is_tensor * Add new api: is_tensor --- python/paddle/__init__.py | 1 + .../fluid/tests/unittests/test_is_tensor.py | 56 +++++++++++++++++++ python/paddle/tensor/__init__.py | 1 + python/paddle/tensor/logic.py | 35 ++++++++++++ 4 files changed, 93 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_is_tensor.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 21827166d1882..3640dd22bb0cd 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -108,6 +108,7 @@ from .tensor.logic import allclose #DEFINE_ALIAS from .tensor.logic import equal_all #DEFINE_ALIAS # from .tensor.logic import isnan #DEFINE_ALIAS +from .tensor.logic import is_tensor #DEFINE_ALIAS from .tensor.manipulation import cast #DEFINE_ALIAS from .tensor.manipulation import concat #DEFINE_ALIAS from .tensor.manipulation import expand #DEFINE_ALIAS diff --git a/python/paddle/fluid/tests/unittests/test_is_tensor.py b/python/paddle/fluid/tests/unittests/test_is_tensor.py new file mode 100644 index 0000000000000..97d6c60d631d3 --- 
/dev/null +++ b/python/paddle/fluid/tests/unittests/test_is_tensor.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle + +DELTA = 0.00001 + + +class TestIsTensorApi(unittest.TestCase): + def test_is_tensor_real(self, dtype="float32"): + """Test is_tensor api with a real tensor + """ + paddle.disable_static() + x = paddle.rand([3, 2, 4], dtype=dtype) + self.assertTrue(paddle.is_tensor(x)) + + def test_is_tensor_complex(self, dtype="float32"): + """Test is_tensor api with a complex tensor + """ + paddle.disable_static() + r = paddle.to_tensor(1) + i = paddle.to_tensor(2) + x = paddle.ComplexTensor(r, i) + self.assertTrue(paddle.is_tensor(x)) + + def test_is_tensor_list(self, dtype="float32"): + """Test is_tensor api with a list + """ + paddle.disable_static() + x = [1, 2, 3] + self.assertFalse(paddle.is_tensor(x)) + + def test_is_tensor_number(self, dtype="float32"): + """Test is_tensor api with a number + """ + paddle.disable_static() + x = 5 + self.assertFalse(paddle.is_tensor(x)) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 773e6ebc7af2e..958bfb304fb14 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -71,6 +71,7 @@ from .logic import allclose #DEFINE_ALIAS from .logic import equal_all #DEFINE_ALIAS # from .logic import isnan #DEFINE_ALIAS +from .logic import is_tensor #DEFINE_ALIAS from .manipulation import cast #DEFINE_ALIAS from .manipulation import concat #DEFINE_ALIAS from .manipulation import expand #DEFINE_ALIAS diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 1fc1c17d2edb2..27671a4f15747 100644 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -19,6 +19,8 @@ from .. import fluid from ..fluid.framework import in_dygraph_mode from paddle.common_ops_import import * +from ..framework import VarBase as Tensor +from ..framework import ComplexVariable as ComplexTensor # TODO: define logic functions of a tensor from ..fluid.layers import is_empty #DEFINE_ALIAS @@ -43,6 +45,7 @@ 'logical_xor', 'not_equal', 'allclose', + 'is_tensor' # 'isnan' ] @@ -372,3 +375,35 @@ def not_equal(x, y, name=None): """ out = fluid.layers.not_equal(x, y, name=name, cond=None) return out + + +def is_tensor(x): + """ + + This function tests whether input object is a paddle.Tensor or a paddle.ComplexTensor. + + Args: + x (object): Object to test. + + Returns: + A boolean value. True if 'x' is a paddle.Tensor or a paddle.ComplexTensor, otherwise False. + + Examples: + .. 
code-block:: python + + import paddle + + input1 = paddle.rand(shape=[2, 3, 5], dtype='float32') + check = paddle.is_tensor(input1) + print(check) #True + + input2 = paddle.ComplexTensor(input1, input1) + check = paddle.is_tensor(input2) + print(check) #True + + input3 = [1, 4] + check = paddle.is_tensor(input3) + print(check) #False + + """ + return isinstance(x, Tensor) or isinstance(x, ComplexTensor) From 5d7000215a05a100942332911b6750484915325a Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Wed, 21 Oct 2020 10:05:09 +0800 Subject: [PATCH 015/185] fix dynamic_loader more safe and error message on windows (#28117) --- CMakeLists.txt | 27 +++--- cmake/cuda.cmake | 5 +- cmake/cudnn.cmake | 11 ++- .../fluid/platform/dynload/dynamic_loader.cc | 92 ++++++++++++++----- paddle/fluid/platform/port.h | 6 +- 5 files changed, 96 insertions(+), 45 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 06d687fc9c4f3..1a8eef5e66b1c 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -193,10 +193,19 @@ if(WITH_BRPC_RDMA) endif() endif() -# lite subgraph compilation depends on CUDNN_ROOT, -# so include(cudnn) needs to be in front of include(third_party/lite) -include(cudnn) # set cudnn libraries, must before configure -include(third_party) # download, build, install third_party +if(WITH_GPU) + include(cuda) + # lite subgraph compilation depends on CUDNN_ROOT, + # so include(cudnn) needs to be in front of include(third_party/lite) + include(cudnn) # set cudnn libraries, must before configure + include(tensorrt) + # there is no official support of nccl, cupti in windows + if(NOT WIN32) + include(cupti) + endif() +endif() + +include(third_party) # download, build, install third_party, Contains about 20+ dependencies if(WITH_DISTRIBUTE) if(WITH_GRPC) @@ -209,18 +218,8 @@ if(WITH_DISTRIBUTE) endif() endif() -# there is no official support of nccl, cupti in windows -if(NOT WIN32) - include(cupti) -endif() - include(flags) # set paddle compile flags -if(WITH_GPU) - include(cuda) - include(tensorrt) -endif() - if(WITH_PROFILER) find_package(Gperftools REQUIRED) include_directories(${GPERFTOOLS_INCLUDE_DIR}) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index c78fe5f6c7fbd..146cbee1c6a88 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -198,7 +198,9 @@ elseif (${CMAKE_CUDA_COMPILER_VERSION} LESS 12.0) # CUDA 11.x set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -D__STRICT_ANSI__") endif() -add_definitions("-DPADDLE_CUDA_BINVER=\"${CUDA_VERSION_MAJOR}${CUDA_VERSION_MINOR}\"") +add_definitions("-DCUDA_VERSION_MAJOR=\"${CUDA_VERSION_MAJOR}\"") +add_definitions("-DCUDA_VERSION_MINOR=\"${CUDA_VERSION_MINOR}\"") +add_definitions("-DCUDA_TOOLKIT_ROOT_DIR=\"${CUDA_TOOLKIT_ROOT_DIR}\"") # setting nvcc arch flags select_nvcc_arch_flags(NVCC_FLAGS_EXTRA) @@ -249,3 +251,4 @@ endif() mark_as_advanced(CUDA_BUILD_CUBIN CUDA_BUILD_EMULATION CUDA_VERBOSE_BUILD) mark_as_advanced(CUDA_SDK_ROOT_DIR CUDA_SEPARABLE_COMPILATION) + diff --git a/cmake/cudnn.cmake b/cmake/cudnn.cmake index b68e1b4070c88..d8d8f634e76b6 100644 --- a/cmake/cudnn.cmake +++ b/cmake/cudnn.cmake @@ -35,17 +35,18 @@ list(APPEND CUDNN_CHECK_LIBRARY_DIRS ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 ) set(CUDNN_LIB_NAME "") + if (LINUX) -set(CUDNN_LIB_NAME "libcudnn.so") + set(CUDNN_LIB_NAME "libcudnn.so") endif(LINUX) if(WIN32) -# only support cudnn7 -set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") + # only support cudnn7 + set(CUDNN_LIB_NAME "cudnn.lib" "cudnn64_7.dll") endif(WIN32) if(APPLE) -set(CUDNN_LIB_NAME 
"libcudnn.dylib" "libcudnn.so") + set(CUDNN_LIB_NAME "libcudnn.dylib" "libcudnn.so") endif(APPLE) find_library(CUDNN_LIBRARY NAMES ${CUDNN_LIB_NAME} # libcudnn_static.a @@ -88,7 +89,7 @@ macro(find_cudnn_version cudnn_header_file) if(NOT CUDNN_MAJOR_VERSION) set(CUDNN_VERSION "???") else() - add_definitions("-DPADDLE_CUDNN_BINVER=\"${CUDNN_MAJOR_VERSION}\"") + add_definitions("-DCUDNN_MAJOR_VERSION=\"${CUDNN_MAJOR_VERSION}\"") math(EXPR CUDNN_VERSION "${CUDNN_MAJOR_VERSION} * 1000 + ${CUDNN_MINOR_VERSION} * 100 + ${CUDNN_PATCHLEVEL_VERSION}") diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index 0c8a64ccf6943..4d911d12e5520 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -57,17 +57,26 @@ struct PathNode { static constexpr char cupti_lib_path[] = CUPTI_LIB_PATH; -// NOTE: In order to adapt to the default installation path of cuda on linux -static constexpr char linux_cudnn_lib_path[] = "/usr/local/cuda/lib64"; +// NOTE: In order to adapt to the default installation path of cuda +#if defined(_WIN32) && defined(PADDLE_WITH_CUDA) +static constexpr char cuda_lib_path[] = CUDA_TOOLKIT_ROOT_DIR "/bin"; +#else +static constexpr char cuda_lib_path[] = "/usr/local/cuda/lib64"; +#endif static PathNode s_py_site_pkg_path; #if defined(_WIN32) && defined(PADDLE_WITH_CUDA) -static constexpr char* win_cublas_lib = "cublas64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_curand_lib = "curand64_" PADDLE_CUDA_BINVER ".dll"; -static constexpr char* win_cudnn_lib = "cudnn64_" PADDLE_CUDNN_BINVER ".dll"; +static constexpr char* win_cublas_lib = + "cublas64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cublas64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_curand_lib = + "curand64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;curand64_" CUDA_VERSION_MAJOR ".dll"; +static constexpr char* win_cudnn_lib = "cudnn64_" CUDNN_MAJOR_VERSION ".dll"; static constexpr char* win_cusolver_lib = - "cusolver64_" PADDLE_CUDA_BINVER ".dll"; + "cusolver64_" CUDA_VERSION_MAJOR CUDA_VERSION_MINOR + ".dll;cusolver64_" CUDA_VERSION_MAJOR ".dll"; #endif static inline std::string join(const std::string& part1, @@ -87,6 +96,24 @@ static inline std::string join(const std::string& part1, return ret; } +static inline std::vector split( + const std::string& str, const std::string separator = " ") { + std::vector str_list; + std::string::size_type firstPos; + firstPos = str.find_first_not_of(separator, 0); + std::string::size_type lastPos; + lastPos = str.find_first_of(separator, firstPos); + while (std::string::npos != firstPos && std::string::npos != lastPos) { + str_list.push_back(str.substr(firstPos, lastPos - firstPos)); + firstPos = str.find_first_not_of(separator, lastPos); + lastPos = str.find_first_of(separator, firstPos); + } + if (std::string::npos == lastPos) { + str_list.push_back(str.substr(firstPos, lastPos - firstPos)); + } + return str_list; +} + void SetPaddleLibPath(const std::string& py_site_pkg_path) { s_py_site_pkg_path.path = py_site_pkg_path; VLOG(3) << "Set paddle lib path : " << py_site_pkg_path; @@ -147,26 +174,31 @@ static inline void* GetDsoHandleFromSearchPath( #else int dynload_flags = 0; #endif // !_WIN32 - // 1. search in user config path by FLAGS - void* dso_handle = - GetDsoHandleFromSpecificPath(config_path, dso_name, dynload_flags); - // 2. 
search in system default path - if (nullptr == dso_handle) { - dso_handle = GetDsoHandleFromDefaultPath(dso_name, dynload_flags); - } - // 3. search in extra paths - if (nullptr == dso_handle) { - for (auto path : extra_paths) { - dso_handle = GetDsoHandleFromSpecificPath(path, dso_name, dynload_flags); + std::vector dso_names = split(dso_name, ";"); + void* dso_handle = nullptr; + for (auto dso : dso_names) { + // 1. search in user config path by FLAGS + dso_handle = GetDsoHandleFromSpecificPath(config_path, dso, dynload_flags); + // 2. search in extra paths + if (nullptr == dso_handle) { + for (auto path : extra_paths) { + VLOG(3) << "extra_paths: " << path; + dso_handle = GetDsoHandleFromSpecificPath(path, dso, dynload_flags); + } + } + // 3. search in system default path + if (nullptr == dso_handle) { + dso_handle = GetDsoHandleFromDefaultPath(dso, dynload_flags); } + if (nullptr != dso_handle) break; } - // 4. [If Failed] logging warning if exists + // 4. [If Failed for All dso_names] logging warning if exists if (nullptr == dso_handle && !warning_msg.empty()) { LOG(WARNING) << warning_msg; } - // 5. [If Failed] logging or throw error info + // 5. [If Failed for All dso_names] logging or throw error info if (nullptr == dso_handle) { auto error_msg = "The third-party dynamic library (%s) that Paddle depends on is not " @@ -203,7 +235,8 @@ void* GetCublasDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib); + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cublas_lib, true, + {cuda_lib_path}); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcublas.so"); #endif @@ -220,10 +253,19 @@ void* GetCUDNNDsoHandle() { return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.dylib", false, {}, mac_warn_meg); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib); + std::string win_warn_meg( + "Note: [Recommend] copy cudnn into CUDA installation directory. 
\n " + "For instance, download cudnn-10.0-windows10-x64-v7.6.5.32.zip from " + "NVIDIA's official website, \n" + "then, unzip it and copy it into C:\\Program Files\\NVIDIA GPU Computing " + "Toolkit\\CUDA/v10.0\n" + "You should do this according to your CUDA installation directory and " + "CUDNN version."); + return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, win_cudnn_lib, true, + {cuda_lib_path}, win_warn_meg); #else return GetDsoHandleFromSearchPath(FLAGS_cudnn_dir, "libcudnn.so", false, - {linux_cudnn_lib_path}); + {cuda_lib_path}); #endif } @@ -241,7 +283,8 @@ void* GetCurandDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib); + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_curand_lib, true, + {cuda_lib_path}); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so"); #endif @@ -251,7 +294,8 @@ void* GetCusolverDsoHandle() { #if defined(__APPLE__) || defined(__OSX__) return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.dylib"); #elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cusolver_lib); + return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, win_cusolver_lib, true, + {cuda_lib_path}); #else return GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcusolver.so"); #endif diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index c5e8ff807a2d3..b2f26ba9581e0 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -56,7 +56,11 @@ static void *dlopen(const char *filename, int flag) { std::string file_name(filename); HMODULE hModule = LoadLibrary(file_name.c_str()); if (!hModule) { - throw std::runtime_error(file_name + " not found."); + if (flag) { + throw std::runtime_error(file_name + " not found."); + } else { + return nullptr; + } } return reinterpret_cast(hModule); } From 085b9619f531237bf7fea0337b827500cdf03c6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E6=9D=8E=E7=81=BF?= <646741404@qq.com> Date: Wed, 21 Oct 2020 11:12:23 +0800 Subject: [PATCH 016/185] fix bugs test=develop (#28125) --- python/paddle/tensor/math.py | 2 +- python/paddle/tensor/random.py | 2 ++ python/paddle/tensor/search.py | 3 +++ 3 files changed, 6 insertions(+), 1 deletion(-) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 19ba7f1b38ce4..895d0c175905c 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1100,7 +1100,7 @@ def max(x, axis=None, keepdim=False, name=None): float64, int32, int64. axis(list|int, optional): The axis along which the maximum is computed. If :attr:`None`, compute the maximum over all elements of - `x` and return a Tensor variable with a single element, + `x` and return a Tensor variable with a single element, otherwise must be in the range :math:`[-x.ndim(x), x.ndim(x))`. If :math:`axis[i] < 0`, the axis to reduce is :math:`x.ndim + axis[i]`. keepdim(bool, optional): Whether to reserve the reduced dimension in the diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index eb9750bcc3957..3a0435e776eac 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -380,7 +380,9 @@ def uniform(shape, dtype=None, min=-1.0, max=1.0, seed=0, name=None): distribution in the range [``min``, ``max``), with ``shape`` and ``dtype``. 
Examples: + :: + Input: shape = [1, 2] Output: diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 7adf1b7cc4bd0..3b7906730247c 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -62,7 +62,9 @@ def argsort(x, axis=-1, descending=False, name=None): and with data type int64). Examples: + .. code-block:: python + import paddle paddle.disable_static() @@ -358,6 +360,7 @@ def nonzero(x, as_tuple=False): .. code-block:: python + import paddle x1 = paddle.to_tensor([[1.0, 0.0, 0.0], From 5289b72accfaf57e18ed47f475088daaf7d58409 Mon Sep 17 00:00:00 2001 From: Double_V Date: Wed, 21 Oct 2020 11:33:49 +0800 Subject: [PATCH 017/185] fix Wmaybe-uninitialized warning in pooling.cc, test=develop (#28126) --- paddle/fluid/operators/math/pooling.cc | 40 +++++++++++++------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/math/pooling.cc b/paddle/fluid/operators/math/pooling.cc index fec738378a64c..d43d34a1d7d7d 100644 --- a/paddle/fluid/operators/math/pooling.cc +++ b/paddle/fluid/operators/math/pooling.cc @@ -52,8 +52,8 @@ class Pool2dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); - int hstart, hend; - int wstart, wend; + int hstart = 0, hend = 1; + int wstart = 0, wend = 1; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { @@ -133,8 +133,8 @@ class Pool2dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); - int hstart, hend; - int wstart, wend; + int hstart = 0, hend = 1; + int wstart = 0, wend = 1; if (!channel_last) { const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -272,8 +272,8 @@ class Pool2dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int hstart, hend; - int wstart, wend; + int hstart = 0, hend = 1; + int wstart = 0, wend = 1; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int ph = 0; ph < output_height; ++ph) { @@ -359,8 +359,8 @@ class Pool2dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int hstart, hend; - int wstart, wend; + int hstart = 0, hend = 1; + int wstart = 0, wend = 1; if (!channel_last) { const int input_stride = input_height * input_width; const int output_stride = output_height * output_width; @@ -705,9 +705,9 @@ class Pool3dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); - int dstart, dend; - int hstart, hend; - int wstart, wend; + int dstart = 0, dend = 1; + int hstart = 0, hend = 1; + int wstart = 0, wend = 1; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { @@ -808,9 +808,9 @@ class Pool3dFunctor { const T* input_data = input.data(); T* output_data = output->mutable_data(context.GetPlace()); - int dstart, dend; - int hstart, hend; - int wstart, wend; + int dstart = 0, dend = 1; + int hstart = 0, hend = 1; + int wstart = 0, wend = 1; if (!channel_last) { const int input_stride = input_depth * input_height * input_width; const int output_stride = output_depth * output_height * output_width; @@ -998,9 +998,9 @@ class Pool3dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = 
input_grad->mutable_data(context.GetPlace()); - int dstart, dend; - int hstart, hend; - int wstart, wend; + int dstart = 0, dend = 1; + int hstart = 0, hend = 1; + int wstart = 0, wend = 1; for (int i = 0; i < batch_size; i++) { for (int c = 0; c < output_channels; ++c) { for (int pd = 0; pd < output_depth; ++pd) { @@ -1106,9 +1106,9 @@ class Pool3dGradFunctor { const T* output_grad_data = output_grad.data(); T* input_grad_data = input_grad->mutable_data(context.GetPlace()); - int dstart, dend; - int hstart, hend; - int wstart, wend; + int dstart = 0, dend = 1; + int hstart = 0, hend = 1; + int wstart = 0, wend = 1; if (!channel_last) { const int input_stride = input_depth * input_height * input_width; const int output_stride = output_depth * output_height * output_width; From 602d2ce5c9eac603dc74e04a5695aa1894f8decf Mon Sep 17 00:00:00 2001 From: Pei Yang Date: Wed, 21 Oct 2020 13:13:08 +0800 Subject: [PATCH 018/185] change avg pooling from trt plugin to trt layer (#28032) --- paddle/fluid/inference/tensorrt/convert/pool2d_op.cc | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index c031630f36d78..303130e74f512 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -88,6 +88,9 @@ class Pool2dOpConverter : public OpConverter { BOOST_GET_CONST(std::vector, op_desc.GetAttr("strides")); std::vector paddings = BOOST_GET_CONST(std::vector, op_desc.GetAttr("paddings")); + bool exclusive = op_desc.HasAttr("exclusive") + ? BOOST_GET_CONST(bool, op_desc.GetAttr("exclusive")) + : true; bool ceil_mode = BOOST_GET_CONST(bool, op_desc.GetAttr("ceil_mode")); bool adaptive = false; if (op_desc.HasAttr("adaptive")) @@ -166,7 +169,7 @@ class Pool2dOpConverter : public OpConverter { return; } - if (!adaptive && pool_type == "max") { + if (!adaptive) { // Under ceil mode, the pre_pad and post_pad are used to // record the the padding size. In some ceil mode cases, // we do not need padding, so we initialize the two vars to 0. 
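The pool2d converter change above reads Paddle's `exclusive` pooling attribute; the hunk that follows passes it to TensorRT via `setAverageCountExcludesPadding`. What the flag means for average pooling, shown on a single padded window (plain NumPy arithmetic with made-up values, independent of both libraries):

.. code-block:: python

    import numpy as np

    # A 2x2 kernel hanging over a zero-padded border: only two of the
    # four covered cells hold real values, the rest are padding.
    real_vals = np.array([1., 2.])
    kernel_area = 4

    inclusive_avg = real_vals.sum() / kernel_area     # padding counted: 0.75
    exclusive_avg = real_vals.sum() / real_vals.size  # padding excluded: 1.5
    print(inclusive_avg, exclusive_avg)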
@@ -194,6 +197,7 @@ class Pool2dOpConverter : public OpConverter { "trt pool layer in converter could not be created.")); pool_layer->setStride(nv_strides); pool_layer->setPadding(nv_paddings); + pool_layer->setAverageCountExcludesPadding(exclusive); layer = pool_layer; } else { // Average pooling needs to exclude the padding pixels from the average @@ -213,7 +217,6 @@ class Pool2dOpConverter : public OpConverter { "trt pool plugin layer in converter could not be created.")); layer = pool_layer; } - auto output_name = op_desc.Output("Out")[0]; RreplenishLayerAndOutput(layer, "pool2d", {output_name}, test_mode); } From 5cd97a1cb04049aa31fcd0d4d0a6825917520114 Mon Sep 17 00:00:00 2001 From: wangguanzhong Date: Wed, 21 Oct 2020 14:06:20 +0800 Subject: [PATCH 019/185] support multiclass nms for multi-batch, test=develop (#28154) --- .../operators/detection/multiclass_nms_op.cc | 10 +++- .../tests/unittests/test_multiclass_nms_op.py | 52 +++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 4b38779c136e4..0e835a62839b4 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -290,6 +290,7 @@ class MultiClassNMSKernel : public framework::OpKernel { } else { sdata = scores_data + label * predict_dim; } + for (size_t j = 0; j < indices.size(); ++j) { int idx = indices[j]; odata[count * out_dim] = label; // label @@ -333,6 +334,7 @@ class MultiClassNMSKernel : public framework::OpKernel { Tensor boxes_slice, scores_slice; int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1; for (int i = 0; i < n; ++i) { + std::map> indices; if (score_size == 3) { scores_slice = scores->Slice(i, i + 1); scores_slice.Resize({score_dims[1], score_dims[2]}); @@ -340,10 +342,14 @@ class MultiClassNMSKernel : public framework::OpKernel { boxes_slice.Resize({score_dims[2], box_dim}); } else { auto boxes_lod = boxes->lod().back(); + if (boxes_lod[i] == boxes_lod[i + 1]) { + all_indices.push_back(indices); + batch_starts.push_back(batch_starts.back()); + continue; + } scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]); boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]); } - std::map> indices; MultiClassNMS(ctx, scores_slice, boxes_slice, score_size, &indices, &num_nmsed_out); all_indices.push_back(indices); @@ -375,12 +381,14 @@ class MultiClassNMSKernel : public framework::OpKernel { } } else { auto boxes_lod = boxes->lod().back(); + if (boxes_lod[i] == boxes_lod[i + 1]) continue; scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]); boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]); if (return_index) { offset = boxes_lod[i] * score_dims[1]; } } + int64_t s = batch_starts[i]; int64_t e = batch_starts[i + 1]; if (e > s) { diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index ab58d4bc88ef9..34c19b88bcdba 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -17,6 +17,7 @@ import numpy as np import copy from op_test import OpTest +import paddle import paddle.fluid as fluid from paddle.fluid import Program, program_guard @@ -171,6 +172,9 @@ def lod_multiclass_nms(boxes, scores, background, score_threshold, lod = [] head = 0 for n in range(len(box_lod[0])): + if box_lod[0][n] == 0: + 
lod.append(0) + continue box = boxes[head:head + box_lod[0][n]] score = scores[head:head + box_lod[0][n]] offset = head @@ -357,6 +361,53 @@ def test_check_output(self): self.check_output() +class TestMulticlassNMSNoBox(TestMulticlassNMSLoDInput): + def setUp(self): + self.set_argument() + M = 1200 + C = 21 + BOX_SIZE = 4 + box_lod = [[0, 1200, 0]] + background = 0 + nms_threshold = 0.3 + nms_top_k = 400 + keep_top_k = 200 + score_threshold = self.score_threshold + normalized = False + + scores = np.random.random((M, C)).astype('float32') + + scores = np.apply_along_axis(softmax, 1, scores) + + boxes = np.random.random((M, C, BOX_SIZE)).astype('float32') + boxes[:, :, 0] = boxes[:, :, 0] * 10 + boxes[:, :, 1] = boxes[:, :, 1] * 10 + boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10 + boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10 + + det_outs, lod = lod_multiclass_nms( + boxes, scores, background, score_threshold, nms_threshold, + nms_top_k, keep_top_k, box_lod, normalized) + det_outs = np.array(det_outs).astype('float32') + nmsed_outs = det_outs[:, :-1].astype('float32') if len( + det_outs) else det_outs + self.op_type = 'multiclass_nms' + self.inputs = { + 'BBoxes': (boxes, box_lod), + 'Scores': (scores, box_lod), + } + self.outputs = {'Out': (nmsed_outs, [lod])} + self.attrs = { + 'background_label': 0, + 'nms_threshold': nms_threshold, + 'nms_top_k': nms_top_k, + 'keep_top_k': keep_top_k, + 'score_threshold': score_threshold, + 'nms_eta': 1.0, + 'normalized': normalized, + } + + class TestIOU(unittest.TestCase): def test_iou(self): box1 = np.array([4.0, 3.0, 7.0, 5.0]).astype('float32') @@ -521,4 +572,5 @@ def test_scores_Variable(): if __name__ == '__main__': + paddle.enable_static() unittest.main() From f29fb396df2f354cc677e2483a98f76cd2c6f4be Mon Sep 17 00:00:00 2001 From: danleifeng <52735331+danleifeng@users.noreply.github.com> Date: Wed, 21 Oct 2020 15:16:11 +0800 Subject: [PATCH 020/185] dygraph nccl init support host domain name (#28107) * nccl init support hostname and ip; test=develop --- paddle/fluid/imperative/nccl_context.cc | 14 +++++++++++++- paddle/fluid/imperative/nccl_context.h | 1 + paddle/fluid/imperative/tests/nccl_context_test.cc | 2 +- 3 files changed, 15 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index c8fd31fcbffe6..9ffec11354d8a 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -100,7 +100,19 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep, serv_addr.sin_family = AF_INET; serv_addr.sin_port = htons(port); - if (inet_pton(AF_INET, host.c_str(), &serv_addr.sin_addr) <= 0) { + char *ip = NULL; + struct hostent *hp; + if ((hp = gethostbyname(host.c_str())) == NULL) { + PADDLE_THROW(platform::errors::InvalidArgument( + "Fail to get host by name %s.", host)); + } + int i = 0; + while (hp->h_addr_list[i] != NULL) { + ip = inet_ntoa(*(struct in_addr *)hp->h_addr_list[i]); + VLOG(3) << "gethostbyname host:" << host << " ->ip: " << ip; + break; + } + if (inet_pton(AF_INET, ip, &serv_addr.sin_addr) <= 0) { PADDLE_THROW(platform::errors::Unavailable("Open address %s failed.", ep)); } diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index ac36ed77b482f..cbd169f8da77e 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -16,6 +16,7 @@ // network header files #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include <arpa/inet.h> +#include <netdb.h> #include <netinet/in.h> #include <stdlib.h> #include <sys/socket.h>
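`SendNCCLID` above now resolves the host part of an endpoint with `gethostbyname`, so either an IP address or a domain name is accepted (hence the new `<netdb.h>` include, and the `localhost` endpoint in the test diff that follows). Roughly the same resolution step in Python; `resolve_endpoint` is a hypothetical helper written only to illustrate the behavior:

.. code-block:: python

    import socket

    def resolve_endpoint(ep):
        # "host:port" -> (ip, port); the host may be an IP or a name,
        # mirroring what gethostbyname does in the C++ patch.
        host, port = ep.rsplit(':', 1)
        return socket.gethostbyname(host), int(port)

    print(resolve_endpoint('127.0.0.1:9866'))  # ('127.0.0.1', 9866)
    print(resolve_endpoint('localhost:9867'))  # usually ('127.0.0.1', 9867)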
diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index 93ea988d638e4..e0d6950a97e30 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -20,7 +20,7 @@ namespace imperative = paddle::imperative; namespace platform = paddle::platform; imperative::ParallelStrategy GetStrategy(int local_rank) { - std::vector<std::string> eps = {"127.0.0.1:9866", "127.0.0.1:9867"}; + std::vector<std::string> eps = {"127.0.0.1:9866", "localhost:9867"}; imperative::ParallelStrategy strategy; strategy.trainer_endpoints_ = eps; strategy.current_endpoint_ = eps[local_rank]; From 2d45d9a04f50f136c904f76ad3cba78801de5a7a Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 21 Oct 2020 15:25:59 +0800 Subject: [PATCH 021/185] add static_mode_white_list (#28112) * add static_mode_white_list * add Mac CI static list * add Win CI white_list * add Coverage and Py3 CI white_list, add test_unittest --- .../test_dygraph_mode_of_unittest.py | 29 + tools/static_mode_white_list.py | 656 ++++++++++++++++++ tools/test_runner.py | 9 +- 3 files changed, 692 insertions(+), 2 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py create mode 100644 tools/static_mode_white_list.py diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py b/python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py new file mode 100644 index 0000000000000..739a0fbbfd323 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_dygraph_mode_of_unittest.py @@ -0,0 +1,29 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle + + +class TestDygraphModeOfUnittest(unittest.TestCase): + def test_dygraph_mode(self): + self.assertTrue(paddle.in_dynamic_mode( + ), 'Default Mode of Unittest should be dygraph mode, but get static mode.' + ) + + +if __name__ == '__main__': + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py new file mode 100644 index 0000000000000..05e931a9a25ef --- /dev/null +++ b/tools/static_mode_white_list.py @@ -0,0 +1,656 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
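The new unit test above asserts that unit tests now start in dygraph (imperative) mode by default; the white list that follows enumerates the tests that must still run in static mode. The switch itself is a pair of process-wide toggles, shown here with the real paddle APIs:

.. code-block:: python

    import paddle

    print(paddle.in_dynamic_mode())  # True: dygraph is the default

    paddle.enable_static()           # opt a legacy test into static mode
    print(paddle.in_dynamic_mode())  # False

    paddle.disable_static()          # restore the dygraph default
    print(paddle.in_dynamic_mode())  # True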
+ +STATIC_MODE_TESTING_LIST = [ + 'test_affine_channel_op', + 'test_concat_op', + 'test_elementwise_add_op', + 'test_elementwise_sub_op', + 'test_fill_zeros_like2_op', + 'test_linear_chain_crf_op', + 'test_lod_reset_op', + 'test_lookup_table_op', + 'test_pad2d_op', + 'test_scatter_op', + 'test_sequence_concat', + 'test_sequence_conv', + 'test_sequence_pool', + 'test_sequence_expand_as', + 'test_sequence_expand', + 'test_sequence_pad_op', + 'test_sequence_unpad_op', + 'test_sequence_scatter_op', + 'test_sequence_slice_op', + 'test_slice_op', + 'test_space_to_depth_op', + 'test_squared_l2_distance_op', + 'test_accuracy_op', + 'test_activation_nn_grad', + 'test_adadelta_op', + 'test_adagrad_op', + 'test_adam_op', + 'test_adam_optimizer_fp32_fp64', + 'test_adamax_api', + 'test_adamax_op', + 'test_adamw_op', + 'test_adaptive_avg_pool1d', + 'test_adaptive_max_pool1d', + 'test_add_position_encoding_op', + 'test_add_reader_dependency', + 'test_addcmul', + 'test_addmm_op', + 'test_affine_grid_op', + 'test_allclose_layer', + 'test_amp_check_finite_and_scale_op', + 'test_anchor_generator_op', + 'test_arange', + 'test_arg_min_max_op', + 'test_argsort_op', + 'test_array_read_write_op', + 'test_assert_op', + 'test_assign_op', + 'test_assign_value_op', + 'test_attention_lstm_op', + 'test_auc_op', + 'test_auc_single_pred_op', + 'test_avoid_twice_initialization', + 'test_backward', + 'test_basic_rnn_name', + 'test_batch_norm_op', + 'test_batch_norm_op_v2', + 'test_bce_loss', + 'test_beam_search_decode_op', + 'test_beam_search_op', + 'test_bicubic_interp_op', + 'test_bicubic_interp_v2_op', + 'test_bilateral_slice_op', + 'test_bilinear_api', + 'test_bilinear_interp_v2_op', + 'test_bilinear_tensor_product_op', + 'test_bipartite_match_op', + 'test_bmm_op', + 'test_box_clip_op', + 'test_box_coder_op', + 'test_box_decoder_and_assign_op', + 'test_bpr_loss_op', + 'test_calc_gradient', + 'test_case', + 'test_cast_op', + 'test_center_loss', + 'test_cholesky_op', + 'test_chunk_eval_op', + 'test_chunk_op', + 'test_clip_by_norm_op', + 'test_clip_op', + 'test_collect_fpn_proposals_op', + 'test_compare_reduce_op', + 'test_compiled_program', + 'test_cond', + 'test_conditional_block', + 'test_context_manager', + 'test_conv1d_layer', + 'test_conv1d_transpose_layer', + 'test_conv2d_layer', + 'test_conv2d_op', + 'test_conv2d_transpose_layer', + 'test_conv3d_layer', + 'test_conv3d_op', + 'test_conv3d_transpose_layer', + 'test_conv3d_transpose_part2_op', + 'test_conv_nn_grad', + 'test_conv_shift_op', + 'test_cos_sim_op', + 'test_create_global_var', + 'test_crf_decoding_op', + 'test_crop_op', + 'test_crop_tensor_op', + 'test_cross_entropy2_op', + 'test_cross_entropy_loss', + 'test_cross_entropy_op', + 'test_cross_op', + 'test_ctc_align', + 'test_cumsum_op', + 'test_cvm_op', + 'test_data', + 'test_dataloader_early_reset', + 'test_dataloader_keep_order', + 'test_dataloader_unkeep_order', + 'test_debugger', + 'test_decayed_adagrad_op', + 'test_decoupled_py_reader', + 'test_decoupled_py_reader_data_check', + 'test_deformable_conv_v1_op', + 'test_deformable_psroi_pooling', + 'test_density_prior_box_op', + 'test_deprecated_memory_optimize_interfaces', + 'test_dequantize_abs_max_op', + 'test_dequantize_log_op', + 'test_desc_clone', + 'test_detach', + 'test_device', + 'test_device_guard', + 'test_diag_embed', + 'test_distribute_fpn_proposals_op', + 'test_distributed_strategy', + 'test_distributions', + 'test_dot_op', + 'test_downpoursgd', + 'test_dpsgd_op', + 'test_dropout_op', + 'test_dygraph_multi_forward', + 'test_dyn_rnn', + 
'test_dynamic_rnn_stop_gradient', + 'test_dynrnn_gradient_check', + 'test_dynrnn_static_input', + 'test_eager_deletion_conditional_block', + 'test_eager_deletion_delete_vars', + 'test_eager_deletion_gru_net', + 'test_eager_deletion_lstm_net', + 'test_eager_deletion_padding_rnn', + 'test_eager_deletion_recurrent_op', + 'test_eager_deletion_while_op', + 'test_edit_distance_op', + 'test_elementwise_div_op', + 'test_elementwise_floordiv_op', + 'test_elementwise_gradient_op', + 'test_elementwise_max_op', + 'test_elementwise_min_op', + 'test_elementwise_mod_op', + 'test_elementwise_mul_op', + 'test_elementwise_nn_grad', + 'test_elementwise_pow_op', + 'test_ema', + 'test_embedding_id_stop_gradient', + 'test_empty_like_op', + 'test_entry_attr', + 'test_entry_attr2', + 'test_erf_op', + 'test_executor_and_mul', + 'test_executor_and_use_program_cache', + 'test_executor_check_feed', + 'test_executor_feed_non_tensor', + 'test_executor_return_tensor_not_overwriting', + 'test_expand_as_op', + 'test_expand_as_v2_op', + 'test_expand_op', + 'test_expand_v2_op', + 'test_eye_op', + 'test_fake_dequantize_op', + 'test_fake_quantize_op', + 'test_fc_op', + 'test_feed_data_check_shape_type', + 'test_fetch_lod_tensor_array', + 'test_fetch_unmerged', + 'test_fetch_var', + 'test_fill_any_like_op', + 'test_fill_constant_op', + 'test_fill_op', + 'test_fill_zeros_like_op', + 'test_filter_by_instag_op', + 'test_flatten2_op', + 'test_flatten_contiguous_range_op', + 'test_flatten_op', + 'test_fleet', + 'test_fleet_nocvm_1', + 'test_fleet_pyramid_hash', + 'test_fleet_rolemaker', + 'test_fleet_rolemaker_3', + 'test_fleet_unitaccessor', + 'test_fleet_util', + 'test_fleet_utils', + 'test_flip', + 'test_framework_debug_str', + 'test_fsp_op', + 'test_ftrl_op', + 'test_full_like_op', + 'test_full_op', + 'test_functional_conv2d', + 'test_functional_conv2d_transpose', + 'test_functional_conv3d', + 'test_functional_conv3d_transpose', + 'test_fuse_all_reduce_pass', + 'test_fuse_optimizer_pass', + 'test_fuse_relu_depthwise_conv_pass', + 'test_fused_elemwise_activation_op', + 'test_fused_emb_seq_pool_op', + 'test_fused_embedding_fc_lstm_op', + 'test_fusion_gru_op', + 'test_fusion_lstm_op', + 'test_fusion_repeated_fc_relu_op', + 'test_fusion_seqconv_eltadd_relu_op', + 'test_fusion_seqpool_concat_op', + 'test_fusion_seqpool_cvm_concat_op', + 'test_fusion_squared_mat_sub_op', + 'test_gather_tree_op', + 'test_gaussian_random_op', + 'test_generate_mask_labels_op', + 'test_generate_proposal_labels_op', + 'test_generate_proposals_op', + 'test_generator_dataloader', + 'test_get_places_op', + 'test_get_tensor_from_selected_rows_op', + 'test_gradient_clip', + 'test_grid_sample_function', + 'test_grid_sampler_op', + 'test_group_norm_op', + 'test_group_norm_op_v2', + 'test_gru_op', + 'test_gru_unit_op', + 'test_hash_op', + 'test_hinge_loss_op', + 'test_histogram_op', + 'test_huber_loss_op', + 'test_im2sequence_op', + 'test_image_classification_layer', + 'test_imperative_basic', + 'test_imperative_deepcf', + 'test_imperative_framework', + 'test_imperative_gan', + 'test_imperative_gnn', + 'test_imperative_load_static_param', + 'test_imperative_lod_tensor_to_selected_rows', + 'test_imperative_optimizer', + 'test_imperative_ptb_rnn', + 'test_imperative_ptb_rnn_sorted_gradient', + 'test_imperative_recurrent_usage', + 'test_imperative_reinforcement', + 'test_imperative_selected_rows_to_lod_tensor', + 'test_imperative_star_gan_with_gradient_penalty', + 'test_imperative_transformer_sorted_gradient', + 'test_increment', + 'test_index_sample_op', + 
'test_index_select_op', + 'test_infer_no_need_buffer_slots', + 'test_inference_model_io', + 'test_initializer', + 'test_inplace_abn_op', + 'test_inplace_addto_strategy', + 'test_inplace_softmax_with_cross_entropy', + 'test_input_spec', + 'test_instance_norm_op', + 'test_instance_norm_op_v2', + 'test_inverse_op', + 'test_io_save_load', + 'test_iou_similarity_op', + 'test_ir_memory_optimize_ifelse_op', + 'test_ir_memory_optimize_pass', + 'test_is_empty_op', + 'test_isfinite_op', + 'test_kldiv_loss_op', + 'test_kron_op', + 'test_l1_norm_op', + 'test_label_smooth_op', + 'test_lamb_op', + 'test_layer_norm_op', + 'test_layer_norm_op_v2', + 'test_learning_rate_scheduler', + 'test_linear_interp_op', + 'test_linear_interp_v2_op', + 'test_linspace', + 'test_load_op', + 'test_load_vars_shape_check', + 'test_locality_aware_nms_op', + 'test_lod_append_op', + 'test_lod_array_length_op', + 'test_lod_rank_table', + 'test_lod_tensor_array_ops', + 'test_log_loss_op', + 'test_log_softmax', + 'test_logsumexp', + 'test_lookup_table_dequant_op', + 'test_lookup_table_v2_op', + 'test_lrn_op', + 'test_lstm_op', + 'test_lstmp_op', + 'test_margin_rank_loss_op', + 'test_math_op_patch', + 'test_matmul_op', + 'test_matmul_v2_op', + 'test_matrix_nms_op', + 'test_mean_iou', + 'test_memory_reuse_exclude_feed_var', + 'test_memory_usage', + 'test_merge_ids_op', + 'test_meshgrid_op', + 'test_mine_hard_examples_op', + 'test_minus_op', + 'test_mish_op', + 'test_modified_huber_loss_op', + 'test_momentum_op', + 'test_monitor', + 'test_mse_loss', + 'test_mul_op', + 'test_multiclass_nms_op', + 'test_multihead_attention', + 'test_multiplex_op', + 'test_multiprocess_reader_exception', + 'test_name_scope', + 'test_nce', + 'test_nearest_interp_v2_op', + 'test_network_with_dtype', + 'test_nll_loss', + 'test_nn_functional_embedding_static', + 'test_nn_functional_hot_op', + 'test_nonzero_api', + 'test_norm_all', + 'test_norm_nn_grad', + 'test_norm_op', + 'test_normal', + 'test_normalization_wrapper', + 'test_npair_loss_op', + 'test_numel_op', + 'test_one_hot_op', + 'test_one_hot_v2_op', + 'test_ones_like', + 'test_ones_op', + 'test_op_name_conflict', + 'test_operator_desc', + 'test_optimizer', + 'test_optimizer_in_control_flow', + 'test_pad_constant_like', + 'test_pad_op', + 'test_pairwise_distance', + 'test_parallel_executor_drop_scope', + 'test_parallel_executor_dry_run', + 'test_parallel_executor_feed_persistable_var', + 'test_parallel_executor_inference_feed_partial_data', + 'test_parallel_executor_mnist', + 'test_parallel_executor_run_load_infer_program', + 'test_parallel_executor_test_while_train', + 'test_parallel_ssa_graph_inference_feed_partial_data', + 'test_parameter', + 'test_partial_concat_op', + 'test_partial_eager_deletion_transformer', + 'test_partial_sum_op', + 'test_pass_builder', + 'test_pixel_shuffle', + 'test_polygon_box_transform', + 'test_pool1d_api', + 'test_pool2d_api', + 'test_pool2d_op', + 'test_pool3d_api', + 'test_pool3d_op', + 'test_pool_max_op', + 'test_positive_negative_pair_op', + 'test_precision_recall_op', + 'test_prelu_op', + 'test_print_op', + 'test_prior_box_op', + 'test_profiler', + 'test_program', + 'test_program_code', + 'test_program_prune_backward', + 'test_program_to_string', + 'test_protobuf_descs', + 'test_proximal_adagrad_op', + 'test_proximal_gd_op', + 'test_prroi_pool_op', + 'test_prune', + 'test_psroi_pool_op', + 'test_py_func_op', + 'test_py_reader_combination', + 'test_py_reader_lod_level_share', + 'test_py_reader_pin_memory', + 'test_py_reader_push_pop', + 'test_py_reader_return_list', 
+ 'test_py_reader_sample_generator', + 'test_py_reader_using_executor', + 'test_pyramid_hash_op', + 'test_queue', + 'test_randint_op', + 'test_randn_op', + 'test_random_crop_op', + 'test_randperm_op', + 'test_range', + 'test_rank_loss_op', + 'test_reader_reset', + 'test_recurrent_op', + 'test_reduce_op', + 'test_ref_by_trainer_id_op', + 'test_registry', + 'test_regularizer', + 'test_regularizer_api', + 'test_reorder_lod_tensor', + 'test_reshape_op', + 'test_retinanet_detection_output', + 'test_reverse_op', + 'test_rmsprop_op', + 'test_rnn_cell_api', + 'test_rnn_memory_helper_op', + 'test_roi_align_op', + 'test_roi_perspective_transform_op', + 'test_roi_pool_op', + 'test_roll_op', + 'test_row_conv', + 'test_row_conv_op', + 'test_rpn_target_assign_op', + 'test_run_program_op', + 'test_runtime_and_compiletime_exception', + 'test_sample_logits_op', + 'test_save_model_without_var', + 'test_scale_op', + 'test_scaled_dot_product_attention', + 'test_scatter_nd_op', + 'test_seed_op', + 'test_segment_ops', + 'test_select_input_output_op', + 'test_selu_op', + 'test_set_bool_attr', + 'test_sgd_op', + 'test_shape_op', + 'test_shard_index_op', + 'test_shrink_rnn_memory', + 'test_shuffle_batch_op', + 'test_shuffle_channel_op', + 'test_sigmoid_cross_entropy_with_logits_op', + 'test_sigmoid_focal_loss_op', + 'test_sign_op', + 'test_similarity_focus_op', + 'test_size_op', + 'test_smooth_l1_loss', + 'test_smooth_l1_loss_op', + 'test_softmax_with_cross_entropy_op', + 'test_spectral_norm_op', + 'test_split_and_merge_lod_tensor_op', + 'test_split_ids_op', + 'test_split_op', + 'test_spp_op', + 'test_square_error_cost', + 'test_squared_l2_norm_op', + 'test_stack_op', + 'test_static_save_load', + 'test_sum_op', + 'test_switch', + 'test_switch_case', + 'test_target_assign_op', + 'test_tdm_child_op', + 'test_tdm_sampler_op', + 'test_teacher_student_sigmoid_loss_op', + 'test_temporal_shift_op', + 'test_tensor_array_to_tensor', + 'test_tile_op', + 'test_top_k_op', + 'test_trace_op', + 'test_trainable', + 'test_transpose_op', + 'test_tree_conv_op', + 'test_tril_triu_op', + 'test_trilinear_interp_op', + 'test_trilinear_interp_v2_op', + 'test_truncated_gaussian_random_op', + 'test_unbind_op', + 'test_unfold_op', + 'test_uniform_random_op', + 'test_unique', + 'test_unique_with_counts', + 'test_unpool_op', + 'test_unstack_op', + 'test_update_loss_scaling_op', + 'test_var_info', + 'test_variable', + 'test_weight_normalization', + 'test_where_index', + 'test_where_op', + 'test_yolo_box_op', + 'test_yolov3_loss_op', + 'test_zeros_like_op', + 'test_zeros_op', + 'test_adam_op_multi_thread', + 'test_bilinear_interp_op', + 'test_nearest_interp_op', + 'test_imperative_resnet', + 'test_imperative_resnet_sorted_gradient', + 'test_imperative_mnist', + 'test_imperative_mnist_sorted_gradient', + 'test_imperative_se_resnext', + 'test_imperative_ocr_attention_model', + 'test_imperative_static_runner_mnist', + 'test_imperative_static_runner_while', + 'test_recv_save_op', + 'test_transpiler_ops', + 'test_communicator_sync', + 'test_collective_optimizer', + 'test_parallel_executor_crf', + 'test_parallel_executor_profiler', + 'test_parallel_executor_transformer', + 'test_parallel_executor_transformer_auto_growth', + 'test_data_norm_op', + 'test_fuse_bn_act_pass', + 'test_parallel_executor_seresnext_base_cpu', + 'test_parallel_executor_seresnext_with_reduce_cpu', + 'test_parallel_executor_seresnext_with_fuse_all_reduce_cpu', + 'test_layers', + 'test_parallel_executor_fetch_feed', + 'test_sequence_concat', + 'test_sequence_conv', + 
'test_sequence_enumerate_op', + 'test_sequence_erase_op', + 'test_sequence_expand', + 'test_sequence_expand_as', + 'test_sequence_first_step', + 'test_sequence_last_step', + 'test_sequence_mask', + 'test_sequence_pad_op', + 'test_sequence_pool', + 'test_sequence_reshape', + 'test_sequence_reverse', + 'test_sequence_scatter_op', + 'test_sequence_slice_op', + 'test_sequence_softmax_op', + 'test_sequence_topk_avg_pooling', + 'test_sequence_unpad_op', + 'test_ast_util', + 'test_basic_api_transformation', + 'test_function_spec', + 'test_len', + 'test_slice', + 'test_variable_trans_func', + 'test_ir_embedding_eltwise_layernorm_fuse_pass', + 'test_ir_fc_fuse_pass', + 'test_ir_skip_layernorm_pass', + 'test_conv_affine_channel_fuse_pass', + 'test_conv_bias_mkldnn_fuse_pass', + 'test_conv_bn_fuse_pass', + 'test_conv_elementwise_add2_act_fuse_pass', + 'test_conv_elementwise_add_act_fuse_pass', + 'test_conv_elementwise_add_fuse_pass', + 'test_fc_fuse_pass', + 'test_fc_gru_fuse_pass', + 'test_fc_lstm_fuse_pass', + 'test_repeated_fc_relu_fuse_pass', + 'test_seqconv_eltadd_relu_fuse_pass', + 'test_squared_mat_sub_fuse_pass', + 'test_transpose_flatten_concat_fuse_pass', + 'test_detection_map_op', + 'test_fuse_elewise_add_act_pass', + 'test_fusion_seqexpand_concat_fc_op', + 'test_match_matrix_tensor_op', + 'test_matmul_op_with_head', + 'test_var_conv_2d', + 'test_batch_norm_mkldnn_op', + 'test_concat_int8_mkldnn_op', + 'test_concat_mkldnn_op', + 'test_conv2d_bf16_mkldnn_op', + 'test_conv2d_int8_mkldnn_op', + 'test_conv2d_mkldnn_op', + 'test_conv2d_transpose_mkldnn_op', + 'test_conv3d_mkldnn_op', + 'test_dequantize_mkldnn_op', + 'test_elementwise_add_mkldnn_op', + 'test_elementwise_mul_mkldnn_op', + 'test_fc_mkldnn_op', + 'test_fusion_gru_int8_mkldnn_op', + 'test_fusion_gru_mkldnn_op', + 'test_gaussian_random_mkldnn_op', + 'test_lrn_mkldnn_op', + 'test_matmul_mkldnn_op', + 'test_mul_int8_mkldnn_op', + 'test_pool2d_int8_mkldnn_op', + 'test_pool2d_mkldnn_op', + 'test_quantize_mkldnn_op', + 'test_requantize_mkldnn_op', + 'test_softmax_mkldnn_op', + 'test_sum_mkldnn_op', + 'test_transpose_int8_mkldnn_op', + 'test_transpose_mkldnn_op', + 'test_mkldnn_conv_activation_fuse_pass', + 'test_mkldnn_conv_concat_relu_mkldnn_fuse_pass', + 'test_mkldnn_matmul_op_output_fuse_pass', + 'test_mkldnn_matmul_transpose_reshape_fuse_pass', + 'test_mkldnn_scale_matmul_fuse_pass', + 'test_batch_fc_op', + 'test_c_comm_init_all_op', + 'test_conv2d_fusion_op', + 'test_dataset_dataloader', + 'test_fleet_metric', + 'test_fused_bn_add_act', + 'test_fused_multihead_matmul_op', + 'test_ir_inplace_pass', + 'test_mix_precision_all_reduce_fuse', + 'test_parallel_executor_pg', + 'test_rank_attention_op', + 'test_fleet_base', + 'test_fleet_graph_executor', + 'test_fleet_meta_optimizer_base', + 'test_ir_memory_optimize_transformer', + 'test_trt_fc_fuse_pass', + 'test_trt_quant_conv2d_dequant_fuse_pass', + 'test_trt_slice_plugin', + 'test_trt_transpose_flatten_concat_fuse_pass', + 'test_mean_op', + 'test_build_strategy_fusion_group_pass', + 'test_coalesce_tensor_op', + 'test_dataset', + 'test_fleet_base_single', + 'test_fleet_rolemaker_new', + 'test_fused_fc_elementwise_layernorm_op', + 'test_fusion_transpose_flatten_concat_op', + 'test_ir_memory_optimize_nlp', + 'test_nvprof', + 'test_pipeline', + 'test_weight_decay', + 'test_fleet_base_2', + 'test_fleet_pipeline_meta_optimizer', + 'test_fleet_checkpoint', + 'test_ir_fusion_group_pass', + 'test_trt_pad_op', + 'test_trt_shuffle_channel_detect_pass', + 'test_trt_subgraph_pass', + 
'test_parallel_executor_seresnext_base_gpu', + 'test_parallel_executor_seresnext_with_fuse_all_reduce_gpu', + 'test_parallel_executor_seresnext_with_reduce_gpu', + 'test_sync_batch_norm_op', + 'test_multiprocess_dataloader_iterable_dataset_static', + 'test_multiprocess_dataloader_static', +] diff --git a/tools/test_runner.py b/tools/test_runner.py index bad98f9b5c3e8..248819a8d475e 100644 --- a/tools/test_runner.py +++ b/tools/test_runner.py @@ -21,13 +21,17 @@ import paddle.fluid as fluid import importlib from six.moves import cStringIO +import static_mode_white_list def main(): - paddle.enable_static() sys.path.append(os.getcwd()) some_test_failed = False for module_name in sys.argv[1:]: + flag_need_static_mode = False + if module_name in static_mode_white_list.STATIC_MODE_TESTING_LIST: + flag_need_static_mode = True + paddle.enable_static() buffer = cStringIO() main = fluid.Program() startup = fluid.Program() @@ -46,7 +50,8 @@ def main(): 'failed\n', buffer.getvalue(), file=sys.stderr) - paddle.disable_static() + if flag_need_static_mode: + paddle.disable_static() if some_test_failed: exit(1) From bc4606922578d2efab4d75425e15b8c380973b59 Mon Sep 17 00:00:00 2001 From: liu zhengxi <380185688@qq.com> Date: Wed, 21 Oct 2020 16:32:48 +0800 Subject: [PATCH 022/185] fix dynamic decode imperative (#28160) --- python/paddle/fluid/layers/rnn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/rnn.py b/python/paddle/fluid/layers/rnn.py index 079187e09c916..05272a7cefb08 100644 --- a/python/paddle/fluid/layers/rnn.py +++ b/python/paddle/fluid/layers/rnn.py @@ -1395,7 +1395,7 @@ def _maybe_copy(state, new_state, step_mask): control_flow.increment(x=step_idx_tensor, value=1.0, in_place=True) step_idx += 1 - control_flow.logical_not(nn.reduce_all(finished), cond) + cond = control_flow.logical_not(nn.reduce_all(finished)) if max_step_num is not None and step_idx > max_step_num: break From 68c473e3e01bcd147049e34274c71b91873b8d6d Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Wed, 21 Oct 2020 17:06:43 +0800 Subject: [PATCH 023/185] fix Automatic GPU detection failed on windows (#28148) --- cmake/cuda.cmake | 4 ---- paddle/scripts/paddle_build.bat | 22 ++++++++++++++-------- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 146cbee1c6a88..83c00acfc638a 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -62,10 +62,6 @@ function(detect_installed_gpus out_variable) if(NOT CUDA_gpu_detect_output) message(STATUS "Automatic GPU detection failed. Building for all known architectures.") set(${out_variable} ${paddle_known_gpu_archs} PARENT_SCOPE) - #Todo: fix Automatic GPU detection failed on windows - if(WIN32) - set(${out_variable} "61 75" PARENT_SCOPE) - endif() else() set(${out_variable} ${CUDA_gpu_detect_output} PARENT_SCOPE) endif() diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index afb3f360a9abd..8b1377415d481 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -1,6 +1,3 @@ -@ECHO ON -SETLOCAL - rem Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
rem rem Licensed under the Apache License, Version 2.0 (the "License"); @@ -19,9 +16,13 @@ rem ================================================= rem Paddle CI Task On Windows Platform rem ================================================= +@ECHO ON +SETLOCAL + rem -------clean up environment----------- set work_dir=%cd% -wmic process where name="op_function_generator.exe" call terminate 2>NUL +taskkill /f /im op_function_generator.exe +wmic process where name="op_function_generator.exe" call terminate rem ------initialize common variable------ if not defined CUDA_TOOLKIT_ROOT_DIR set CUDA_TOOLKIT_ROOT_DIR="C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v10.0" @@ -175,6 +176,7 @@ rem ---------------------------------------------------------------------------- echo ======================================== echo Step 1. Cmake ... echo ======================================== +call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 for /F %%# in ('wmic os get localdatetime^|findstr 20') do set start=%%# set start=%start:~4,10% @@ -201,9 +203,8 @@ rem ---------------------------------------------------------------------------- echo ======================================== echo Step 2. Buile Paddle ... echo ======================================== -call "C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\vcvarsall.bat" amd64 -for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*8/10 +for /F %%# in ('wmic cpu get NumberOfLogicalProcessors^|findstr [0-9]') do set /a PARALLEL_PROJECT_COUNT=%%#*9/10 set build_times=1 :build_tp echo Build third_party the %build_times% time: @@ -248,13 +249,18 @@ echo ======================================== echo Step 3. Test pip install whl package ... 
echo ======================================== +setlocal enabledelayedexpansion + for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# set end=%end:~4,10% call :timestamp "%start%" "%end%" "Build" tree /F %cd%\paddle_inference_install_dir\paddle -%cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\paddle_inference_install_dir\paddle\lib > lib_size.txt +%cache_dir%\tools\busybox64.exe du -h -d 0 -k %cd%\paddle_inference_install_dir\paddle\lib > lib_size.txt set /p libsize=< lib_size.txt -for /F %%i in ("%libsize%") do echo "Windows Paddle_Inference Size: %%i" +for /F %%i in ("%libsize%") do ( + set /a libsize_m=%%i/1024 + echo "Windows Paddle_Inference Size: !libsize_m!M" +) %cache_dir%\tools\busybox64.exe du -h -d 0 %cd%\python\dist > whl_size.txt set /p whlsize=< whl_size.txt for /F %%i in ("%whlsize%") do echo "Windows PR whl Size: %%i" From 7c1aa0d69dd21d7db98b1c46873f3a028e344e95 Mon Sep 17 00:00:00 2001 From: cnn Date: Wed, 21 Oct 2020 17:32:38 +0800 Subject: [PATCH 024/185] 2.0rc api rename (#28088) * rename manual_seed to seed * rename xxx1d-->xxx1D, xxx2d-->xxx2D, xxx3d-->xxx3D * rename manual_seed --> seed * do not rename .cc, .cu and .h file * rename manual_seed --> seed * rename manual_seed --> seed * rename manual_seed --> seed * rename manual_seed --> seed * disable_static on doc example code * donot change manual_seed on generator * add enable_static on sample code * convert python/paddle/fluid/layers/nn.py to bak * fix typo * fix code style * fix seed to manual_seed when call functions of Generator() * fix bug --- python/paddle/__init__.py | 2 +- python/paddle/amp/auto_cast.py | 2 +- python/paddle/amp/grad_scaler.py | 6 +- python/paddle/distribution.py | 20 +-- .../contrib/tests/test_weight_decay_extend.py | 4 +- python/paddle/fluid/dygraph/layers.py | 2 +- python/paddle/fluid/dygraph/nn.py | 8 +- python/paddle/fluid/initializer.py | 2 +- python/paddle/fluid/nets.py | 48 ++++--- .../unittests/dygraph_to_static/test_bmn.py | 2 +- .../unittests/dygraph_to_static/test_lac.py | 2 +- .../dygraph_to_static/test_mobile_net.py | 2 +- .../dygraph_to_static/test_ptb_lm.py | 2 +- .../dygraph_to_static/test_ptb_lm_v2.py | 2 +- .../test_reinforcement_learning.py | 2 +- .../dygraph_to_static/test_resnet.py | 2 +- .../dygraph_to_static/test_resnet_v2.py | 4 +- .../dygraph_to_static/test_se_resnet.py | 2 +- .../dygraph_to_static/test_sentiment.py | 2 +- .../dygraph_to_static/test_simnet.py | 2 +- .../dygraph_to_static/test_simnet_v2.py | 2 +- .../dygraph_to_static/test_transformer.py | 8 +- .../unittests/dygraph_to_static/test_tsm.py | 2 +- .../mkldnn/test_conv2d_bf16_mkldnn_op.py | 22 +-- .../mkldnn/test_conv2d_int8_mkldnn_op.py | 30 ++-- .../unittests/mkldnn/test_conv2d_mkldnn_op.py | 48 +++---- .../mkldnn/test_conv2d_transpose_mkldnn_op.py | 36 ++--- .../unittests/mkldnn/test_conv3d_mkldnn_op.py | 10 +- .../mkldnn/test_pool2d_int8_mkldnn_op.py | 10 +- .../parallel_dygraph_sync_batch_norm.py | 6 +- .../unittests/parallel_executor_test_base.py | 2 +- .../tests/unittests/rnn/test_rnn_nets.py | 2 +- .../unittests/test_adaptive_avg_pool1d.py | 4 +- .../unittests/test_adaptive_avg_pool2d.py | 24 ++-- .../unittests/test_adaptive_avg_pool3d.py | 30 ++-- .../unittests/test_adaptive_max_pool1d.py | 4 +- .../unittests/test_adaptive_max_pool2d.py | 24 ++-- .../unittests/test_adaptive_max_pool3d.py | 30 ++-- .../tests/unittests/test_batch_norm_op_v2.py | 32 ++--- .../test_buffer_shared_memory_reuse_pass.py | 2 +- .../tests/unittests/test_compiled_program.py | 6 +- 
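(For quick reference, the user-facing effect of the renames in PATCH 024 is sketched below. This is a minimal illustration assuming the post-patch 2.0rc API; the shapes and variable names are invented for the example and are not taken from the patch itself.)

    import paddle

    # 1.x-style spellings renamed by this patch:
    #   paddle.manual_seed(2020)
    #   conv = paddle.nn.Conv2d(3, 8, 3)
    #   pool = paddle.nn.AdaptiveAvgPool2d(output_size=1)

    # 2.0rc spellings after this patch:
    paddle.seed(2020)                                  # manual_seed -> seed
    conv = paddle.nn.Conv2D(3, 8, 3)                   # Conv2d -> Conv2D
    pool = paddle.nn.AdaptiveAvgPool2D(output_size=1)  # AdaptiveAvgPool2d -> AdaptiveAvgPool2D

    x = paddle.rand([4, 3, 32, 32])
    y = pool(conv(x))                                  # y.shape: [4, 8, 1, 1]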
.../tests/unittests/test_conv1d_layer.py | 46 +++--- .../unittests/test_conv1d_transpose_layer.py | 44 +++--- .../tests/unittests/test_conv2d_fusion_op.py | 36 ++--- .../tests/unittests/test_conv2d_layer.py | 2 +- .../fluid/tests/unittests/test_conv2d_op.py | 114 +++++++-------- .../unittests/test_conv2d_transpose_layer.py | 2 +- .../unittests/test_conv2d_transpose_op.py | 60 ++++---- .../tests/unittests/test_conv3d_layer.py | 2 +- .../fluid/tests/unittests/test_conv3d_op.py | 62 ++++---- .../unittests/test_conv3d_transpose_layer.py | 2 +- .../unittests/test_conv3d_transpose_op.py | 22 +-- .../test_conv3d_transpose_part2_op.py | 16 +-- .../tests/unittests/test_conv_nn_grad.py | 2 +- .../tests/unittests/test_cuda_random_seed.py | 14 +- .../unittests/test_decoupled_py_reader.py | 2 +- .../unittests/test_deformable_conv_op.py | 2 +- .../fluid/tests/unittests/test_dropout_op.py | 16 +-- .../unittests/test_dygraph_multi_forward.py | 4 +- .../unittests/test_dygraph_weight_norm.py | 4 +- .../test_eager_deletion_padding_rnn.py | 2 +- .../test_embedding_id_stop_gradient.py | 2 +- .../fluid/tests/unittests/test_fc_op.py | 2 +- .../tests/unittests/test_fuse_bn_act_pass.py | 2 +- .../tests/unittests/test_fused_bn_add_act.py | 2 +- .../unittests/test_gaussian_random_op.py | 2 +- .../fluid/tests/unittests/test_generator.py | 2 - .../unittests/test_generator_dataloader.py | 2 +- .../fluid/tests/unittests/test_hsigmoid_op.py | 2 +- .../test_imperative_auto_mixed_precision.py | 6 +- .../tests/unittests/test_imperative_deepcf.py | 6 +- .../unittests/test_imperative_double_grad.py | 4 +- .../tests/unittests/test_imperative_gan.py | 6 +- .../tests/unittests/test_imperative_gnn.py | 6 +- .../unittests/test_imperative_layer_apply.py | 8 +- .../test_imperative_layer_children.py | 4 +- ..._imperative_lod_tensor_to_selected_rows.py | 4 +- .../test_imperative_ocr_attention_model.py | 4 +- .../unittests/test_imperative_optimizer.py | 6 +- .../unittests/test_imperative_optimizer_v2.py | 6 +- .../unittests/test_imperative_ptb_rnn.py | 4 +- ...test_imperative_ptb_rnn_sorted_gradient.py | 4 +- .../test_imperative_reinforcement.py | 4 +- .../tests/unittests/test_imperative_resnet.py | 4 +- .../test_imperative_resnet_sorted_gradient.py | 4 +- .../unittests/test_imperative_save_load.py | 12 +- .../unittests/test_imperative_save_load_v2.py | 14 +- .../unittests/test_imperative_se_resnext.py | 4 +- ..._imperative_selected_rows_to_lod_tensor.py | 4 +- ...perative_star_gan_with_gradient_penalty.py | 4 +- ..._imperative_transformer_sorted_gradient.py | 4 +- .../unittests/test_inplace_addto_strategy.py | 2 +- .../unittests/test_instance_norm_op_v2.py | 12 +- .../test_ir_memory_optimize_ifelse_op.py | 2 +- .../tests/unittests/test_jit_save_load.py | 10 +- .../fluid/tests/unittests/test_layers.py | 4 +- .../fluid/tests/unittests/test_manual_seed.py | 6 +- .../fluid/tests/unittests/test_normal.py | 8 +- .../tests/unittests/test_paddle_save_load.py | 2 +- .../fluid/tests/unittests/test_pool1d_api.py | 12 +- .../fluid/tests/unittests/test_pool2d_api.py | 24 ++-- .../fluid/tests/unittests/test_pool2d_op.py | 4 +- .../fluid/tests/unittests/test_pool3d_api.py | 22 +-- .../fluid/tests/unittests/test_pool3d_op.py | 54 +++---- .../fluid/tests/unittests/test_py_func_op.py | 2 +- .../fluid/tests/unittests/test_random_seed.py | 36 ++--- .../fluid/tests/unittests/test_regularizer.py | 6 +- .../tests/unittests/test_regularizer_api.py | 6 +- .../tests/unittests/test_retain_graph.py | 6 +- .../tests/unittests/test_rnn_decode_api.py | 2 +- 
.../unittests/test_sync_batch_norm_op.py | 6 +- .../tests/unittests/test_transformer_api.py | 6 +- .../tests/unittests/test_translated_layer.py | 2 +- .../tests/unittests/test_uniform_random_op.py | 12 +- .../fluid/tests/unittests/test_var_base.py | 2 +- .../fluid/tests/unittests/test_var_conv_2d.py | 18 +-- .../tests/unittests/xpu/test_conv2d_op_xpu.py | 36 ++--- python/paddle/framework/__init__.py | 7 +- python/paddle/framework/random.py | 8 +- python/paddle/hapi/model_summary.py | 8 +- python/paddle/nn/__init__.py | 52 +++---- python/paddle/nn/functional/conv.py | 8 +- python/paddle/nn/functional/norm.py | 4 +- python/paddle/nn/layer/__init__.py | 40 +++--- python/paddle/nn/layer/common.py | 20 +-- python/paddle/nn/layer/conv.py | 80 ++++++----- python/paddle/nn/layer/norm.py | 36 ++--- python/paddle/nn/layer/pooling.py | 136 +++++++++--------- python/paddle/regularizer.py | 8 +- python/paddle/tensor/random.py | 10 +- python/paddle/tensor/to_string.py | 2 +- python/paddle/tests/test_model.py | 12 +- python/paddle/vision/models/lenet.py | 8 +- python/paddle/vision/models/mobilenetv1.py | 6 +- python/paddle/vision/models/mobilenetv2.py | 12 +- python/paddle/vision/models/resnet.py | 24 ++-- python/paddle/vision/models/vgg.py | 8 +- 137 files changed, 929 insertions(+), 906 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 3640dd22bb0cd..54e51200dc745 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -222,7 +222,7 @@ from .tensor.to_string import set_printoptions -from .framework.random import manual_seed #DEFINE_ALIAS +from .framework.random import seed #DEFINE_ALIAS from .framework.random import get_cuda_rng_state #DEFINE_ALIAS from .framework.random import set_cuda_rng_state #DEFINE_ALIAS from .framework import ParamAttr #DEFINE_ALIAS diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index e33f6e2afc846..63c7d999fde77 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -37,7 +37,7 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None): import paddle - conv2d = paddle.nn.Conv2d(3, 2, 3, bias_attr=False) + conv2d = paddle.nn.Conv2D(3, 2, 3, bias_attr=False) data = paddle.rand([10, 3, 32, 32]) with paddle.amp.auto_cast(): diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index 0e43e5a6a17fe..e3cd05dcb30a8 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -50,7 +50,7 @@ class GradScaler(AmpScaler): import paddle - model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True) + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) scaler = paddle.amp.GradScaler(init_loss_scaling=1024) data = paddle.rand([10, 3, 32, 32]) @@ -90,7 +90,7 @@ def scale(self, var): import paddle - model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True) + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) scaler = paddle.amp.GradScaler(init_loss_scaling=1024) data = paddle.rand([10, 3, 32, 32]) @@ -122,7 +122,7 @@ def minimize(self, optimizer, *args, **kwargs): import paddle - model = paddle.nn.Conv2d(3, 2, 3, bias_attr=True) + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) scaler = paddle.amp.GradScaler(init_loss_scaling=1024) data = paddle.rand([10, 3, 32, 32]) diff --git 
a/python/paddle/distribution.py b/python/paddle/distribution.py index 9133751a5309f..e9a15feb5170f 100644 --- a/python/paddle/distribution.py +++ b/python/paddle/distribution.py @@ -670,13 +670,13 @@ class Categorical(Distribution): import paddle from paddle.distribution import Categorical - paddle.manual_seed(100) # on CPU device + paddle.seed(100) # on CPU device x = paddle.rand([6]) print(x.numpy()) # [0.5535528 0.20714243 0.01162981 # 0.51577556 0.36369765 0.2609165 ] - paddle.manual_seed(200) # on CPU device + paddle.seed(200) # on CPU device y = paddle.rand([6]) print(y.numpy()) # [0.77663314 0.90824795 0.15685187 @@ -685,7 +685,7 @@ class Categorical(Distribution): cat = Categorical(x) cat2 = Categorical(y) - paddle.manual_seed(1000) # on CPU device + paddle.seed(1000) # on CPU device cat.sample([2,3]) # [[0, 0, 5], # [3, 4, 5]] @@ -744,7 +744,7 @@ def sample(self, shape): import paddle from paddle.distribution import Categorical - paddle.manual_seed(100) # on CPU device + paddle.seed(100) # on CPU device x = paddle.rand([6]) print(x.numpy()) # [0.5535528 0.20714243 0.01162981 @@ -752,7 +752,7 @@ def sample(self, shape): cat = Categorical(x) - paddle.manual_seed(1000) # on CPU device + paddle.seed(1000) # on CPU device cat.sample([2,3]) # [[0, 0, 5], # [3, 4, 5]] @@ -791,13 +791,13 @@ def kl_divergence(self, other): import paddle from paddle.distribution import Categorical - paddle.manual_seed(100) # on CPU device + paddle.seed(100) # on CPU device x = paddle.rand([6]) print(x.numpy()) # [0.5535528 0.20714243 0.01162981 # 0.51577556 0.36369765 0.2609165 ] - paddle.manual_seed(200) # on CPU device + paddle.seed(200) # on CPU device y = paddle.rand([6]) print(y.numpy()) # [0.77663314 0.90824795 0.15685187 @@ -842,7 +842,7 @@ def entropy(self): import paddle from paddle.distribution import Categorical - paddle.manual_seed(100) # on CPU device + paddle.seed(100) # on CPU device x = paddle.rand([6]) print(x.numpy()) # [0.5535528 0.20714243 0.01162981 @@ -887,7 +887,7 @@ def probs(self, value): import paddle from paddle.distribution import Categorical - paddle.manual_seed(100) # on CPU device + paddle.seed(100) # on CPU device x = paddle.rand([6]) print(x.numpy()) # [0.5535528 0.20714243 0.01162981 @@ -953,7 +953,7 @@ def log_prob(self, value): import paddle from paddle.distribution import Categorical - paddle.manual_seed(100) # on CPU device + paddle.seed(100) # on CPU device x = paddle.rand([6]) print(x.numpy()) # [0.5535528 0.20714243 0.01162981 diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py index 6000a44ceb659..5ed7fd01a433b 100644 --- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py +++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py @@ -114,7 +114,7 @@ def run_program(self, place, feed_list): return param_sum def check_weight_decay(self, place, model): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() @@ -137,7 +137,7 @@ def check_weight_decay(self, place, model): return param_sum def check_weight_decay2(self, place, model): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 3ae6d384be7e3..6fa531c573daa 100644 --- 
a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1058,7 +1058,7 @@ def __init__(self): super(Mylayer, self).__init__() self.linear1 = paddle.nn.Linear(10, 10) self.linear2 = paddle.nn.Linear(5, 5) - self.conv2d = paddle.nn.Conv2d(3, 2, 3) + self.conv2d = paddle.nn.Conv2D(3, 2, 3) self.embedding = paddle.nn.Embedding(128, 16) self.h_0 = paddle.to_tensor(np.zeros([10, 10]).astype('float32')) diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index 9a23e11b8a8bc..214a7cb802e6f 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -110,7 +110,7 @@ class Conv2D(layers.Layer): dilation (int or tuple, optional): The dilation size. If dilation is a tuple, it must contain two integers, (dilation_H, dilation_W). Otherwise, the dilation_H = dilation_W = dilation. Default: 1. - groups (int, optional): The groups number of the Conv2d Layer. According to grouped + groups (int, optional): The groups number of the Conv2D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only @@ -345,7 +345,7 @@ class Conv3D(layers.Layer): dilation (int|tuple, optional): The dilation size. If dilation is a tuple, it must contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups (int, optional): The groups number of the Conv3d Layer. According to grouped + groups (int, optional): The groups number of the Conv3D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only @@ -574,7 +574,7 @@ class Conv3DTranspose(layers.Layer): dilation(int|tuple, optional): The dilation size. If dilation is a tuple, it must contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by + groups(int, optional): The groups number of the Conv3D transpose layer. Inspired by grouped convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the @@ -2541,7 +2541,7 @@ class Conv2DTranspose(layers.Layer): dilation(int or tuple, optional): The dilation size. If dilation is a tuple, it must contain two integers, (dilation_H, dilation_W). Otherwise, the dilation_H = dilation_W = dilation. Default: 1. - groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by + groups(int, optional): The groups number of the Conv2D transpose layer. 
Inspired by grouped convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the diff --git a/python/paddle/fluid/initializer.py b/python/paddle/fluid/initializer.py index c21a96cb0108e..46fd93278850e 100644 --- a/python/paddle/fluid/initializer.py +++ b/python/paddle/fluid/initializer.py @@ -749,7 +749,7 @@ class BilinearInitializer(Initializer): regularizer=L2Decay(0.), initializer=nn.initializer.Bilinear()) data = paddle.rand([B, 3, H, W], dtype='float32') - conv_up = nn.ConvTranspose2d(3, + conv_up = nn.Conv2DTranspose(3, out_channels=C, kernel_size=2 * factor - factor % 2, padding=int( diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 8621fc654464c..8df8f6b689146 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -43,7 +43,7 @@ def simple_img_conv_pool(input, act=None, use_cudnn=True): """ - :api_attr: Static Graph + :api_attr: Static Graph The simple_img_conv_pool api is composed of :ref:`api_fluid_layers_conv2d` and :ref:`api_fluid_layers_pool2d` . @@ -106,6 +106,8 @@ def simple_img_conv_pool(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() img = fluid.data(name='img', shape=[100, 1, 28, 28], dtype='float32') conv_pool = fluid.nets.simple_img_conv_pool(input=img, filter_size=5, @@ -151,37 +153,37 @@ def img_conv_group(input, pool_type="max", use_cudnn=True): """ - :api_attr: Static Graph + :api_attr: Static Graph The Image Convolution Group is composed of Convolution2d, BatchNorm, DropOut, - and Pool2d. According to the input arguments, img_conv_group will do serials of + and Pool2D. According to the input arguments, img_conv_group will do serials of computation for Input using Convolution2d, BatchNorm, DropOut, and pass the last - result to Pool2d. + result to Pool2D. Args: input (Variable): The input is 4-D Tensor with shape [N, C, H, W], the data type of input is float32 or float64. conv_num_filter(list|tuple): Indicates the numbers of filter of this group. - pool_size (int|list|tuple): The pooling size of Pool2d Layer. If pool_size + pool_size (int|list|tuple): The pooling size of Pool2D Layer. If pool_size is a list or tuple, it must contain two integers, (pool_size_height, pool_size_width). Otherwise, the pool_size_height = pool_size_width = pool_size. - conv_padding (int|list|tuple): The padding size of the Conv2d Layer. If padding is + conv_padding (int|list|tuple): The padding size of the Conv2D Layer. If padding is a list or tuple, its length must be equal to the length of conv_num_filter. - Otherwise the conv_padding of all Conv2d Layers are the same. Default 1. + Otherwise the conv_padding of all Conv2D Layers are the same. Default 1. conv_filter_size (int|list|tuple): The filter size. If filter_size is a list or tuple, its length must be equal to the length of conv_num_filter. - Otherwise the conv_filter_size of all Conv2d Layers are the same. Default 3. - conv_act (str): Activation type for Conv2d Layer that is not followed by BatchNorm. + Otherwise the conv_filter_size of all Conv2D Layers are the same. Default 3. + conv_act (str): Activation type for Conv2D Layer that is not followed by BatchNorm. Default: None. - param_attr (ParamAttr): The parameters to the Conv2d Layer. Default: None - conv_with_batchnorm (bool|list): Indicates whether to use BatchNorm after Conv2d Layer. + param_attr (ParamAttr): The parameters to the Conv2D Layer. 
Default: None + conv_with_batchnorm (bool|list): Indicates whether to use BatchNorm after Conv2D Layer. If conv_with_batchnorm is a list, its length must be equal to the length of conv_num_filter. Otherwise, conv_with_batchnorm indicates whether all the - Conv2d Layer follows a BatchNorm. Default False. + Conv2D Layer follows a BatchNorm. Default False. conv_batchnorm_drop_rate (float|list): Indicates the drop_rate of Dropout Layer after BatchNorm. If conv_batchnorm_drop_rate is a list, its length must be equal to the length of conv_num_filter. Otherwise, drop_rate of all Dropout Layers is conv_batchnorm_drop_rate. Default 0.0. - pool_stride (int|list|tuple): The pooling stride of Pool2d layer. If pool_stride + pool_stride (int|list|tuple): The pooling stride of Pool2D layer. If pool_stride is a list or tuple, it must contain two integers, (pooling_stride_H, pooling_stride_W). Otherwise, the pooling_stride_H = pooling_stride_W = pool_stride. Default 1. @@ -192,12 +194,15 @@ def img_conv_group(input, Return: A Variable holding Tensor representing the final result after serial computation using Convolution2d, - BatchNorm, DropOut, and Pool2d, whose data type is the same with input. + BatchNorm, DropOut, and Pool2D, whose data type is the same with input. Examples: .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() + img = fluid.data(name='img', shape=[None, 1, 28, 28], dtype='float32') conv_pool = fluid.nets.img_conv_group(input=img, conv_padding=1, @@ -261,7 +266,7 @@ def sequence_conv_pool(input, pool_type="max", bias_attr=None): """ - :api_attr: Static Graph + :api_attr: Static Graph **This api takes input as an LoDTensor. If input is a Tensor, please use** :ref:`api_fluid_nets_simple_img_conv_pool` **instead** @@ -300,6 +305,8 @@ def sequence_conv_pool(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() input_dim = 100 #len(word_dict) emb_dim = 128 hid_dim = 512 @@ -327,7 +334,7 @@ def sequence_conv_pool(input, def glu(input, dim=-1): """ - :api_attr: Static Graph + :api_attr: Static Graph The Gated Linear Units(GLU) composed by :ref:`api_fluid_layers_split` , :ref:`api_fluid_layers_sigmoid` and :ref:`api_fluid_layers_elementwise_mul` . @@ -356,6 +363,9 @@ def glu(input, dim=-1): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() + data = fluid.data( name="words", shape=[-1, 6, 3, 9], dtype="float32") # shape of output: [-1, 3, 3, 9] @@ -375,7 +385,7 @@ def scaled_dot_product_attention(queries, num_heads=1, dropout_rate=0.): """ - :api_attr: Static Graph + :api_attr: Static Graph This interface Multi-Head Attention using scaled dot product. Attention mechanism can be seen as mapping a query and a set of key-value @@ -435,7 +445,9 @@ def scaled_dot_product_attention(queries, .. 
code-block:: python import paddle.fluid as fluid - + import paddle + paddle.enable_static() + queries = fluid.data(name="queries", shape=[3, 5, 9], dtype="float32") keys = fluid.data(name="keys", shape=[3, 6, 9], dtype="float32") values = fluid.data(name="values", shape=[3, 6, 10], dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py index c4f5cc9e2bcbc..f69abb1e37669 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_bmn.py @@ -564,7 +564,7 @@ def train_bmn(args, place, to_static): loss_data = [] with fluid.dygraph.guard(place): - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) global local_random local_random = np.random.RandomState(SEED) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py index c9bc8cc647df3..63da7c2b1795d 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lac.py @@ -450,7 +450,7 @@ def do_train(args, to_static): place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( ) else fluid.CPUPlace() with fluid.dygraph.guard(place): - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) reader = get_random_input_data(args.batch_size, args.vocab_size, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py index a086bf1455a81..30c1955adcf9f 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_mobile_net.py @@ -451,7 +451,7 @@ def train_mobilenet(args, to_static): with fluid.dygraph.guard(args.place): np.random.seed(SEED) - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) if args.model == "MobileNetV1": diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py index 61e1614c3ac0d..ea0529ffb28d4 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm.py @@ -218,7 +218,7 @@ def train(place): batch_num = 200 with fluid.dygraph.guard(place): - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) ptb_model = PtbModel( hidden_size=hidden_size, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py index 2c74e5b221f7e..0d45d7edb2742 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_ptb_lm_v2.py @@ -210,7 +210,7 @@ def train(place): batch_num = 200 paddle.disable_static(place) - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) ptb_model = PtbModel( hidden_size=hidden_size, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py index 1d211197ebd48..c127e5882b538 
100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_reinforcement_learning.py @@ -65,7 +65,7 @@ def train(args, place, to_static): env.seed(SEED) with fluid.dygraph.guard(place): - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) local_random = np.random.RandomState(SEED) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py index 095940d79eac6..dcc323d0644be 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet.py @@ -219,7 +219,7 @@ def train(to_static): """ with fluid.dygraph.guard(place): np.random.seed(SEED) - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) train_reader = paddle.batch( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py index 88c55f190768d..10346ab0cc442 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_resnet_v2.py @@ -66,7 +66,7 @@ def __init__(self, act=None): super(ConvBNLayer, self).__init__() - self._conv = paddle.nn.Conv2d( + self._conv = paddle.nn.Conv2D( in_channels=num_channels, out_channels=num_filters, kernel_size=filter_size, @@ -214,7 +214,7 @@ def train(to_static): """ paddle.disable_static(place) np.random.seed(SEED) - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) train_reader = paddle.batch( diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py index 15cff501838a1..eb17264977f50 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_se_resnet.py @@ -334,7 +334,7 @@ def train(train_reader, to_static): np.random.seed(SEED) with fluid.dygraph.guard(place): - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) se_resnext = SeResNeXt() optimizer = optimizer_setting(train_parameters, se_resnext.parameters()) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py index 2aa3396fb7f85..db03bb9b33cc8 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_sentiment.py @@ -286,7 +286,7 @@ def train(args, to_static): with fluid.dygraph.guard(place): np.random.seed(SEED) - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) train_reader = fake_data_reader(args.class_num, args.vocab_size, diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py index 14b9ac2e99584..01e9ed07efa83 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet.py @@ -108,7 +108,7 @@ def train(conf_dict, to_static): place = fluid.CPUPlace() with fluid.dygraph.guard(place): - 
paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) conf_dict['dict_size'] = len(vocab) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py index 284087e61ec64..872d419ff8928 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_simnet_v2.py @@ -106,7 +106,7 @@ def train(conf_dict, to_static): place = paddle.CPUPlace() paddle.disable_static(place) - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) conf_dict['dict_size'] = len(vocab) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py index 6721e7a51d2bc..451ceea75c094 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_transformer.py @@ -33,7 +33,7 @@ def train_static(args, batch_generator): paddle.enable_static() - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) train_prog = fluid.Program() startup_prog = fluid.Program() @@ -131,7 +131,7 @@ def train_static(args, batch_generator): def train_dygraph(args, batch_generator): with fluid.dygraph.guard(place): if SEED is not None: - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) # define data loader train_loader = fluid.io.DataLoader.from_generator(capacity=10) @@ -223,7 +223,7 @@ def train_dygraph(args, batch_generator): def predict_dygraph(args, batch_generator): with fluid.dygraph.guard(place): - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) # define data loader @@ -295,7 +295,7 @@ def predict_dygraph(args, batch_generator): def predict_static(args, batch_generator): test_prog = fluid.Program() with fluid.program_guard(test_prog): - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) # define input and reader diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py index bedca412157f0..c9d4bb2e79dee 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_tsm.py @@ -272,7 +272,7 @@ def train(args, fake_data_reader, to_static): random.seed(0) np.random.seed(0) with fluid.dygraph.guard(place): - paddle.manual_seed(1000) + paddle.seed(1000) paddle.framework.random._manual_program_seed(1000) video_model = TSM_ResNet("TSM", train_config, 'Train') diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py index 0311eb887adf3..efd0e95dd384f 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py @@ -20,7 +20,7 @@ import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 -from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp +from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2DOp def conv2d_residual_naive(out, 
residual): @@ -31,7 +31,7 @@ def conv2d_residual_naive(out, residual): @unittest.skipIf(not core.supports_bfloat16(), "place does not support BF16 evaluation") -class TestConv2dBf16Op(TestConv2dOp): +class TestConv2DBf16Op(TestConv2DOp): def setUp(self): self.op_type = "conv2d" self.use_cudnn = False @@ -110,7 +110,7 @@ def test_check_grad_no_input(self): pass def init_test_case(self): - TestConv2dOp.init_test_case(self) + TestConv2DOp.init_test_case(self) self.input_size = [1, 1, 5, 5] # NCHW f_c = self.input_size[1] // self.groups self.input_residual_size = [1, 2, 3, 3] @@ -130,7 +130,7 @@ def init_fuse_residual(self): self.fuse_residual = True -class TestConv2d(TestConv2dBf16Op): +class TestConv2D(TestConv2DBf16Op): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] @@ -144,19 +144,19 @@ def init_data_type(self): self.input_type = np.uint16 -class TestWithPad(TestConv2d): +class TestWithPad(TestConv2D): def init_test_case(self): - TestConv2d.init_test_case(self) + TestConv2D.init_test_case(self) self.pad = [1, 1] self.input_residual_size = [2, 6, 5, 5] -class TestWithGroup(TestConv2d): +class TestWithGroup(TestConv2D): def init_group(self): self.groups = 3 -class TestWithStride(TestConv2dBf16Op): +class TestWithStride(TestConv2DBf16Op): def init_test_case(self): self.pad = [1, 1] self.stride = [2, 2] @@ -170,7 +170,7 @@ def init_data_type(self): self.input_type = np.uint16 -class TestWithDilations(TestConv2dBf16Op): +class TestWithDilations(TestConv2DBf16Op): def init_test_case(self): self.pad = [1, 1] self.stride = [1, 1] @@ -185,7 +185,7 @@ def init_data_type(self): self.input_type = np.uint16 -class TestWith1x1ForceFP32Output(TestConv2dBf16Op): +class TestWith1x1ForceFP32Output(TestConv2DBf16Op): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] @@ -201,7 +201,7 @@ def init_fuse_residual(self): self.fuse_residual = False -class TestWithInput1x1Filter1x1(TestConv2dBf16Op): +class TestWithInput1x1Filter1x1(TestConv2DBf16Op): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py index 388eb38fc6e67..88f1fb7fd2d44 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_int8_mkldnn_op.py @@ -19,7 +19,7 @@ import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest -from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2dOp +from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive, TestConv2DOp def conv2d_forward_refer(input, filter, group, conv_param): @@ -28,7 +28,7 @@ def conv2d_forward_refer(input, filter, group, conv_param): return out -class TestConv2dInt8Op(TestConv2dOp): +class TestConv2DInt8Op(TestConv2DOp): def setUp(self): self.op_type = "conv2d" self.use_cudnn = False @@ -162,7 +162,7 @@ def test_check_grad_no_input(self): pass def init_test_case(self): - TestConv2dOp.init_test_case(self) + TestConv2DOp.init_test_case(self) self.input_size = [1, 1, 5, 5] # NCHW f_c = self.input_size[1] // self.groups self.input_residual_size = [1, 2, 3, 3] @@ -186,7 +186,7 @@ def init_fuse_residual(self): #--------------------test conv2d u8 in and u8 out with residual fuse-------------------- -class TestConv2d(TestConv2dInt8Op): +class TestConv2D(TestConv2DInt8Op): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] 
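(The MKL-DNN test diffs above and below are mechanical class renames. Each file follows the same derive-by-subclassing layout, where a base test class drives the run and variants only override init_* hooks, so renaming the base class is the only change these hunks make. A condensed sketch of that layout, simplified to plain unittest with invented assertions rather than code from the patch:)

    import unittest

    class TestConv2DInt8Op(unittest.TestCase):  # renamed from TestConv2dInt8Op
        def setUp(self):
            self.init_test_case()               # hook that variants override

        def init_test_case(self):
            self.stride = [1, 1]

        def test_stride_rank(self):
            self.assertEqual(len(self.stride), 2)

    class TestWithStride(TestConv2DInt8Op):     # variant changes config only
        def init_test_case(self):
            self.stride = [2, 2]

    if __name__ == '__main__':
        unittest.main()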
@@ -201,19 +201,19 @@ def init_test_case(self): self.scale_in_eltwise = 0.6 -class TestWithPad(TestConv2d): +class TestWithPad(TestConv2D): def init_test_case(self): - TestConv2d.init_test_case(self) + TestConv2D.init_test_case(self) self.pad = [1, 1] self.input_residual_size = [2, 6, 5, 5] -class TestWithGroup(TestConv2d): +class TestWithGroup(TestConv2D): def init_group(self): self.groups = 3 -class TestWithStride(TestConv2dInt8Op): +class TestWithStride(TestConv2DInt8Op): def init_test_case(self): self.pad = [1, 1] self.stride = [2, 2] @@ -228,7 +228,7 @@ def init_test_case(self): self.scale_in_eltwise = 0.5 -class TestWithDilations(TestConv2dInt8Op): +class TestWithDilations(TestConv2DInt8Op): def init_test_case(self): self.pad = [1, 1] self.stride = [1, 1] @@ -244,7 +244,7 @@ def init_test_case(self): self.scale_in_eltwise = 0.5 -class TestWith1x1(TestConv2dInt8Op): +class TestWith1x1(TestConv2DInt8Op): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] @@ -259,7 +259,7 @@ def init_test_case(self): self.scale_in_eltwise = 0.5 -class TestWithInput1x1Filter1x1(TestConv2dInt8Op): +class TestWithInput1x1Filter1x1(TestConv2DInt8Op): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] @@ -356,7 +356,7 @@ def init_data_type(self): globals()[cls_name_u8s8_re_1] = TestU8S8ResCase -create_test_int8_class(TestConv2dInt8Op) +create_test_int8_class(TestConv2DInt8Op) create_test_int8_class(TestWithPad) create_test_int8_class(TestWithStride) create_test_int8_class(TestWithDilations) @@ -365,7 +365,7 @@ def init_data_type(self): create_test_int8_class(TestWithInput1x1Filter1x1) -class TestConv2dOp_AsyPadding_INT_MKLDNN(TestConv2dInt8Op): +class TestConv2DOp_AsyPadding_INT_MKLDNN(TestConv2DInt8Op): def init_kernel_type(self): self.use_mkldnn = True @@ -374,13 +374,13 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -class TestConv2dOp_Same_INT_MKLDNN(TestConv2dOp_AsyPadding_INT_MKLDNN): +class TestConv2DOp_Same_INT_MKLDNN(TestConv2DOp_AsyPadding_INT_MKLDNN): def init_paddings(self): self.pad = [0, 0] self.padding_algorithm = "SAME" -class TestConv2dOp_Valid_INT_MKLDNN(TestConv2dOp_AsyPadding_INT_MKLDNN): +class TestConv2DOp_Valid_INT_MKLDNN(TestConv2DOp_AsyPadding_INT_MKLDNN): def init_paddings(self): self.pad = [1, 1] self.padding_algorithm = "VALID" diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index 6fad98874e077..eb906684f0fb1 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -19,7 +19,7 @@ import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest, skip_check_grad_ci -from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2dOp, TestConv2dOp_v2 +from paddle.fluid.tests.unittests.test_conv2d_op import TestConv2DOp, TestConv2DOp_v2 def conv2d_bias_naive(out, bias): @@ -36,7 +36,7 @@ def conv2d_residual_naive(out, residual): return out -class TestConv2dMKLDNNOp(TestConv2dOp): +class TestConv2DMKLDNNOp(TestConv2DOp): def init_group(self): self.groups = 1 @@ -64,7 +64,7 @@ def setUp(self): self.fuse_residual_connection = False self.input_residual_size = None - TestConv2dOp.setUp(self) + TestConv2DOp.setUp(self) output = self.outputs['Output'] @@ -106,9 +106,9 @@ def setUp(self): @skip_check_grad_ci( reason="Fusion is for inference only, check_grad is not required.") -class TestWithbreluFusion(TestConv2dMKLDNNOp): 
+class TestWithbreluFusion(TestConv2DMKLDNNOp): def init_test_case(self): - TestConv2dMKLDNNOp.init_test_case(self) + TestConv2DMKLDNNOp.init_test_case(self) self.fuse_activation = "relu6" self.fuse_alpha = 6.0 self.dsttype = np.float32 @@ -116,9 +116,9 @@ def init_test_case(self): @skip_check_grad_ci( reason="Fusion is for inference only, check_grad is not required.") -class TestWithFuse(TestConv2dMKLDNNOp): +class TestWithFuse(TestConv2DMKLDNNOp): def init_test_case(self): - TestConv2dMKLDNNOp.init_test_case(self) + TestConv2DMKLDNNOp.init_test_case(self) self.pad = [1, 1] self.fuse_bias = True self.bias_size = [6] @@ -126,22 +126,22 @@ def init_test_case(self): self.input_residual_size = [2, 6, 5, 5] -class TestWithPadWithBias(TestConv2dMKLDNNOp): +class TestWithPadWithBias(TestConv2DMKLDNNOp): def init_test_case(self): - TestConv2dMKLDNNOp.init_test_case(self) + TestConv2DMKLDNNOp.init_test_case(self) self.pad = [1, 1] self.input_size = [2, 3, 6, 6] -class TestWithStride(TestConv2dMKLDNNOp): +class TestWithStride(TestConv2DMKLDNNOp): def init_test_case(self): - TestConv2dMKLDNNOp.init_test_case(self) + TestConv2DMKLDNNOp.init_test_case(self) self.pad = [1, 1] self.stride = [2, 2] self.input_size = [2, 3, 6, 6] -class TestWithGroup(TestConv2dMKLDNNOp): +class TestWithGroup(TestConv2DMKLDNNOp): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] @@ -154,15 +154,15 @@ def init_group(self): self.groups = 3 -class TestWith1x1(TestConv2dMKLDNNOp): +class TestWith1x1(TestConv2DMKLDNNOp): def init_test_case(self): - TestConv2dMKLDNNOp.init_test_case(self) + TestConv2DMKLDNNOp.init_test_case(self) self.filter_size = [40, 3, 1, 1] -class TestWithInput1x1Filter1x1(TestConv2dMKLDNNOp): +class TestWithInput1x1Filter1x1(TestConv2DMKLDNNOp): def init_test_case(self): - TestConv2dMKLDNNOp.init_test_case(self) + TestConv2DMKLDNNOp.init_test_case(self) self.input_size = [2, 60, 1, 1] # NCHW assert np.mod(self.input_size[1], self.groups) == 0 f_c = self.input_size[1] // self.groups @@ -172,7 +172,7 @@ def init_group(self): self.groups = 3 -class TestConv2dOp_AsyPadding_MKLDNN(TestConv2dOp_v2): +class TestConv2DOp_AsyPadding_MKLDNN(TestConv2DOp_v2): def init_kernel_type(self): self.use_mkldnn = True self.dtype = np.float32 @@ -182,19 +182,19 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -class TestConv2dOp_Same_MKLDNN(TestConv2dOp_AsyPadding_MKLDNN): +class TestConv2DOp_Same_MKLDNN(TestConv2DOp_AsyPadding_MKLDNN): def init_paddings(self): self.pad = [0, 0] self.padding_algorithm = "SAME" -class TestConv2dOp_Valid_MKLDNN(TestConv2dOp_AsyPadding_MKLDNN): +class TestConv2DOp_Valid_MKLDNN(TestConv2DOp_AsyPadding_MKLDNN): def init_paddings(self): self.pad = [1, 1] self.padding_algorithm = "VALID" -class TestConv2dOp_Valid_NHWC_MKLDNN(TestConv2dOp_Valid_MKLDNN): +class TestConv2DOp_Valid_NHWC_MKLDNN(TestConv2DOp_Valid_MKLDNN): def init_data_format(self): self.data_format = "NHWC" @@ -203,21 +203,21 @@ def init_test_case_2(self): self.input_size = [N, H, W, C] -class TestConv2dOp_Same_NHWC_MKLDNN(TestConv2dOp_Valid_NHWC_MKLDNN): +class TestConv2DOp_Same_NHWC_MKLDNN(TestConv2DOp_Valid_NHWC_MKLDNN): def init_paddings(self): self.pad = [0, 0] self.padding_algorithm = "SAME" -class TestConv2dOp_AsyPadding_NHWC_MKLDNN(TestConv2dOp_Valid_NHWC_MKLDNN): +class TestConv2DOp_AsyPadding_NHWC_MKLDNN(TestConv2DOp_Valid_NHWC_MKLDNN): def init_paddings(self): self.pad = [0, 0, 1, 2] self.padding_algorithm = "EXPLICIT" -class TestMKLDNNDilations(TestConv2dMKLDNNOp): +class 
TestMKLDNNDilations(TestConv2DMKLDNNOp): def init_test_case(self): - TestConv2dMKLDNNOp.init_test_case(self) + TestConv2DMKLDNNOp.init_test_case(self) self.pad = [0, 0] self.stride = [1, 1] self.input_size = [2, 3, 10, 10] # NCHW diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py index 1f68c35ec2b03..7da274917a503 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_transpose_mkldnn_op.py @@ -19,7 +19,7 @@ import paddle.fluid.core as core from paddle.fluid.tests.unittests.op_test import OpTest -from paddle.fluid.tests.unittests.test_conv2d_transpose_op import conv2dtranspose_forward_naive, TestConv2dTransposeOp +from paddle.fluid.tests.unittests.test_conv2d_transpose_op import conv2dtranspose_forward_naive, TestConv2DTransposeOp def conv2d_bias_naive(out, bias): @@ -30,7 +30,7 @@ def conv2d_bias_naive(out, bias): return out -class TestConv2dTransposeMKLDNNOp(TestConv2dTransposeOp): +class TestConv2DTransposeMKLDNNOp(TestConv2DTransposeOp): def test_check_grad(self): return @@ -64,7 +64,7 @@ def init_test_case(self): def setUp(self): - TestConv2dTransposeOp.setUp(self) + TestConv2DTransposeOp.setUp(self) output = self.outputs['Output'] @@ -86,46 +86,46 @@ def setUp(self): self.outputs['Output'] = output -class TestMKLDNNFuseBias(TestConv2dTransposeMKLDNNOp): +class TestMKLDNNFuseBias(TestConv2DTransposeMKLDNNOp): def init_test_case(self): - TestConv2dTransposeMKLDNNOp.init_test_case(self) + TestConv2DTransposeMKLDNNOp.init_test_case(self) self.pad = [1, 1] self.fuse_bias = True self.bias_size = [6] -class TestMKLDNNWithPad(TestConv2dTransposeMKLDNNOp): +class TestMKLDNNWithPad(TestConv2DTransposeMKLDNNOp): def init_test_case(self): - TestConv2dTransposeMKLDNNOp.init_test_case(self) + TestConv2DTransposeMKLDNNOp.init_test_case(self) self.pad = [1, 1] self.input_size = [2, 3, 10, 10] -class TestMKLDNNWithStride(TestConv2dTransposeMKLDNNOp): +class TestMKLDNNWithStride(TestConv2DTransposeMKLDNNOp): def init_test_case(self): - TestConv2dTransposeMKLDNNOp.init_test_case(self) + TestConv2DTransposeMKLDNNOp.init_test_case(self) self.pad = [1, 1] self.stride = [2, 2] self.input_size = [2, 3, 6, 6] # NCHW -class TestMKLDNNWithAsymPad(TestConv2dTransposeMKLDNNOp): +class TestMKLDNNWithAsymPad(TestConv2DTransposeMKLDNNOp): def init_test_case(self): - TestConv2dTransposeMKLDNNOp.init_test_case(self) + TestConv2DTransposeMKLDNNOp.init_test_case(self) self.pad = [0, 0, 1, 2] self.padding_algorithm = "EXPLICIT" -class TestMKLDNNWithSamePad(TestConv2dTransposeMKLDNNOp): +class TestMKLDNNWithSamePad(TestConv2DTransposeMKLDNNOp): def init_test_case(self): - TestConv2dTransposeMKLDNNOp.init_test_case(self) + TestConv2DTransposeMKLDNNOp.init_test_case(self) self.pad = [0, 0] self.padding_algorithm = "SAME" -class TestMKLDNNWithValidPad(TestConv2dTransposeMKLDNNOp): +class TestMKLDNNWithValidPad(TestConv2DTransposeMKLDNNOp): def init_test_case(self): - TestConv2dTransposeMKLDNNOp.init_test_case(self) + TestConv2DTransposeMKLDNNOp.init_test_case(self) self.pad = [1, 1] self.padding_algorithm = "VALID" @@ -138,10 +138,10 @@ def init_test_case(self): self.input_size = [N, H, W, C] -class TestConv2dTransposeMKLDNNWithDilationsExplicitPad( - TestConv2dTransposeMKLDNNOp): +class TestConv2DTransposeMKLDNNWithDilationsExplicitPad( + TestConv2DTransposeMKLDNNOp): def init_test_case(self): - 
-        TestConv2dTransposeMKLDNNOp.init_test_case(self)
+        TestConv2DTransposeMKLDNNOp.init_test_case(self)
         self.stride = [2, 1]
         self.dilations = [1, 2]
         self.groups = 1
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
index 8f310946db293..ca25b849b4a78 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv3d_mkldnn_op.py
@@ -16,10 +16,10 @@
 import unittest
 import numpy as np
 
-from paddle.fluid.tests.unittests.test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1, TestConv3dOp_2
+from paddle.fluid.tests.unittests.test_conv3d_op import TestConv3DOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1, TestConv3DOp_2
 
 
-class TestMKLDNN(TestConv3dOp):
+class TestMKLDNN(TestConv3DOp):
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.data_format = "NCHW"
@@ -61,7 +61,7 @@ def init_kernel_type(self):
         self.dtype = np.float32
 
 
-class TestConv3dOp_AsyPadding_MKLDNN(TestConv3dOp):
+class TestConv3DOp_AsyPadding_MKLDNN(TestConv3DOp):
     def init_kernel_type(self):
         self.use_mkldnn = True
         self.data_format = "NCHW"
@@ -72,7 +72,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestConv3dOp_Same_MKLDNN(TestConv3dOp_AsyPadding_MKLDNN):
+class TestConv3DOp_Same_MKLDNN(TestConv3DOp_AsyPadding_MKLDNN):
     def init_paddings(self):
         self.pad = [0, 0, 0]
         self.padding_algorithm = "SAME"
@@ -83,7 +83,7 @@ def init_kernel_type(self):
         self.dtype = np.float32
 
 
-class TestConv3dOp_Valid_MKLDNN(TestConv3dOp_AsyPadding_MKLDNN):
+class TestConv3DOp_Valid_MKLDNN(TestConv3DOp_AsyPadding_MKLDNN):
     def init_paddings(self):
         self.pad = [1, 1, 1]
         self.padding_algorithm = "VALID"
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py
index cccc83306bfdd..639cb570a8472 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_pool2d_int8_mkldnn_op.py
@@ -23,7 +23,7 @@
 from paddle.fluid.tests.unittests.test_pool2d_op import TestPool2D_Op, avg_pool2D_forward_naive, max_pool2D_forward_naive
 
 
-class TestPool2dMKLDNNInt8_Op(TestPool2D_Op):
+class TestPool2DMKLDNNInt8_Op(TestPool2D_Op):
     def init_kernel_type(self):
         self.use_mkldnn = True
 
@@ -51,7 +51,7 @@ def test_check_grad(self):
         pass
 
 
-class TestCase1Avg(TestPool2dMKLDNNInt8_Op):
+class TestCase1Avg(TestPool2DMKLDNNInt8_Op):
     def init_test_case(self):
         self.shape = [2, 3, 7, 7]
         self.ksize = [3, 3]
@@ -65,7 +65,7 @@ def init_exclusive(self):
         self.exclusive = True
 
 
-class TestCase2Avg(TestPool2dMKLDNNInt8_Op):
+class TestCase2Avg(TestPool2DMKLDNNInt8_Op):
     def init_test_case(self):
         self.shape = [2, 3, 7, 7]
         self.ksize = [3, 3]
@@ -79,7 +79,7 @@ def init_exclusive(self):
         self.exclusive = False
 
 
-class TestCase0Max(TestPool2dMKLDNNInt8_Op):
+class TestCase0Max(TestPool2DMKLDNNInt8_Op):
     def init_pool_type(self):
         self.pool_type = "max"
         self.pool2D_forward_naive = max_pool2D_forward_naive
@@ -114,7 +114,7 @@ def init_data_type(self):
     globals()[cls_name_u8] = TestU8Case
 
 
-create_test_s8_u8_class(TestPool2dMKLDNNInt8_Op)
+create_test_s8_u8_class(TestPool2DMKLDNNInt8_Op)
 create_test_s8_u8_class(TestCase1Avg)
 create_test_s8_u8_class(TestCase2Avg)
 create_test_s8_u8_class(TestCase0Max)
diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
index b7ef54a5c2a48..dcf5151578ad5 100644
--- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
+++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py
@@ -26,7 +26,7 @@
 import paddle.fluid.dygraph as dygraph
 from paddle.fluid import core
 from paddle.fluid.optimizer import SGDOptimizer
-from paddle.nn import Conv2d, Linear, SyncBatchNorm
+from paddle.nn import Conv2D, Linear, SyncBatchNorm
 from paddle.fluid.dygraph.base import to_variable
 from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase
 
@@ -42,7 +42,7 @@ def __init__(self,
                  act=None):
         super(TestLayer, self).__init__()
 
-        self._conv = Conv2d(
+        self._conv = Conv2D(
             in_channels=num_channels,
             out_channels=num_filters,
             kernel_size=filter_size,
@@ -53,7 +53,7 @@ def __init__(self,
 
         self._sync_batch_norm = SyncBatchNorm(num_filters)
 
-        self._conv2 = Conv2d(
+        self._conv2 = Conv2D(
             in_channels=num_filters,
             out_channels=num_filters,
             kernel_size=filter_size,
diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
index 9c3ed13cbb000..c71e0e3361be1 100644
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
@@ -65,7 +65,7 @@ def run_executor(exe, binary, feed, fetch_list):
             feed_data_reader, FeedDataReader
         ), "feed_data_reader must be type of FeedDataReader"
 
-        paddle.manual_seed(1)
+        paddle.seed(1)
         paddle.framework.random._manual_program_seed(1)
         main = fluid.Program()
         startup = fluid.Program()
diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
index f40065cf8a3d0..2eec265b5d27a 100644
--- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
+++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py
@@ -259,7 +259,7 @@ def test_with_input_lengths(self):
 
     def test_predict(self):
         place = paddle.set_device(self.place)
-        paddle.manual_seed(123)
+        paddle.seed(123)
         np.random.seed(123)
 
         class Net(paddle.nn.Layer):
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py
index 424406c15bb18..47658518551f2 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool1d.py
@@ -72,7 +72,7 @@ def avg_pool1D_forward_naive(x,
     return out
 
 
-class TestPool1d_API(unittest.TestCase):
+class TestPool1D_API(unittest.TestCase):
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -89,7 +89,7 @@ def check_adaptive_avg_dygraph_results(self, place):
 
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            ada_max_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1d(
+            ada_max_pool1d_dg = paddle.nn.layer.AdaptiveAvgPool1D(
                 output_size=16)
             result = ada_max_pool1d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
index 25692808d090b..2b104041f9468 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool2d.py
@@ -84,7 +84,7 @@ def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
     return out
 
 
-class TestAdaptiveAvgPool2dAPI(unittest.TestCase):
+class TestAdaptiveAvgPool2DAPI(unittest.TestCase):
     def setUp(self):
         self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
         self.res_1_np = adaptive_pool2d_forward(
@@ -179,7 +179,7 @@ def test_dynamic_graph(self):
         assert np.allclose(out_6.numpy(), self.res_3_np)
 
 
-class TestAdaptiveAvgPool2dClassAPI(unittest.TestCase):
+class TestAdaptiveAvgPool2DClassAPI(unittest.TestCase):
     def setUp(self):
         self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
         self.res_1_np = adaptive_pool2d_forward(
@@ -207,20 +207,20 @@ def test_static_graph(self):
             paddle.enable_static()
             x = paddle.fluid.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=[3, 3])
             out_1 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=5)
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=5)
             out_2 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[2, 5])
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=[2, 5])
             out_3 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(
                 output_size=[3, 3], data_format="NHWC")
             out_4 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(
                 output_size=[None, 3])
             out_5 = adaptive_avg_pool(x=x)
 
@@ -247,20 +247,20 @@ def test_dynamic_graph(self):
             paddle.disable_static(place=place)
             x = paddle.to_tensor(self.x_np)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[3, 3])
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=[3, 3])
             out_1 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=5)
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=5)
             out_2 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=[2, 5])
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=[2, 5])
             out_3 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(
                 output_size=[3, 3], data_format="NHWC")
             out_4 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(
                 output_size=[None, 3])
             out_5 = adaptive_avg_pool(x=x)
 
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
index ce85f6bf9fbed..deb45da8a0189 100755
--- a/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_avg_pool3d.py
@@ -99,7 +99,7 @@ def adaptive_pool3d_forward(x,
     return out
 
 
-class TestAdaptiveAvgPool3dAPI(unittest.TestCase):
+class TestAdaptiveAvgPool3DAPI(unittest.TestCase):
     def setUp(self):
         self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
         self.res_1_np = adaptive_pool3d_forward(
@@ -125,7 +125,8 @@ def test_static_graph(self):
                 if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.enable_static()
-            x = paddle.fluid.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+            x = paddle.fluid.data(
+                name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
 
             out_1 = paddle.nn.functional.adaptive_avg_pool3d(
                 x=x, output_size=[3, 3, 3])
@@ -194,7 +195,7 @@ def test_dynamic_graph(self):
         assert np.allclose(out_6.numpy(), self.res_3_np)
 
 
-class TestAdaptiveAvgPool3dClassAPI(unittest.TestCase):
+class TestAdaptiveAvgPool3DClassAPI(unittest.TestCase):
     def setUp(self):
         self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
         self.res_1_np = adaptive_pool3d_forward(
@@ -220,24 +221,25 @@ def test_static_graph(self):
                 if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.enable_static()
-            x = paddle.fluid.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+            x = paddle.fluid.data(
+                name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(
                 output_size=[3, 3, 3])
             out_1 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=5)
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(output_size=5)
             out_2 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(
                 output_size=[2, 3, 5])
             out_3 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(
                 output_size=[3, 3, 3], data_format="NDHWC")
             out_4 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(
                 output_size=[None, 3, None])
             out_5 = adaptive_avg_pool(x=x)
 
@@ -264,22 +266,22 @@ def test_dynamic_graph(self):
             paddle.disable_static(place=place)
             x = paddle.to_tensor(self.x_np)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(
                 output_size=[3, 3, 3])
             out_1 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=5)
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(output_size=5)
             out_2 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(
                 output_size=[2, 3, 5])
             out_3 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(
                 output_size=[3, 3, 3], data_format="NDHWC")
             out_4 = adaptive_avg_pool(x=x)
 
-            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(
+            adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(
                 output_size=[None, 3, None])
             out_5 = adaptive_avg_pool(x=x)
 
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py
index 875fdf9e9c3f9..57fe91a818eab 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool1d.py
@@ -63,7 +63,7 @@ def max_pool1D_forward_naive(x,
     return out
 
 
-class TestPool1d_API(unittest.TestCase):
+class TestPool1D_API(unittest.TestCase):
     def setUp(self):
         np.random.seed(123)
         self.places = [fluid.CPUPlace()]
@@ -80,7 +80,7 @@ def check_adaptive_max_dygraph_results(self, place):
                 input_np, ksize=[16], strides=[0], paddings=[0], adaptive=True)
             self.assertTrue(np.allclose(result.numpy(), result_np))
 
-            ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1d(
+            ada_max_pool1d_dg = paddle.nn.layer.AdaptiveMaxPool1D(
                 output_size=16)
             result = ada_max_pool1d_dg(input)
             self.assertTrue(np.allclose(result.numpy(), result_np))
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
index 14de5aa53a5f5..944725fab6435 100644
--- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py
@@ -84,7 +84,7 @@ def adaptive_pool2d_forward(x, output_size, data_format='NCHW',
     return out
 
 
-class TestAdaptiveMaxPool2dAPI(unittest.TestCase):
+class TestAdaptiveMaxPool2DAPI(unittest.TestCase):
     def setUp(self):
         self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
         self.res_1_np = adaptive_pool2d_forward(
@@ -174,7 +174,7 @@ def test_dynamic_graph(self):
         assert np.allclose(out_5.numpy(), self.res_5_np)
 
 
-class TestAdaptiveMaxPool2dClassAPI(unittest.TestCase):
+class TestAdaptiveMaxPool2DClassAPI(unittest.TestCase):
     def setUp(self):
         self.x_np = np.random.random([2, 3, 7, 7]).astype("float32")
         self.res_1_np = adaptive_pool2d_forward(
@@ -202,20 +202,20 @@ def test_static_graph(self):
             paddle.enable_static()
             x = paddle.fluid.data(name="x", shape=[2, 3, 7, 7], dtype="float32")
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3])
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=[3, 3])
             out_1 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=5)
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=5)
             out_2 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[2, 5])
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=[2, 5])
             out_3 = adaptive_max_pool(x=x)
 
-            # adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
+            # adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(
             #     output_size=[3, 3], data_format="NHWC")
             # out_4 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(
                 output_size=[None, 3])
             out_5 = adaptive_max_pool(x=x)
 
@@ -242,20 +242,20 @@ def test_dynamic_graph(self):
             paddle.disable_static(place=place)
             x = paddle.to_tensor(self.x_np)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[3, 3])
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=[3, 3])
             out_1 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=5)
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=5)
             out_2 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=[2, 5])
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=[2, 5])
             out_3 = adaptive_max_pool(x=x)
 
-            #adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
+            #adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(
             #     output_size=[3, 3], data_format="NHWC")
             #out_4 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(
                 output_size=[None, 3])
             out_5 = adaptive_max_pool(x=x)
 
diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py
index 0aa97bdf1caf9..65e0738a99aea 100755
--- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py
+++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool3d.py
@@ -99,7 +99,7 @@ def adaptive_pool3d_forward(x,
     return out
 
 
-class TestAdaptiveMaxPool3dAPI(unittest.TestCase):
+class TestAdaptiveMaxPool3DAPI(unittest.TestCase):
     def setUp(self):
         self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
         self.res_1_np = adaptive_pool3d_forward(
@@ -125,7 +125,8 @@ def test_static_graph(self):
                 if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.enable_static()
-            x = paddle.fluid.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+            x = paddle.fluid.data(
+                name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
 
             out_1 = paddle.nn.functional.adaptive_max_pool3d(
                 x=x, output_size=[3, 3, 3])
@@ -189,7 +190,7 @@ def test_dynamic_graph(self):
         assert np.allclose(out_5.numpy(), self.res_5_np)
 
 
-class TestAdaptiveMaxPool3dClassAPI(unittest.TestCase):
+class TestAdaptiveMaxPool3DClassAPI(unittest.TestCase):
     def setUp(self):
         self.x_np = np.random.random([2, 3, 5, 7, 7]).astype("float32")
         self.res_1_np = adaptive_pool3d_forward(
@@ -215,24 +216,25 @@ def test_static_graph(self):
                 if core.is_compiled_with_cuda() else [False]):
             place = paddle.CUDAPlace(0) if use_cuda else paddle.CPUPlace()
             paddle.enable_static()
-            x = paddle.fluid.data(name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
+            x = paddle.fluid.data(
+                name="x", shape=[2, 3, 5, 7, 7], dtype="float32")
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3D(
                 output_size=[3, 3, 3])
             out_1 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(output_size=5)
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3D(output_size=5)
             out_2 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3D(
                 output_size=[2, 3, 5])
             out_3 = adaptive_max_pool(x=x)
 
-            # adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+            # adaptive_max_pool = paddle.nn.AdaptiveMaxPool3D(
             #     output_size=[3, 3, 3], data_format="NDHWC")
             # out_4 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3D(
                 output_size=[None, 3, None])
             out_5 = adaptive_max_pool(x=x)
 
@@ -259,22 +261,22 @@ def test_dynamic_graph(self):
             paddle.disable_static(place=place)
             x = paddle.to_tensor(self.x_np)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3D(
                 output_size=[3, 3, 3])
             out_1 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(output_size=5)
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3D(output_size=5)
             out_2 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3D(
                 output_size=[2, 3, 5])
             out_3 = adaptive_max_pool(x=x)
 
-            # adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+            # adaptive_max_pool = paddle.nn.AdaptiveMaxPool3D(
             #     output_size=[3, 3, 3], data_format="NDHWC")
             # out_4 = adaptive_max_pool(x=x)
 
-            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3d(
+            adaptive_max_pool = paddle.nn.AdaptiveMaxPool3D(
                 output_size=[None, 3, None])
             out_5 = adaptive_max_pool(x=x)
 
diff --git a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
index 324d4cf711036..81189619197a5 100644
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op_v2.py
@@ -32,7 +32,7 @@ def test_name(self):
             places.append(fluid.CUDAPlace(0))
         for p in places:
             with fluid.dygraph.guard(p):
-                batch_norm1d = paddle.nn.BatchNorm1d(1, name="test")
+                batch_norm1d = paddle.nn.BatchNorm1D(1, name="test")
 
     def test_error(self):
         places = [fluid.CPUPlace()]
@@ -45,32 +45,32 @@ def test_error(self):
 
             def error1d_dataformat():
                 x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
-                batch_norm1d = paddle.nn.BatchNorm1d(1, data_format='NCDHW')
+                batch_norm1d = paddle.nn.BatchNorm1D(1, data_format='NCDHW')
                 batch_norm1d(fluid.dygraph.to_variable(x_data_4))
 
             def error2d_dataformat():
                 x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
-                batch_norm2d = paddle.nn.BatchNorm2d(1, data_format='NCDHW')
+                batch_norm2d = paddle.nn.BatchNorm2D(1, data_format='NCDHW')
                 batch_norm2d(fluid.dygraph.to_variable(x_data_3))
 
             def error3d_dataformat():
                 x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
-                batch_norm3d = paddle.nn.BatchNorm3d(1, data_format='NCL')
+                batch_norm3d = paddle.nn.BatchNorm3D(1, data_format='NCL')
                 batch_norm3d(fluid.dygraph.to_variable(x_data_4))
 
             def error1d():
                 x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
-                batch_norm1d = paddle.nn.BatchNorm1d(1)
+                batch_norm1d = paddle.nn.BatchNorm1D(1)
                 batch_norm1d(fluid.dygraph.to_variable(x_data_4))
 
             def error2d():
                 x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32')
-                batch_norm2d = paddle.nn.BatchNorm2d(1)
+                batch_norm2d = paddle.nn.BatchNorm2D(1)
                 batch_norm2d(fluid.dygraph.to_variable(x_data_3))
 
             def error3d():
                 x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32')
-                batch_norm3d = paddle.nn.BatchNorm3d(1)
+                batch_norm3d = paddle.nn.BatchNorm3D(1)
                 batch_norm3d(fluid.dygraph.to_variable(x_data_4))
 
             with fluid.dygraph.guard(p):
@@ -99,7 +99,7 @@ def compute_v1(x, is_test, trainable_statistics):
 
         def compute_v2(x):
             with fluid.dygraph.guard(p):
-                bn = paddle.nn.BatchNorm2d(shape[1])
+                bn = paddle.nn.BatchNorm2D(shape[1])
                 y = bn(fluid.dygraph.to_variable(x))
             return y.numpy()
 
@@ -120,7 +120,7 @@ def compute_v3(x, is_test, trainable_statistics):
 
         def compute_v4(x):
             with fluid.dygraph.guard(p):
-                bn = paddle.nn.BatchNorm2d(
+                bn = paddle.nn.BatchNorm2D(
                     shape[1], weight_attr=False, bias_attr=False)
                 y = bn(fluid.dygraph.to_variable(x))
             return y.numpy()
@@ -155,7 +155,7 @@ def compute_v1(x_np, is_test, trainable_statistics):
 
         def compute_v2(x_np):
             with program_guard(Program(), Program()):
-                bn = paddle.nn.BatchNorm2d(shape[1])
+                bn = paddle.nn.BatchNorm2D(shape[1])
                 x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype)
                 y = bn(x)
                 exe.run(fluid.default_startup_program())
@@ -183,8 +183,8 @@ def test_1d(self):
         for p in self.places:
             with fluid.dygraph.guard(p):
                 x = paddle.randn([2, 6, 4])
-                net1 = paddle.nn.BatchNorm1d(4, data_format="NLC")
-                net2 = paddle.nn.BatchNorm1d(4)
+                net1 = paddle.nn.BatchNorm1D(4, data_format="NLC")
+                net2 = paddle.nn.BatchNorm1D(4)
                 net2.weight = net1.weight
                 net2.bias = net1.bias
                 y1 = net1(x)
@@ -197,8 +197,8 @@ def test_2d(self):
         for p in self.places:
             with fluid.dygraph.guard(p):
                 x = paddle.randn([2, 6, 6, 4])
-                net1 = paddle.nn.BatchNorm2d(4, data_format="NHWC")
-                net2 = paddle.nn.BatchNorm2d(4)
+                net1 = paddle.nn.BatchNorm2D(4, data_format="NHWC")
+                net2 = paddle.nn.BatchNorm2D(4)
                 net2.weight = net1.weight
                 net2.bias = net1.bias
                 y1 = net1(x)
@@ -211,8 +211,8 @@ def test_3d(self):
         for p in self.places:
             with fluid.dygraph.guard(p):
                 x = paddle.randn([2, 6, 6, 6, 4])
-                net1 = paddle.nn.BatchNorm3d(4, data_format="NDHWC")
-                net2 = paddle.nn.BatchNorm3d(4)
+                net1 = paddle.nn.BatchNorm3D(4, data_format="NDHWC")
+                net2 = paddle.nn.BatchNorm3D(4)
                 net2.weight = net1.weight
                 net2.bias = net1.bias
                 y1 = net1(x)
diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
index 7bdfa3d2dfd74..4b1a54d3c66a1 100644
--- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
+++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py
@@ -47,7 +47,7 @@ def setUp(self):
 
     def build_program_and_scope(self):
         self.place = fluid.CUDAPlace(0) if self.use_cuda else fluid.CPUPlace()
-        paddle.manual_seed(1)
+        paddle.seed(1)
         paddle.framework.random._manual_program_seed(1)
         startup_program = fluid.Program()
         main_program = fluid.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_compiled_program.py b/python/paddle/fluid/tests/unittests/test_compiled_program.py
index 751fed2e56126..79ee383f3f9ef 100644
--- a/python/paddle/fluid/tests/unittests/test_compiled_program.py
+++ b/python/paddle/fluid/tests/unittests/test_compiled_program.py
@@ -30,7 +30,7 @@ def setUp(self):
         self.label = np.random.randint(
             low=0, high=10, size=[16, 1], dtype=np.int64)
         with new_program_scope():
-            paddle.manual_seed(self.seed)
+            paddle.seed(self.seed)
             paddle.framework.random._manual_program_seed(self.seed)
             place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else fluid.CPUPlace()
@@ -47,7 +47,7 @@ def setUp(self):
 
     def test_compiled_program_base(self):
         with new_program_scope():
-            paddle.manual_seed(self.seed)
+            paddle.seed(self.seed)
             paddle.framework.random._manual_program_seed(self.seed)
             place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else fluid.CPUPlace()
@@ -65,7 +65,7 @@ def test_compiled_program_base(self):
 
     def test_compiled_program_with_data_parallel(self):
         with new_program_scope():
-            paddle.manual_seed(self.seed)
+            paddle.seed(self.seed)
             paddle.framework.random._manual_program_seed(self.seed)
             place = fluid.CUDAPlace(0) if core.is_compiled_with_cuda(
             ) else fluid.CPUPlace()
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
index 35fce9e9d6ba9..fc0a64b18a7af 100644
--- a/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_layer.py
@@ -21,7 +21,7 @@
 import unittest
 
 
-class Conv1dTestCase(unittest.TestCase):
+class Conv1DTestCase(unittest.TestCase):
     def __init__(self,
                  methodName='runTest',
                  batch_size=4,
@@ -37,7 +37,7 @@ def __init__(self,
                  no_bias=False,
                  dtype="float32",
                  data_format="NCL"):
-        super(Conv1dTestCase, self).__init__(methodName)
+        super(Conv1DTestCase, self).__init__(methodName)
         self.batch_size = batch_size
         self.num_channels = num_channels
         self.num_filters = num_filters
@@ -107,7 +107,7 @@ def functional(self, place):
 
     def paddle_nn_layer(self):
         x_var = paddle.to_tensor(self.input)
-        conv = nn.Conv1d(
+        conv = nn.Conv1D(
             self.num_channels,
             self.num_filters,
             self.filter_size,
@@ -139,7 +139,7 @@ def runTest(self):
                 self._test_equivalence(place)
 
 
-class Conv1dErrorTestCase(Conv1dTestCase):
+class Conv1DErrorTestCase(Conv1DTestCase):
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -147,7 +147,7 @@ def runTest(self):
             self.paddle_nn_layer()
 
 
-class Conv1dTypeErrorTestCase(Conv1dTestCase):
+class Conv1DTypeErrorTestCase(Conv1DTestCase):
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -156,27 +156,27 @@ def runTest(self):
 
 
 def add_cases(suite):
-    suite.addTest(Conv1dTestCase(methodName='runTest'))
-    suite.addTest(Conv1dTestCase(methodName='runTest', stride=[1], dilation=2))
-    suite.addTest(Conv1dTestCase(methodName='runTest', stride=2, dilation=(1)))
+    suite.addTest(Conv1DTestCase(methodName='runTest'))
+    suite.addTest(Conv1DTestCase(methodName='runTest', stride=[1], dilation=2))
+    suite.addTest(Conv1DTestCase(methodName='runTest', stride=2, dilation=(1)))
     suite.addTest(
-        Conv1dTestCase(
+        Conv1DTestCase(
             methodName='runTest', padding="same", no_bias=True))
     suite.addTest(
-        Conv1dTestCase(
+        Conv1DTestCase(
             methodName='runTest', filter_size=3, padding='valid'))
     suite.addTest(
-        Conv1dTestCase(
+        Conv1DTestCase(
             methodName='runTest', padding=2, data_format='NLC'))
-    suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1]))
-    suite.addTest(Conv1dTestCase(methodName='runTest', padding=[1, 2]))
-    suite.addTest(Conv1dTestCase(methodName='runTest', padding=2))
-    suite.addTest(Conv1dTestCase(methodName='runTest'))
+    suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1]))
+    suite.addTest(Conv1DTestCase(methodName='runTest', padding=[1, 2]))
+    suite.addTest(Conv1DTestCase(methodName='runTest', padding=2))
+    suite.addTest(Conv1DTestCase(methodName='runTest'))
     suite.addTest(
-        Conv1dTestCase(
+        Conv1DTestCase(
             methodName='runTest', groups=2, padding="valid"))
     suite.addTest(
-        Conv1dTestCase(
+        Conv1DTestCase(
             methodName='runTest',
             num_filters=6,
             num_channels=3,
@@ -187,22 +187,22 @@ def add_cases(suite):
 
 def add_error_cases(suite):
     suite.addTest(
-        Conv1dTypeErrorTestCase(
+        Conv1DTypeErrorTestCase(
             methodName='runTest', padding_mode="reflect", padding="valid"))
     suite.addTest(
-        Conv1dErrorTestCase(
+        Conv1DErrorTestCase(
             methodName='runTest', data_format="VALID"))
     suite.addTest(
-        Conv1dErrorTestCase(
+        Conv1DErrorTestCase(
             methodName='runTest', padding_mode="VALID"))
     suite.addTest(
-        Conv1dErrorTestCase(
+        Conv1DErrorTestCase(
             methodName='runTest', num_channels=5, groups=2))
     suite.addTest(
-        Conv1dErrorTestCase(
+        Conv1DErrorTestCase(
             methodName='runTest', num_filters=8, num_channels=15, groups=3))
     suite.addTest(
-        Conv1dErrorTestCase(
+        Conv1DErrorTestCase(
             methodName='runTest', padding=[1, 2, 3, 4, 5]))
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
index 4c98aacd209da..9c43e2f3e6e9d 100644
--- a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py
@@ -21,7 +21,7 @@
 import unittest
 
 
-class ConvTranspose1dTestCase(unittest.TestCase):
+class Conv1DTransposeTestCase(unittest.TestCase):
     def __init__(self,
                  methodName='runTest',
                  batch_size=4,
@@ -38,7 +38,7 @@ def __init__(self,
                  no_bias=False,
                  data_format="NCL",
                  dtype="float32"):
-        super(ConvTranspose1dTestCase, self).__init__(methodName)
+        super(Conv1DTransposeTestCase, self).__init__(methodName)
         self.batch_size = batch_size
         self.in_channels = in_channels
         self.out_channels = out_channels
@@ -113,7 +113,7 @@ def functional(self, place):
 
     def paddle_nn_layer(self):
         x_var = paddle.to_tensor(self.input)
-        conv = nn.ConvTranspose1d(
+        conv = nn.Conv1DTranspose(
             self.in_channels,
             self.out_channels,
             self.filter_size,
@@ -145,7 +145,7 @@ def runTest(self):
                 self._test_equivalence(place)
 
 
-class ConvTranspose1dErrorTestCase(ConvTranspose1dTestCase):
+class Conv1DTransposeErrorTestCase(Conv1DTransposeTestCase):
     def runTest(self):
         place = fluid.CPUPlace()
         with dg.guard(place):
@@ -154,68 +154,68 @@ def runTest(self):
 
 
 def add_cases(suite):
-    suite.addTest(ConvTranspose1dTestCase(methodName='runTest'))
+    suite.addTest(Conv1DTransposeTestCase(methodName='runTest'))
     suite.addTest(
-        ConvTranspose1dTestCase(
+        Conv1DTransposeTestCase(
            methodName='runTest', stride=[2], no_bias=True, dilation=2))
     suite.addTest(
-        ConvTranspose1dTestCase(
+        Conv1DTransposeTestCase(
             methodName='runTest',
             filter_size=(3),
             output_size=[36],
             stride=[2],
             dilation=2))
     suite.addTest(
-        ConvTranspose1dTestCase(
+        Conv1DTransposeTestCase(
             methodName='runTest', stride=2, dilation=(2)))
     suite.addTest(
-        ConvTranspose1dTestCase(
+        Conv1DTransposeTestCase(
             methodName='runTest', padding="valid"))
     suite.addTest(
-        ConvTranspose1dTestCase(
+        Conv1DTransposeTestCase(
             methodName='runTest', padding='valid'))
     suite.addTest(
-        ConvTranspose1dTestCase(
+        Conv1DTransposeTestCase(
             methodName='runTest', filter_size=1, padding=3))
-    suite.addTest(ConvTranspose1dTestCase(methodName='runTest', padding=[2]))
+    suite.addTest(Conv1DTransposeTestCase(methodName='runTest', padding=[2]))
     suite.addTest(
-        ConvTranspose1dTestCase(
+        Conv1DTransposeTestCase(
             methodName='runTest', data_format="NLC"))
     suite.addTest(
-        ConvTranspose1dTestCase(
+        Conv1DTransposeTestCase(
             methodName='runTest', groups=2, padding="valid"))
     suite.addTest(
-        ConvTranspose1dTestCase(
+        Conv1DTransposeTestCase(
             methodName='runTest',
             out_channels=6,
             in_channels=3,
             groups=3,
             padding="valid"))
     suite.addTest(
-        ConvTranspose1dTestCase(
+        Conv1DTransposeTestCase(
             methodName='runTest',
             data_format="NLC",
             spartial_shape=16,
             output_size=18))
     suite.addTest(
-        ConvTranspose1dTestCase(
+        Conv1DTransposeTestCase(
             methodName='runTest',
             data_format="NLC",
             stride=3,
             output_padding=2))
-    suite.addTest(ConvTranspose1dTestCase(methodName='runTest', padding=[1, 2]))
+    suite.addTest(Conv1DTransposeTestCase(methodName='runTest', padding=[1, 2]))
 
 
 def add_error_cases(suite):
     suite.addTest(
-        ConvTranspose1dErrorTestCase(
+        Conv1DTransposeErrorTestCase(
             methodName='runTest', data_format="not_valid"))
     suite.addTest(
-        ConvTranspose1dErrorTestCase(
+        Conv1DTransposeErrorTestCase(
             methodName='runTest', in_channels=5, groups=2))
     suite.addTest(
-        ConvTranspose1dErrorTestCase(
+        Conv1DTransposeErrorTestCase(
             methodName='runTest', stride=2, output_padding=3))
     suite.addTest(
-        ConvTranspose1dErrorTestCase(
+        Conv1DTransposeErrorTestCase(
             methodName='runTest', output_size="not_valid"))
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
index dd1e69f74b3c3..5f3d141a502d9 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py
@@ -45,7 +45,7 @@ def init_paddings(self):
     globals()[cls_name] = TestPaddingVALIDCase
 
 
-class TestConv2dFusionOp(OpTest):
+class TestConv2DFusionOp(OpTest):
     def setUp(self):
         self.op_type = "conv2d_fusion"
         self.exhaustive_search = False
@@ -157,28 +157,28 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithoutResidual(TestConv2dFusionOp):
+class TestWithoutResidual(TestConv2DFusionOp):
     def init_residual(self):
         self.add_residual_data = False
 
 
-class TestIdentityActivation(TestConv2dFusionOp):
+class TestIdentityActivation(TestConv2DFusionOp):
     def init_activation(self):
         self.activation = 'identity'
 
 
-class TestIdentityActivation(TestConv2dFusionOp):
+class TestIdentityActivation(TestConv2DFusionOp):
     def init_activation(self):
         self.activation = 'identity'
         self.add_residual_data = False
 
 
-class TestWithGroup(TestConv2dFusionOp):
+class TestWithGroup(TestConv2DFusionOp):
     def init_group(self):
         self.groups = 3
 
 
-class TestWithDilation(TestConv2dFusionOp):
+class TestWithDilation(TestConv2DFusionOp):
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -194,12 +194,12 @@ def init_group(self):
         self.groups = 3
 
 
-class TestCUDNNExhaustiveSearch(TestConv2dFusionOp):
+class TestCUDNNExhaustiveSearch(TestConv2DFusionOp):
     def set_search_method(self):
         self.exhaustive_search = True
 
 
-class TestMultipleOutputs(TestConv2dFusionOp):
+class TestMultipleOutputs(TestConv2DFusionOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -215,13 +215,13 @@ def set_outputs(self):
         self.outputs['Outputs'] = [('out1', out1), ('out2', out2)]
 
 
-class TestAsyPadding(TestConv2dFusionOp):
+class TestAsyPadding(TestConv2DFusionOp):
     def init_paddings(self):
         self.pad = [0, 0, 1, 2]
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithPad_AsyPadding(TestConv2dFusionOp):
+class TestWithPad_AsyPadding(TestConv2DFusionOp):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 10, 10]  # NCHW
@@ -234,7 +234,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithStride_AsyPadding(TestConv2dFusionOp):
+class TestWithStride_AsyPadding(TestConv2DFusionOp):
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 3, 6, 6]  # NCHW
@@ -247,7 +247,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWith1x1_AsyPadding(TestConv2dFusionOp):
+class TestWith1x1_AsyPadding(TestConv2DFusionOp):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
@@ -263,12 +263,12 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithGroup_AsyPadding(TestConv2dFusionOp):
+class TestWithGroup_AsyPadding(TestConv2DFusionOp):
     def init_group(self):
         self.groups = 3
 
 
-class TestWithDepthWise3x3_AsyPadding(TestConv2dFusionOp):
+class TestWithDepthWise3x3_AsyPadding(TestConv2DFusionOp):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [3, 4, 10, 10]  # NCHW
@@ -287,7 +287,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithDepthWise5x5_AsyPadding(TestConv2dFusionOp):
+class TestWithDepthWise5x5_AsyPadding(TestConv2DFusionOp):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 4, 10, 10]  # NCHW
@@ -303,7 +303,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithDepthWise7x7_AsyPadding(TestConv2dFusionOp):
+class TestWithDepthWise7x7_AsyPadding(TestConv2DFusionOp):
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 8, 10, 10]  # NCHW
@@ -319,7 +319,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithDilation_AsyPadding(TestConv2dFusionOp):
+class TestWithDilation_AsyPadding(TestConv2DFusionOp):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 10, 10]  # NCHW
@@ -338,7 +338,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithInput1x1Filter1x1_AsyPadding(TestConv2dFusionOp):
+class TestWithInput1x1Filter1x1_AsyPadding(TestConv2DFusionOp):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 1, 1]  # NCHW
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
index 6bfe2aca530dd..f92a05158ce1a 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_layer.py
@@ -166,7 +166,7 @@ def functional(self, place):
 
     def paddle_nn_layer(self):
         x_var = dg.to_variable(self.input)
-        conv = nn.Conv2d(
+        conv = nn.Conv2D(
             self.num_channels,
             self.num_filters,
             self.filter_size,
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
index 8025a332396d6..d2c2d2cecdda7 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
@@ -289,7 +289,7 @@ def init_paddings(self):
     globals()[cls_name] = TestCUDNNPaddingVALIDCase
 
 
-class TestConv2dOp(OpTest):
+class TestConv2DOp(OpTest):
     def setUp(self):
         self.op_type = "conv2d"
         self.use_cudnn = False
@@ -412,7 +412,7 @@ def init_kernel_type(self):
         pass
 
 
-class TestWithPad(TestConv2dOp):
+class TestWithPad(TestConv2DOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -422,7 +422,7 @@ def init_test_case(self):
         self.filter_size = [6, f_c, 3, 3]
 
 
-class TestWithStride(TestConv2dOp):
+class TestWithStride(TestConv2DOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -432,7 +432,7 @@ def init_test_case(self):
         self.filter_size = [6, f_c, 3, 3]
 
 
-class TestWithGroup(TestConv2dOp):
+class TestWithGroup(TestConv2DOp):
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -443,7 +443,7 @@ def init_test_case(self):
         self.filter_size = [18, f_c, 3, 3]
 
 
-class TestWith1x1(TestConv2dOp):
+class TestWith1x1(TestConv2DOp):
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -456,7 +456,7 @@ def init_group(self):
         self.groups = 3
 
 
-class TestWithDepthWise3x3(TestConv2dOp):
+class TestWithDepthWise3x3(TestConv2DOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -472,7 +472,7 @@ def init_group(self):
         self.groups = 4
 
 
-class TestWithDepthWise5x5(TestConv2dOp):
+class TestWithDepthWise5x5(TestConv2DOp):
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -485,7 +485,7 @@ def init_group(self):
         self.groups = 4
 
 
-class TestWithDepthWise7x7(TestConv2dOp):
+class TestWithDepthWise7x7(TestConv2DOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -498,7 +498,7 @@ def init_group(self):
         self.groups = 8
 
 
-class TestWithDilation(TestConv2dOp):
+class TestWithDilation(TestConv2DOp):
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -514,7 +514,7 @@ def init_group(self):
         self.groups = 3
 
 
-class TestWithInput1x1Filter1x1(TestConv2dOp):
+class TestWithInput1x1Filter1x1(TestConv2DOp):
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -527,18 +527,18 @@ def init_group(self):
         self.groups = 3
 
 
-#----------------Conv2dCUDNN----------------
+#----------------Conv2DCUDNN----------------
 
-create_test_cudnn_class(TestConv2dOp)
+create_test_cudnn_class(TestConv2DOp)
 create_test_cudnn_class(TestWithPad)
 create_test_cudnn_class(TestWithStride)
 create_test_cudnn_class(TestWithGroup)
 create_test_cudnn_class(TestWith1x1)
 create_test_cudnn_class(TestWithInput1x1Filter1x1)
 
-#----------------Conv2dCUDNN fp16----------------
+#----------------Conv2DCUDNN fp16----------------
 
-create_test_cudnn_fp16_class(TestConv2dOp, grad_check=False)
+create_test_cudnn_fp16_class(TestConv2DOp, grad_check=False)
 create_test_cudnn_fp16_class(TestWithPad, grad_check=False)
 create_test_cudnn_fp16_class(TestWithStride, grad_check=False)
 create_test_cudnn_fp16_class(TestWithGroup, grad_check=False)
@@ -548,7 +548,7 @@ def init_group(self):
 
 #----------------TestDepthwiseConv -----
 
 
-class TestDepthwiseConv(TestConv2dOp):
+class TestDepthwiseConv(TestConv2DOp):
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -561,7 +561,7 @@ def init_test_case(self):
         self.op_type = "depthwise_conv2d"
 
 
-class TestDepthwiseConv2(TestConv2dOp):
+class TestDepthwiseConv2(TestConv2DOp):
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -574,7 +574,7 @@ def init_test_case(self):
         self.op_type = "depthwise_conv2d"
 
 
-class TestDepthwiseConv3(TestConv2dOp):
+class TestDepthwiseConv3(TestConv2DOp):
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -587,7 +587,7 @@ def init_test_case(self):
         self.op_type = "depthwise_conv2d"
 
 
-class TestDepthwiseConvWithDilation(TestConv2dOp):
+class TestDepthwiseConvWithDilation(TestConv2DOp):
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -601,7 +601,7 @@ def init_test_case(self):
         self.op_type = "depthwise_conv2d"
 
 
-class TestDepthwiseConvWithDilation2(TestConv2dOp):
+class TestDepthwiseConvWithDilation2(TestConv2DOp):
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -615,7 +615,7 @@ def init_test_case(self):
         self.op_type = "depthwise_conv2d"
 
 
-class TestDepthwiseConvandFuse(TestConv2dOp):
+class TestDepthwiseConvandFuse(TestConv2DOp):
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -629,7 +629,7 @@ def init_test_case(self):
         self.op_type = "depthwise_conv2d"
 
 
-class TestDepthwiseConv2andFuse(TestConv2dOp):
+class TestDepthwiseConv2andFuse(TestConv2DOp):
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -643,7 +643,7 @@ def init_test_case(self):
         self.op_type = "depthwise_conv2d"
 
 
-class TestDepthwiseConv3andFuse(TestConv2dOp):
+class TestDepthwiseConv3andFuse(TestConv2DOp):
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -657,7 +657,7 @@ def init_test_case(self):
         self.op_type = "depthwise_conv2d"
 
 
-class TestDepthwiseConvWithDilationandFuse(TestConv2dOp):
+class TestDepthwiseConvWithDilationandFuse(TestConv2DOp):
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -672,7 +672,7 @@ def init_test_case(self):
         self.op_type = "depthwise_conv2d"
 
 
-class TestDepthwiseConvWithDilation2andFuse(TestConv2dOp):
+class TestDepthwiseConvWithDilation2andFuse(TestConv2DOp):
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -687,13 +687,13 @@ def init_test_case(self):
         self.op_type = "depthwise_conv2d"
 
 
-class TestCUDNNExhaustiveSearch(TestConv2dOp):
+class TestCUDNNExhaustiveSearch(TestConv2DOp):
     def init_kernel_type(self):
         self.use_cudnn = True
         self.exhaustive_search = True
 
 
-class TestConv2dOpError(unittest.TestCase):
+class TestConv2DOpError(unittest.TestCase):
     def test_errors(self):
         with program_guard(Program(), Program()):
 
@@ -724,7 +724,7 @@ def test_dtype():
 
 # ---- test asymmetric padding ----
 
 
-class TestConv2dOp_v2(OpTest):
+class TestConv2DOp_v2(OpTest):
     def setUp(self):
         self.op_type = "conv2d"
         self.use_cudnn = False
@@ -854,13 +854,13 @@ def init_test_case_2(self):
         pass
 
 
-class TestConv2dOp_AsyPadding(TestConv2dOp_v2):
+class TestConv2DOp_AsyPadding(TestConv2DOp_v2):
     def init_paddings(self):
         self.pad = [0, 0, 1, 2]
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithPad_AsyPadding(TestConv2dOp_v2):
+class TestWithPad_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
@@ -873,7 +873,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithStride_AsyPadding(TestConv2dOp_v2):
+class TestWithStride_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 3, 6, 6]  # NCHW
@@ -886,7 +886,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithGroup_AsyPadding(TestConv2dOp_v2):
+class TestWithGroup_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 2]
@@ -897,7 +897,7 @@ def init_test_case(self):
         self.filter_size = [24, f_c, 4, 3]
 
 
-class TestWith1x1_AsyPadding(TestConv2dOp_v2):
+class TestWith1x1_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 5, 5]  # NCHW
@@ -913,7 +913,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithDepthWise3x3_AsyPadding(TestConv2dOp_v2):
+class TestWithDepthWise3x3_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [3, 4, 10, 10]  # NCHW
@@ -932,7 +932,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithDepthWise5x5_AsyPadding(TestConv2dOp_v2):
+class TestWithDepthWise5x5_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 4, 10, 10]  # NCHW
@@ -948,7 +948,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithDepthWise7x7_AsyPadding(TestConv2dOp_v2):
+class TestWithDepthWise7x7_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.stride = [2, 2]
         self.input_size = [2, 8, 10, 10]  # NCHW
@@ -964,7 +964,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithDilation_AsyPadding(TestConv2dOp_v2):
+class TestWithDilation_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [2, 3, 10, 10]  # NCHW
@@ -983,7 +983,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestWithInput1x1Filter1x1_AsyPadding(TestConv2dOp_v2):
+class TestWithInput1x1Filter1x1_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.stride = [1, 1]
         self.input_size = [40, 3, 1, 1]  # NCHW
@@ -999,7 +999,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-create_test_cudnn_class(TestConv2dOp_AsyPadding)
+create_test_cudnn_class(TestConv2DOp_AsyPadding)
 create_test_cudnn_class(TestWithPad_AsyPadding)
 create_test_cudnn_class(TestWithStride_AsyPadding)
 create_test_cudnn_class(TestWithGroup_AsyPadding)
@@ -1007,7 +1007,7 @@ def init_paddings(self):
 create_test_cudnn_class(TestWithInput1x1Filter1x1_AsyPadding)
 
 
-class TestDepthwiseConv_AsyPadding(TestConv2dOp_v2):
+class TestDepthwiseConv_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.use_cuda = True
         self.stride = [2, 2]
@@ -1023,7 +1023,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestDepthwiseConv2_AsyPadding(TestConv2dOp_v2):
+class TestDepthwiseConv2_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.use_cuda = True
         self.stride = [1, 1]
@@ -1039,7 +1039,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestDepthwiseConv3_AsyPadding(TestConv2dOp_v2):
+class TestDepthwiseConv3_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.use_cuda = True
         self.stride = [1, 1]
@@ -1055,7 +1055,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestDepthwiseConvWithDilation_AsyPadding(TestConv2dOp_v2):
+class TestDepthwiseConvWithDilation_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -1073,7 +1073,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestDepthwiseConvWithDilation2_AsyPadding(TestConv2dOp_v2):
+class TestDepthwiseConvWithDilation2_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.use_cuda = True
         self.pad = [1, 1]
@@ -1091,7 +1091,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestDepthwiseConvandFuse_AsyPadding(TestConv2dOp_v2):
+class TestDepthwiseConvandFuse_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -1109,7 +1109,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestDepthwiseConv2andFuse_AsyPadding(TestConv2dOp_v2):
+class TestDepthwiseConv2andFuse_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -1127,7 +1127,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestDepthwiseConv3andFuse_AsyPadding(TestConv2dOp_v2):
+class TestDepthwiseConv3andFuse_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -1145,7 +1145,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestDepthwiseConvWithDilationandFuse_AsyPadding(TestConv2dOp_v2):
+class TestDepthwiseConvWithDilationandFuse_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -1164,7 +1164,7 @@ def init_paddings(self):
         self.padding_algorithm = "EXPLICIT"
 
 
-class TestDepthwiseConvWithDilation2andFuse_AsyPadding(TestConv2dOp_v2):
+class TestDepthwiseConvWithDilation2andFuse_AsyPadding(TestConv2DOp_v2):
     def init_test_case(self):
         self.fuse_relu_before_depthwise_conv = True
         self.use_cuda = True
@@ -1184,25 +1184,25 @@ def init_paddings(self):
 
 #---------- test SAME VALID -----------
-create_test_padding_SAME_class(TestConv2dOp_AsyPadding)
+create_test_padding_SAME_class(TestConv2DOp_AsyPadding)
 create_test_padding_SAME_class(TestWithPad_AsyPadding)
 create_test_padding_SAME_class(TestWithStride_AsyPadding)
 create_test_padding_SAME_class(TestWithGroup_AsyPadding)
 create_test_padding_SAME_class(TestWithInput1x1Filter1x1_AsyPadding)
 
-create_test_padding_VALID_class(TestConv2dOp_AsyPadding)
+create_test_padding_VALID_class(TestConv2DOp_AsyPadding)
 create_test_padding_VALID_class(TestWithPad_AsyPadding)
 create_test_padding_VALID_class(TestWithStride_AsyPadding)
 create_test_padding_VALID_class(TestWithGroup_AsyPadding)
 create_test_padding_VALID_class(TestWithInput1x1Filter1x1_AsyPadding)
 
-create_test_cudnn_padding_SAME_class(TestConv2dOp_AsyPadding)
+create_test_cudnn_padding_SAME_class(TestConv2DOp_AsyPadding)
 create_test_cudnn_padding_SAME_class(TestWithPad_AsyPadding)
 create_test_cudnn_padding_SAME_class(TestWithStride_AsyPadding)
 create_test_cudnn_padding_SAME_class(TestWithGroup_AsyPadding)
 create_test_cudnn_padding_SAME_class(TestWithInput1x1Filter1x1_AsyPadding)
 
-create_test_cudnn_padding_VALID_class(TestConv2dOp_AsyPadding)
+create_test_cudnn_padding_VALID_class(TestConv2DOp_AsyPadding)
 create_test_cudnn_padding_VALID_class(TestWithPad_AsyPadding)
 create_test_cudnn_padding_VALID_class(TestWithStride_AsyPadding)
 create_test_cudnn_padding_VALID_class(TestWithGroup_AsyPadding)
 create_test_cudnn_padding_VALID_class(TestWithInput1x1Filter1x1_AsyPadding)
@@ -1221,7 +1221,7 @@ def init_paddings(self):
 create_test_padding_VALID_class(TestDepthwiseConvWithDilationandFuse_AsyPadding)
 
 # ------------ test channel last ---------
-create_test_channel_last_class(TestConv2dOp_AsyPadding)
+create_test_channel_last_class(TestConv2DOp_AsyPadding)
 create_test_channel_last_class(TestWithPad_AsyPadding)
 create_test_channel_last_class(TestWithGroup_AsyPadding)
 create_test_channel_last_class(TestWith1x1_AsyPadding)
@@ -1232,14 +1232,14 @@ def init_paddings(self):
 create_test_channel_last_class(TestDepthwiseConvandFuse_AsyPadding)
 create_test_channel_last_class(TestDepthwiseConvWithDilationandFuse_AsyPadding)
 
-create_test_cudnn_channel_last_class(TestConv2dOp_AsyPadding)
+create_test_cudnn_channel_last_class(TestConv2DOp_AsyPadding)
 create_test_cudnn_channel_last_class(TestWithPad_AsyPadding)
 create_test_cudnn_channel_last_class(TestWithStride_AsyPadding)
 create_test_cudnn_channel_last_class(TestWithGroup_AsyPadding)
 create_test_cudnn_channel_last_class(TestWithDilation_AsyPadding)
 
 create_test_cudnn_channel_last_fp16_class(
-    TestConv2dOp_AsyPadding, grad_check=False)
+    TestConv2DOp_AsyPadding, grad_check=False)
 create_test_cudnn_channel_last_fp16_class(
     TestWithPad_AsyPadding, grad_check=False)
 create_test_cudnn_channel_last_fp16_class(
@@ -1251,7 +1251,7 @@ def init_paddings(self):
 
 # --------- test python API ---------------
 
 
-class TestConv2dAPI(unittest.TestCase):
+class TestConv2DAPI(unittest.TestCase):
     def test_api(self):
 
         input_NHWC = fluid.layers.data(
@@ -1327,7 +1327,7 @@ def test_api(self):
             data_format="NCHW")
 
 
-class TestConv2dAPI_Error(unittest.TestCase):
+class TestConv2DAPI_Error(unittest.TestCase):
     def test_api(self):
         input = fluid.layers.data(
             name="input",
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
index ba450b345b8a3..28c3a466aa6c8 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py
@@ -155,7 +155,7 @@ def paddle_nn_layer(self):
         else:
             output_size = self.output_size
 
-        conv = nn.ConvTranspose2d(
+        conv = nn.Conv2DTranspose(
             self.num_channels,
             self.num_filters,
             self.filter_size,
diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
index 913db51da500b..bc87e76fd9b89 100644
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
@@ -111,7 +111,7 @@ def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride):
     return out
 
 
-class TestConv2dTransposeOp(OpTest):
+class TestConv2DTransposeOp(OpTest):
     def setUp(self):
         # init as conv transpose
         self.dtype = np.float64
@@ -211,7 +211,7 @@ def init_op_type(self):
         self.op_type = "conv2d_transpose"
 
 
-class TestWithSymmetricPad(TestConv2dTransposeOp):
+class TestWithSymmetricPad(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -222,7 +222,7 @@ def init_test_case(self):
         self.filter_size = [f_c, 6, 3, 3]
 
 
-class TestWithAsymmetricPad(TestConv2dTransposeOp):
+class TestWithAsymmetricPad(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [1, 1]
@@ -233,7 +233,7 @@ def init_test_case(self):
         self.filter_size = [f_c, 6, 3, 3]
 
 
-class TestWithSAMEPad(TestConv2dTransposeOp):
+class TestWithSAMEPad(TestConv2DTransposeOp):
     def init_test_case(self):
         self.stride = [2, 1]
         self.dilations = [1, 2]
@@ -244,7 +244,7 @@ def init_test_case(self):
         self.padding_algorithm = 'SAME'
 
 
-class TestWithVALIDPad(TestConv2dTransposeOp):
+class TestWithVALIDPad(TestConv2DTransposeOp):
     def init_test_case(self):
         self.stride = [1, 1]
         self.dilations = [1, 1]
@@ -255,7 +255,7 @@ def init_test_case(self):
         self.padding_algorithm = 'VALID'
 
 
-class TestWithGroups(TestConv2dTransposeOp):
+class TestWithGroups(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -266,7 +266,7 @@ def init_test_case(self):
         self.filter_size = [f_c, 3, 3, 3]
 
 
-class TestWithStride(TestConv2dTransposeOp):
+class TestWithStride(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -277,7 +277,7 @@ def init_test_case(self):
         self.filter_size = [f_c, 6, 3, 3]
 
 
-class TestWithDilation(TestConv2dTransposeOp):
+class TestWithDilation(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -288,7 +288,7 @@ def init_test_case(self):
         self.filter_size = [f_c, 6, 3, 3]
 
 
-class TestWithEvenUpsample(TestConv2dTransposeOp):
+class TestWithEvenUpsample(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -300,7 +300,7 @@ def init_test_case(self):
         self.filter_size = [f_c, 6, 5, 5]
 
 
-class TestWithEvenUpsampleOutputPadding(TestConv2dTransposeOp):
+class TestWithEvenUpsampleOutputPadding(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -312,7 +312,7 @@ def init_test_case(self):
         self.filter_size = [f_c, 6, 5, 5]
 
 
-class Test_NHWC(TestConv2dTransposeOp):
+class Test_NHWC(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -324,7 +324,7 @@ def init_test_case(self):
         self.data_format = 'NHWC'
 
 
-class TestWithSymmetricPad_NHWC(TestConv2dTransposeOp):
+class TestWithSymmetricPad_NHWC(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -336,7 +336,7 @@ def init_test_case(self):
         self.data_format = 'NHWC'
 
 
-class TestWithAsymmetricPad_NHWC(TestConv2dTransposeOp):
+class TestWithAsymmetricPad_NHWC(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [1, 1]
@@ -348,7 +348,7 @@ def init_test_case(self):
         self.data_format = 'NHWC'
 
 
-class TestWithGroups_NHWC(TestConv2dTransposeOp):
+class TestWithGroups_NHWC(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -360,7 +360,7 @@ def init_test_case(self):
         self.data_format = 'NHWC'
 
 
-class TestWithStride_NHWC(TestConv2dTransposeOp):
+class TestWithStride_NHWC(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -372,7 +372,7 @@ def init_test_case(self):
         self.data_format = 'NHWC'
 
 
-class TestWithDilation_NHWC(TestConv2dTransposeOp):
+class TestWithDilation_NHWC(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [1, 1]
@@ -384,7 +384,7 @@ def init_test_case(self):
         self.data_format = 'NHWC'
 
 
-class TestWithEvenUpsample_NHWC(TestConv2dTransposeOp):
+class TestWithEvenUpsample_NHWC(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -397,7 +397,7 @@ def init_test_case(self):
         self.data_format = 'NHWC'
 
 
-class TestWithEvenUpsample_NHWC_output_padding(TestConv2dTransposeOp):
+class TestWithEvenUpsample_NHWC_output_padding(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [2, 2]
         self.stride = [2, 2]
@@ -413,7 +413,7 @@ def init_test_case(self):
 
 # ------------ test_cudnn ------------
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestCUDNN(TestConv2dTransposeOp):
+class TestCUDNN(TestConv2DTransposeOp):
     def init_op_type(self):
         self.use_cudnn = True
         self.op_type = "conv2d_transpose"
@@ -547,7 +547,7 @@ def init_op_type(self):
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestCUDNN_NHWC(TestConv2dTransposeOp):
+class TestCUDNN_NHWC(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [0, 0]
         self.stride = [1, 1]
@@ -654,7 +654,7 @@ def init_op_type(self):
         self.op_type = "conv2d_transpose"
 
 
-class TestDepthwiseConvTranspose(TestConv2dTransposeOp):
+class TestDepthwiseConvTranspose(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -667,7 +667,7 @@ def init_test_case(self):
         self.op_type = "depthwise_conv2d_transpose"
 
 
-class TestDepthwiseConvTransposeAsymmetricPad(TestConv2dTransposeOp):
+class TestDepthwiseConvTransposeAsymmetricPad(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [2, 2]
@@ -681,7 +681,7 @@ def init_test_case(self):
         self.data_format = 'NCHW'
 
 
-class TestDepthwiseConvTransposeSAMEPad(TestConv2dTransposeOp):
+class TestDepthwiseConvTransposeSAMEPad(TestConv2DTransposeOp):
     def init_test_case(self):
         self.stride = [2, 2]
         self.dilations = [1, 1]
@@ -694,7 +694,7 @@ def init_test_case(self):
         self.padding_algorithm = 'SAME'
 
 
-class TestDepthwiseConvTransposeVALIDPad(TestConv2dTransposeOp):
+class TestDepthwiseConvTransposeVALIDPad(TestConv2DTransposeOp):
     def init_test_case(self):
         self.stride = [2, 2]
         self.dilations = [1, 1]
@@ -707,7 +707,7 @@ def init_test_case(self):
         self.padding_algorithm = 'VALID'
 
 
-class TestDepthwiseConvTranspose_NHWC_4x4kernel(TestConv2dTransposeOp):
+class TestDepthwiseConvTranspose_NHWC_4x4kernel(TestConv2DTransposeOp):
    def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -721,7 +721,7 @@ def init_test_case(self):
         self.data_format = 'NHWC'
 
 
-class TestDepthwiseConvTranspose_NHWC_3x3kernel(TestConv2dTransposeOp):
+class TestDepthwiseConvTranspose_NHWC_3x3kernel(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 1]
         self.stride = [2, 2]
@@ -735,7 +735,7 @@ def init_test_case(self):
         self.data_format = 'NHWC'
 
 
-class TestDepthwiseConvTransposeAsymmetricPad_NHWC(TestConv2dTransposeOp):
+class TestDepthwiseConvTransposeAsymmetricPad_NHWC(TestConv2DTransposeOp):
     def init_test_case(self):
         self.pad = [1, 0, 1, 2]
         self.stride = [2, 2]
@@ -751,7 +751,7 @@ def init_test_case(self):
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "core is not compiled with CUDA")
-class TestCUDNN_FP16(TestConv2dTransposeOp):
+class TestCUDNN_FP16(TestConv2DTransposeOp):
     def init_test_case(self):
         self.dtype = np.float16
         self.pad = [1, 1]
@@ -867,7 +867,7 @@ def init_test_case(self):
         self.data_format = 'NHWC'
 
 
-class TestConv2dTransposeAPI(unittest.TestCase):
+class TestConv2DTransposeAPI(unittest.TestCase):
     def test_case1(self):
         data1 = fluid.layers.data(
             name='data1', shape=[3, 5, 5], dtype='float32')
@@ -945,7 +945,7 @@ def test_case1(self):
         self.assertIsNotNone(results[6])
 
 
-class TestConv2dTransposeOpException(unittest.TestCase):
+class TestConv2DTransposeOpException(unittest.TestCase):
     def test_exception(self):
         data = fluid.layers.data(name='data', shape=[3, 5, 5], dtype="float32")
 
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
index 56355a1c95e03..b45e2d1a6aa14 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_layer.py
@@ -135,7 +135,7 @@ def functional(self, place):
 
     def paddle_nn_layer(self):
         x_var = dg.to_variable(self.input)
-        conv = nn.Conv3d(
+        conv = nn.Conv3D(
             self.num_channels,
             self.num_filters,
             self.filter_size,
diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
index 8f1f2094fad24..1636019a6252c 100644
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
@@ -228,7 +228,7 @@ def init_test_case_2(self):
     globals()[cls_name] = TestCudnnChannelLastCase
 
 
-class TestConv3dOp(OpTest):
+class
TestConv3DOp(OpTest): def setUp(self): self.op_type = "conv3d" self.use_cudnn = False @@ -334,7 +334,7 @@ def init_kernel_type(self): pass -class TestCase1(TestConv3dOp): +class TestCase1(TestConv3DOp): def init_test_case(self): self.pad = [1, 1, 1] self.stride = [1, 1, 1] @@ -344,7 +344,7 @@ def init_test_case(self): self.filter_size = [6, f_c, 3, 3, 3] -class TestWithGroup1(TestConv3dOp): +class TestWithGroup1(TestConv3DOp): def init_group(self): self.groups = 3 @@ -354,7 +354,7 @@ def init_group(self): self.groups = 3 -class TestWith1x1(TestConv3dOp): +class TestWith1x1(TestConv3DOp): def init_test_case(self): self.pad = [0, 0, 0] self.stride = [1, 1, 1] @@ -370,7 +370,7 @@ def init_group(self): self.groups = 3 -class TestWithInput1x1Filter1x1(TestConv3dOp): +class TestWithInput1x1Filter1x1(TestConv3DOp): def init_test_case(self): self.pad = [0, 0, 0] self.stride = [1, 1, 1] @@ -386,7 +386,7 @@ def init_group(self): self.groups = 3 -class TestWithDilation(TestConv3dOp): +class TestWithDilation(TestConv3DOp): def init_test_case(self): self.pad = [0, 0, 0] self.stride = [1, 1, 1] @@ -402,19 +402,19 @@ def init_group(self): self.groups = 3 -#---------------- Conv3dCUDNN ---------------- +#---------------- Conv3DCUDNN ---------------- @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestCUDNN(TestConv3dOp): +class TestCUDNN(TestConv3DOp): def init_kernel_type(self): self.use_cudnn = True @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestFP16CUDNN(TestConv3dOp): +class TestFP16CUDNN(TestConv3DOp): def init_kernel_type(self): self.use_cudnn = True self.dtype = np.float16 @@ -519,7 +519,7 @@ def init_kernel_type(self): # ---- test asymmetric padding ---- -class TestConv3dOp_2(OpTest): +class TestConv3DOp_2(OpTest): def setUp(self): self.op_type = "conv3d" self.use_cudnn = False @@ -624,7 +624,7 @@ def init_data_format(self): self.data_format = "NCDHW" -class TestConv3dOp_AsyPadding(TestConv3dOp_2): +class TestConv3DOp_AsyPadding(TestConv3DOp_2): def init_test_case(self): self.stride = [1, 1, 2] self.input_size = [2, 3, 4, 4, 4] # NCDHW @@ -637,7 +637,7 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -class TestConv3dOp_DiffDataInDiffDim(TestConv3dOp_2): +class TestConv3DOp_DiffDataInDiffDim(TestConv3DOp_2): def init_test_case(self): self.stride = [1, 1, 2] self.input_size = [2, 3, 4, 5, 5] # NCDHW @@ -650,12 +650,12 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -create_test_padding_SAME_class(TestConv3dOp_DiffDataInDiffDim) -create_test_padding_VALID_class(TestConv3dOp_DiffDataInDiffDim) -create_test_channel_last_class(TestConv3dOp_DiffDataInDiffDim) +create_test_padding_SAME_class(TestConv3DOp_DiffDataInDiffDim) +create_test_padding_VALID_class(TestConv3DOp_DiffDataInDiffDim) +create_test_channel_last_class(TestConv3DOp_DiffDataInDiffDim) -class TestCase1_AsyPadding(TestConv3dOp_2): +class TestCase1_AsyPadding(TestConv3DOp_2): def init_test_case(self): self.stride = [1, 1, 1] self.input_size = [2, 3, 4, 4, 4] # NCDHW @@ -668,7 +668,7 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -class TestWithGroup1_AsyPadding(TestConv3dOp_2): +class TestWithGroup1_AsyPadding(TestConv3DOp_2): def init_group(self): self.groups = 3 @@ -677,7 +677,7 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -class TestWithGroup2_AsyPadding(TestConv3dOp_2): +class TestWithGroup2_AsyPadding(TestConv3DOp_2): def init_test_case(self): self.stride = [1, 1, 1] 
self.input_size = [2, 3, 4, 4, 4] # NCDHW @@ -693,7 +693,7 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -class TestWith1x1_AsyPadding(TestConv3dOp_2): +class TestWith1x1_AsyPadding(TestConv3DOp_2): def init_test_case(self): self.stride = [1, 1, 1] self.input_size = [2, 3, 4, 4, 4] @@ -712,7 +712,7 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -class TestWithDilation_AsyPadding(TestConv3dOp_2): +class TestWithDilation_AsyPadding(TestConv3DOp_2): def init_test_case(self): self.stride = [1, 1, 1] self.input_size = [2, 3, 6, 6, 6] @@ -731,41 +731,41 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -create_test_cudnn_class(TestConv3dOp_AsyPadding) +create_test_cudnn_class(TestConv3DOp_AsyPadding) create_test_cudnn_class(TestWithGroup1_AsyPadding) create_test_cudnn_class(TestWithGroup2_AsyPadding) create_test_cudnn_class(TestWith1x1_AsyPadding) create_test_cudnn_class(TestWithDilation_AsyPadding) -create_test_padding_SAME_class(TestConv3dOp_AsyPadding) +create_test_padding_SAME_class(TestConv3DOp_AsyPadding) create_test_padding_SAME_class(TestWithGroup1_AsyPadding) create_test_padding_SAME_class(TestWith1x1_AsyPadding) -create_test_padding_VALID_class(TestConv3dOp_AsyPadding) +create_test_padding_VALID_class(TestConv3DOp_AsyPadding) create_test_padding_VALID_class(TestWithGroup1_AsyPadding) create_test_padding_VALID_class(TestWith1x1_AsyPadding) -create_test_cudnn_padding_SAME_class(TestConv3dOp_AsyPadding) +create_test_cudnn_padding_SAME_class(TestConv3DOp_AsyPadding) create_test_cudnn_padding_SAME_class(TestWithGroup1_AsyPadding) create_test_cudnn_padding_SAME_class(TestWith1x1_AsyPadding) -create_test_cudnn_padding_VALID_class(TestConv3dOp_AsyPadding) +create_test_cudnn_padding_VALID_class(TestConv3DOp_AsyPadding) create_test_cudnn_padding_VALID_class(TestWithGroup1_AsyPadding) create_test_cudnn_padding_VALID_class(TestWith1x1_AsyPadding) -create_test_channel_last_class(TestConv3dOp_AsyPadding) +create_test_channel_last_class(TestConv3DOp_AsyPadding) create_test_channel_last_class(TestWithGroup1_AsyPadding) create_test_channel_last_class(TestWith1x1_AsyPadding) -create_test_channel_last_class(TestConv3dOp_AsyPadding) +create_test_channel_last_class(TestConv3DOp_AsyPadding) create_test_channel_last_class(TestWithGroup1_AsyPadding) create_test_channel_last_class(TestWith1x1_AsyPadding) -create_test_cudnn_channel_last_class(TestConv3dOp_AsyPadding) +create_test_cudnn_channel_last_class(TestConv3DOp_AsyPadding) create_test_cudnn_channel_last_class(TestWithGroup1_AsyPadding) create_test_cudnn_channel_last_class(TestWith1x1_AsyPadding) -create_test_cudnn_channel_last_class(TestConv3dOp_AsyPadding) +create_test_cudnn_channel_last_class(TestConv3DOp_AsyPadding) create_test_cudnn_channel_last_class(TestWithGroup1_AsyPadding) create_test_cudnn_channel_last_class(TestWith1x1_AsyPadding) @@ -777,7 +777,7 @@ def init_paddings(self): # --------- test python API --------------- -class TestConv3dAPI(unittest.TestCase): +class TestConv3DAPI(unittest.TestCase): def test_api(self): input_NDHWC = fluid.layers.data( @@ -853,7 +853,7 @@ def test_api(self): data_format="NCDHW") -class TestConv3dAPI_Error(unittest.TestCase): +class TestConv3DAPI_Error(unittest.TestCase): def test_api(self): input = fluid.layers.data( name="input", diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py index e30f0cd3ecd0b..dac84a8486ef2 100644 --- 
a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py @@ -139,7 +139,7 @@ def functional(self, place): def paddle_nn_layer(self): x_var = dg.to_variable(self.input) - conv = nn.ConvTranspose3d( + conv = nn.Conv3DTranspose( self.num_channels, self.num_filters, self.filter_size, diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py index 6570fb8f358ad..42062b1557620 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py @@ -107,7 +107,7 @@ def _get_padding_with_SAME(input_shape, kernel_size, kernel_stride): return out -class TestConv3dTransposeOp(OpTest): +class TestConv3DTransposeOp(OpTest): def setUp(self): # init as conv transpose self.use_cudnn = False @@ -200,7 +200,7 @@ def init_op_type(self): self.op_type = "conv3d_transpose" -class TestWithSymmetricPad(TestConv3dTransposeOp): +class TestWithSymmetricPad(TestConv3DTransposeOp): def init_test_case(self): self.check_no_input = True self.pad = [1, 1, 1] @@ -212,7 +212,7 @@ def init_test_case(self): self.filter_size = [f_c, 6, 3, 3, 3] -class TestWithAsymmetricPad(TestConv3dTransposeOp): +class TestWithAsymmetricPad(TestConv3DTransposeOp): def init_test_case(self): self.pad = [1, 0, 1, 0, 1, 2] self.stride = [1, 1, 1] @@ -223,7 +223,7 @@ def init_test_case(self): self.filter_size = [f_c, 6, 3, 3, 3] -class TestWithSAMEPad(TestConv3dTransposeOp): +class TestWithSAMEPad(TestConv3DTransposeOp): def init_test_case(self): self.stride = [1, 1, 2] self.dilations = [1, 2, 1] @@ -234,7 +234,7 @@ def init_test_case(self): self.padding_algorithm = 'SAME' -class TestWithVALIDPad(TestConv3dTransposeOp): +class TestWithVALIDPad(TestConv3DTransposeOp): def init_test_case(self): self.stride = [2, 1, 1] self.dilations = [1, 1, 1] @@ -245,7 +245,7 @@ def init_test_case(self): self.padding_algorithm = 'VALID' -class TestWithStride(TestConv3dTransposeOp): +class TestWithStride(TestConv3DTransposeOp): def init_test_case(self): self.check_no_filter = True self.pad = [1, 1, 1] @@ -257,7 +257,7 @@ def init_test_case(self): self.filter_size = [f_c, 6, 3, 3, 3] -class TestWithGroups(TestConv3dTransposeOp): +class TestWithGroups(TestConv3DTransposeOp): def init_test_case(self): self.pad = [1, 1, 1] self.stride = [1, 1, 1] @@ -268,7 +268,7 @@ def init_test_case(self): self.filter_size = [f_c, 3, 3, 3, 3] -class TestWithDilation(TestConv3dTransposeOp): +class TestWithDilation(TestConv3DTransposeOp): def init_test_case(self): self.pad = [1, 1, 1] self.stride = [1, 1, 1] @@ -279,7 +279,7 @@ def init_test_case(self): self.filter_size = [f_c, 6, 3, 3, 3] -class Test_NHWC(TestConv3dTransposeOp): +class Test_NHWC(TestConv3DTransposeOp): def init_test_case(self): self.pad = [0, 0, 0] self.stride = [1, 1, 1] @@ -294,7 +294,7 @@ def init_test_case(self): # ------------ test_cudnn ------------ @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestCUDNN(TestConv3dTransposeOp): +class TestCUDNN(TestConv3DTransposeOp): def init_op_type(self): self.use_cudnn = True self.op_type = "conv3d_transpose" @@ -419,7 +419,7 @@ def init_op_type(self): @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") -class TestCUDNN_NHWC(TestConv3dTransposeOp): +class TestCUDNN_NHWC(TestConv3DTransposeOp): def init_test_case(self): self.pad = [0, 0, 0] self.stride = 
[1, 1, 1] diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py index 241f6b570fe91..d597045641913 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_part2_op.py @@ -20,10 +20,10 @@ import paddle.fluid.core as core import paddle.fluid as fluid from op_test import OpTest -from test_conv3d_transpose_op import conv3dtranspose_forward_naive, TestConv3dTransposeOp +from test_conv3d_transpose_op import TestConv3DTransposeOp -class TestWithSymmetricPad_NHWC(TestConv3dTransposeOp): +class TestWithSymmetricPad_NHWC(TestConv3DTransposeOp): def init_test_case(self): self.pad = [1, 1, 1] self.stride = [1, 1, 1] @@ -35,7 +35,7 @@ def init_test_case(self): self.data_format = 'NHWC' -class TestWithAsymmetricPad_NHWC(TestConv3dTransposeOp): +class TestWithAsymmetricPad_NHWC(TestConv3DTransposeOp): def init_test_case(self): self.pad = [1, 0, 1, 0, 1, 2] self.stride = [1, 1, 1] @@ -47,7 +47,7 @@ def init_test_case(self): self.data_format = 'NHWC' -class TestWithGroups_NHWC(TestConv3dTransposeOp): +class TestWithGroups_NHWC(TestConv3DTransposeOp): def init_test_case(self): self.check_no_filter = True self.pad = [1, 1, 1] @@ -60,7 +60,7 @@ def init_test_case(self): self.data_format = 'NHWC' -class TestWithStride_NHWC(TestConv3dTransposeOp): +class TestWithStride_NHWC(TestConv3DTransposeOp): def init_test_case(self): self.pad = [1, 1, 1] self.stride = [2, 2, 2] @@ -72,7 +72,7 @@ def init_test_case(self): self.data_format = 'NHWC' -class TestWithDilation_NHWC(TestConv3dTransposeOp): +class TestWithDilation_NHWC(TestConv3DTransposeOp): def init_test_case(self): self.check_no_input = True self.pad = [1, 1, 1] @@ -85,7 +85,7 @@ def init_test_case(self): self.data_format = 'NHWC' -class TestConv3dTransposeAPI(unittest.TestCase): +class TestConv3DTransposeAPI(unittest.TestCase): def test_case1(self): data1 = fluid.layers.data( name='data1', shape=[3, 5, 5, 5], dtype='float32') @@ -164,7 +164,7 @@ def test_case1(self): self.assertIsNotNone(results[6]) -class TestConv3dTransposeOpException(unittest.TestCase): +class TestConv3DTransposeOpException(unittest.TestCase): def test_exception(self): data = fluid.layers.data( name='data', shape=[3, 5, 5, 5], dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py index c953841be028c..31f2000f3ad45 100644 --- a/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py +++ b/python/paddle/fluid/tests/unittests/test_conv_nn_grad.py @@ -438,7 +438,7 @@ def test_grad(self): self.func(p) -class TestConv3dDoubleGradCheck_ChannelLast_AsyPadding(unittest.TestCase): +class TestConv3DDoubleGradCheck_ChannelLast_AsyPadding(unittest.TestCase): @prog_scope() def func(self, place): shape = [2, 2, 2, 2, 3] diff --git a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py index 0c2520038a82a..686e738b8e078 100644 --- a/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_cuda_random_seed.py @@ -31,7 +31,7 @@ class TestGeneratorSeed(unittest.TestCase): """ def test_gen_dropout_dygraph(self): - gen = paddle.manual_seed(12343) + gen = paddle.seed(12343) fluid.enable_dygraph() @@ -70,13 +70,13 @@ def test_generator_gaussian_random_dygraph(self): """Test Generator seed.""" 
fluid.enable_dygraph() - paddle.manual_seed(12312321111) + paddle.seed(12312321111) x = fluid.layers.gaussian_random([120], dtype="float32") st1 = paddle.get_cuda_rng_state() x1 = fluid.layers.gaussian_random([120], dtype="float32") paddle.set_cuda_rng_state(st1) x2 = fluid.layers.gaussian_random([120], dtype="float32") - paddle.manual_seed(12312321111) + paddle.seed(12312321111) x3 = fluid.layers.gaussian_random([120], dtype="float32") x_np = x.numpy() x1_np = x1.numpy() @@ -93,13 +93,13 @@ def test_generator_randint_dygraph(self): fluid.enable_dygraph() - gen = paddle.manual_seed(12312321111) + gen = paddle.seed(12312321111) x = paddle.randint(low=10, shape=[10], dtype="int32") st1 = gen.get_state() x1 = paddle.randint(low=10, shape=[10], dtype="int32") gen.set_state(st1) x2 = paddle.randint(low=10, shape=[10], dtype="int32") - paddle.manual_seed(12312321111) + paddle.seed(12312321111) x3 = paddle.randint(low=10, shape=[10], dtype="int32") x_np = x.numpy() x1_np = x1.numpy() @@ -114,7 +114,7 @@ def test_generator_randint_dygraph(self): def test_gen_TruncatedNormal_initializer(self): fluid.disable_dygraph() - gen = paddle.manual_seed(123123143) + gen = paddle.seed(123123143) cur_state = paddle.get_cuda_rng_state() startup_program = fluid.Program() @@ -140,7 +140,7 @@ def test_gen_TruncatedNormal_initializer(self): feed={}, fetch_list=[result_1, result_2]) - paddle.manual_seed(123123143) + paddle.seed(123123143) with fluid.program_guard(train_program, startup_program): exe.run(startup_program) out2 = exe.run(train_program, diff --git a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py index cc0f3745bbf7b..a7c1b14d269f4 100644 --- a/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py +++ b/python/paddle/fluid/tests/unittests/test_decoupled_py_reader.py @@ -34,7 +34,7 @@ def random_reader(): def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) startup_prog = fluid.Program() main_prog = fluid.Program() diff --git a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py index eed637b1d5da1..80c10886826e7 100644 --- a/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py +++ b/python/paddle/fluid/tests/unittests/test_deformable_conv_op.py @@ -286,7 +286,7 @@ def test_invalid_offset(): self.assertRaises(TypeError, test_invalid_offset) -class TestDeformConv2dAPI(unittest.TestCase): +class TestDeformConv2DAPI(unittest.TestCase): def test_api(self): def test_deform_conv2d_v1(): paddle.enable_static() diff --git a/python/paddle/fluid/tests/unittests/test_dropout_op.py b/python/paddle/fluid/tests/unittests/test_dropout_op.py index 7b9e25e1d4ae8..0d0273c1670fa 100644 --- a/python/paddle/fluid/tests/unittests/test_dropout_op.py +++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py @@ -487,7 +487,7 @@ def test_dygraph(self): self.assertTrue(np.allclose(result.numpy(), result_np)) -class TestDropout2dFAPI(unittest.TestCase): +class TestDropout2DFAPI(unittest.TestCase): def setUp(self): np.random.seed(123) self.places = [fluid.CPUPlace()] @@ -535,7 +535,7 @@ def test_dygraph(self): self.assertTrue(np.allclose(res.numpy(), res_np)) -class TestDropout2dFAPIError(unittest.TestCase): +class TestDropout2DFAPIError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): @@ -554,7 +554,7 @@ def 
test_dataformat(): self.assertRaises(ValueError, test_dataformat) -class TestDropout2dCAPI(unittest.TestCase): +class TestDropout2DCAPI(unittest.TestCase): def setUp(self): np.random.seed(123) self.places = [fluid.CPUPlace()] @@ -567,13 +567,13 @@ def test_dygraph(self): input_np = np.random.random([2, 3, 4, 5]).astype("float32") result_np = input_np input = fluid.dygraph.to_variable(input_np) - m = paddle.nn.Dropout2d(p=0.) + m = paddle.nn.Dropout2D(p=0.) m.eval() result = m(input) self.assertTrue(np.allclose(result.numpy(), result_np)) -class TestDropout3dFAPI(unittest.TestCase): +class TestDropout3DFAPI(unittest.TestCase): def setUp(self): np.random.seed(123) self.places = [fluid.CPUPlace()] @@ -621,7 +621,7 @@ def test_dygraph(self): self.assertTrue(np.allclose(res.numpy(), res_np)) -class TestDropout3dFAPIError(unittest.TestCase): +class TestDropout3DFAPIError(unittest.TestCase): def test_errors(self): with program_guard(Program(), Program()): @@ -640,7 +640,7 @@ def test_dataformat(): self.assertRaises(ValueError, test_dataformat) -class TestDropout3dCAPI(unittest.TestCase): +class TestDropout3DCAPI(unittest.TestCase): def setUp(self): np.random.seed(123) self.places = [fluid.CPUPlace()] @@ -653,7 +653,7 @@ def test_dygraph(self): input_np = np.random.random([2, 3, 4, 5, 6]).astype("float32") result_np = input_np input = fluid.dygraph.to_variable(input_np) - m = paddle.nn.Dropout3d(p=0.) + m = paddle.nn.Dropout3D(p=0.) m.eval() result = m(input) self.assertTrue(np.allclose(result.numpy(), result_np)) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py index 88b496c1d89e6..a1165f3358415 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_multi_forward.py @@ -110,7 +110,7 @@ def test_mnist_forward_float32(self): epoch_num = 1 with fluid.dygraph.guard(): - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) mnist = MNIST() sgd = SGDOptimizer( @@ -143,7 +143,7 @@ def test_mnist_forward_float32(self): dy_param_init_value[param.name] = param.numpy() with new_program_scope(): - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) exe = fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py index a963c2ece0958..f95546f15f002 100644 --- a/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py +++ b/python/paddle/fluid/tests/unittests/test_dygraph_weight_norm.py @@ -117,7 +117,7 @@ def weight_normalize(self, w, dim=None): def test_check_output(self): fluid.enable_imperative() - linear = paddle.nn.Conv2d(2, 3, 3) + linear = paddle.nn.Conv2D(2, 3, 3) before_weight = linear.weight.numpy() if self.dim == None: self.dim = -1 @@ -179,7 +179,7 @@ def init_test_case(self): def test_check_output(self): fluid.enable_imperative() - linear = paddle.nn.Conv2d(2, 3, 3) + linear = paddle.nn.Conv2D(2, 3, 3) before_weight = linear.weight wn = weight_norm(linear, dim=self.dim) rwn = remove_weight_norm(linear) diff --git a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py index e0c0277270b40..ff99a06e49e78 100644 --- 
a/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_eager_deletion_padding_rnn.py @@ -466,7 +466,7 @@ def set_customed_config(self): pass def _prepare_program(self, config, parallel=True): - paddle.manual_seed(config.random_seed) + paddle.seed(config.random_seed) self.main_program = fluid.Program() self.startup_program = fluid.Program() with fluid.program_guard(self.main_program, self.startup_program): diff --git a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py index c18b7c5b044e7..120880a5fc969 100644 --- a/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_embedding_id_stop_gradient.py @@ -39,7 +39,7 @@ def test_check_grad(self): def run_program(self, place, stop_gradient=False): np.random.seed(1) - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) startup_program = fluid.Program() diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py index 1272d82dfdd1d..3bbc8df188227 100644 --- a/python/paddle/fluid/tests/unittests/test_fc_op.py +++ b/python/paddle/fluid/tests/unittests/test_fc_op.py @@ -137,7 +137,7 @@ def config(self): class TestFcOp_NumFlattenDims_NegOne(unittest.TestCase): def test_api(self): def run_program(num_flatten_dims): - paddle.manual_seed(SEED) + paddle.seed(SEED) startup_program = Program() main_program = Program() diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py index 5bcfc8720ddd2..6a1700e758e57 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_act_pass.py @@ -57,7 +57,7 @@ def build_program(self, main_program, startup_program, use_cuda, seed=1): return x, y, loss def check(self, place, use_cuda): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) main_program = fluid.Program() startup_program = fluid.Program() diff --git a/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py index 1bc305cd1f4dc..45c27552743d3 100644 --- a/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py +++ b/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py @@ -158,7 +158,7 @@ def build_origin_program(self, return x, y, loss def check(self, place, use_cuda): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) iters = 5 batch_size = 16 diff --git a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py index dddc6811ef08b..121dcbb3cdc12 100644 --- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py @@ -38,7 +38,7 @@ def setUp(self): "seed": 10, "use_mkldnn": self.use_mkldnn } - paddle.manual_seed(10) + paddle.seed(10) self.outputs = {'Out': np.zeros((123, 92), dtype='float32')} diff --git a/python/paddle/fluid/tests/unittests/test_generator.py b/python/paddle/fluid/tests/unittests/test_generator.py index 8b1f420358d31..ef9a305053e12 100644 --- a/python/paddle/fluid/tests/unittests/test_generator.py +++ b/python/paddle/fluid/tests/unittests/test_generator.py @@ -30,8 +30,6 @@ def 
test_basic_generator(self): """Test basic generator.""" gen = generator.Generator() gen.manual_seed(123123143) - s = gen.initial_seed() - s = gen.seed() st = gen.get_state() gen.set_state(st) gen.random() diff --git a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py index 7c1ff41f7e767..c36550fca8cab 100644 --- a/python/paddle/fluid/tests/unittests/test_generator_dataloader.py +++ b/python/paddle/fluid/tests/unittests/test_generator_dataloader.py @@ -35,7 +35,7 @@ def random_reader(): def simple_fc_net(places, use_legacy_py_reader, use_double_buffer): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) startup_prog = fluid.Program() main_prog = fluid.Program() diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 3f8eed08adf68..590c3e061f26e 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -269,7 +269,7 @@ def hs_net_conf(self, is_sparse): def training_test(self, is_sparse): with fluid.program_guard(fluid.Program(), fluid.Program()): - paddle.manual_seed(1) + paddle.seed(1) start_up = fluid.default_startup_program() x = np.arange(6).reshape(6) path_table = np.array([(1, 2, -1), (1, 2, -1)]).astype('int64') diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 71381ecfde738..2d1d2949a4eb2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -120,7 +120,7 @@ def test_minimize(self): inp_np = np.random.random(size=[1, 3, 128, 128]).astype(np.float32) def run_simple_conv(inp_np, use_scaler=True): - paddle.manual_seed(10) + paddle.seed(10) paddle.framework.random._manual_program_seed(10) with fluid.dygraph.guard(): model = SimpleConv( @@ -205,7 +205,7 @@ def train_resnet(self, enable_amp=True): paddle.disable_static() - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) resnet = ResNet(use_cudnn=True) @@ -282,7 +282,7 @@ def train_resnet(self, enable_amp=True): batch_num = 1 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) resnet = ResNet(use_cudnn=True) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py index cc6c2f97a9334..04a0e5e4cd10f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_deepcf.py @@ -206,7 +206,7 @@ def test_deefcf(self): else: (users_np, items_np, labels_np, num_users, num_items, matrix) = get_data() - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) startup = fluid.Program() main = fluid.Program() @@ -243,7 +243,7 @@ def test_deefcf(self): sys.stderr.write('static loss %s\n' % static_loss) with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) deepcf = DeepCF(num_users, num_items, matrix) @@ -268,7 +268,7 @@ def test_deefcf(self): sys.stderr.write('dynamic loss: %s %s\n' % (slice, dy_loss)) with fluid.dygraph.guard(): - paddle.manual_seed(seed) + 
paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) deepcf2 = DeepCF(num_users, num_items, matrix) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index 39c6fca89ccbe..600ee6d10e5de 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -311,7 +311,7 @@ def model_f(input): fluid.set_flags({'FLAGS_sort_sum_gradient': True}) with fluid.dygraph.guard(): - paddle.manual_seed(123) + paddle.seed(123) paddle.framework.random._manual_program_seed(123) a = fluid.dygraph.to_variable(value) a.stop_gradient = False @@ -328,7 +328,7 @@ def model_f(input): grad_1 = dx[0].numpy() with fluid.dygraph.guard(): - paddle.manual_seed(123) + paddle.seed(123) paddle.framework.random._manual_program_seed(123) a = fluid.dygraph.to_variable(value) a.stop_gradient = False diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gan.py b/python/paddle/fluid/tests/unittests/test_imperative_gan.py index b752b439f0fa9..189745e7295a8 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gan.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gan.py @@ -56,7 +56,7 @@ def forward(self, inputs): class TestDygraphGAN(unittest.TestCase): def test_gan_float32(self): seed = 90 - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) startup = fluid.Program() discriminate_p = fluid.Program() @@ -131,7 +131,7 @@ def test_gan_float32(self): dy_params = dict() with fluid.dygraph.guard(): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) discriminator = Discriminator() @@ -176,7 +176,7 @@ def test_gan_float32(self): dy_params2 = dict() with fluid.dygraph.guard(): fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) discriminator2 = Discriminator() generator2 = Generator() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py index 4db6f2d0da1d5..c813aeede6fe4 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_gnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_gnn.py @@ -61,7 +61,7 @@ def forward(self, x, adj): class TestDygraphGNN(unittest.TestCase): def test_gnn_float32(self): - paddle.manual_seed(90) + paddle.seed(90) paddle.framework.random._manual_program_seed(90) startup = fluid.Program() main = fluid.Program() @@ -112,7 +112,7 @@ def test_gnn_float32(self): scope.find_var(model.gc.weight.name).get_tensor()) with fluid.dygraph.guard(): - paddle.manual_seed(90) + paddle.seed(90) paddle.framework.random._manual_program_seed(90) features = np.ones([1, 100, 50], dtype=np.float32) @@ -138,7 +138,7 @@ def test_gnn_float32(self): model_gc_weight_value = model.gc.weight.numpy() with fluid.dygraph.guard(): - paddle.manual_seed(90) + paddle.seed(90) paddle.framework.random._manual_program_seed(90) features2 = np.ones([1, 100, 50], dtype=np.float32) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py index ab9a98588f76e..c18dab61fc5ab 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_apply.py @@ -28,11 +28,11 @@ def __init__(self, num_classes=10, 
classifier_activation='softmax'): super(LeNetDygraph, self).__init__() self.num_classes = num_classes self.features = nn.Sequential( - nn.Conv2d( + nn.Conv2D( 1, 6, 3, stride=1, padding=1), nn.ReLU(), paddle.fluid.dygraph.Pool2D(2, 'max', 2), - nn.Conv2d( + nn.Conv2D( 6, 16, 5, stride=1, padding=0), nn.ReLU(), paddle.fluid.dygraph.Pool2D(2, 'max', 2)) @@ -60,7 +60,7 @@ def init_weights(layer): new_bias = paddle.fluid.layers.fill_constant( layer.bias.shape, layer.bias.dtype, value=-0.1) layer.bias.set_value(new_bias) - elif type(layer) == nn.Conv2d: + elif type(layer) == nn.Conv2D: new_weight = paddle.fluid.layers.fill_constant( layer.weight.shape, layer.weight.dtype, value=0.7) layer.weight.set_value(new_weight) @@ -80,7 +80,7 @@ def test_apply_init_weight(self): if type(layer) == nn.Linear: np.testing.assert_allclose(layer.weight.numpy(), 0.9) np.testing.assert_allclose(layer.bias.numpy(), -0.1) - elif type(layer) == nn.Conv2d: + elif type(layer) == nn.Conv2D: np.testing.assert_allclose(layer.weight.numpy(), 0.7) np.testing.assert_allclose(layer.bias.numpy(), -0.2) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py index 95d3b87f0e948..870d48f2fb4b5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_layer_children.py @@ -27,11 +27,11 @@ class LeNetDygraph(fluid.dygraph.Layer): def __init__(self): super(LeNetDygraph, self).__init__() self.features = nn.Sequential( - nn.Conv2d( + nn.Conv2D( 1, 6, 3, stride=1, padding=1), nn.ReLU(), paddle.fluid.dygraph.Pool2D(2, 'max', 2), - nn.Conv2d( + nn.Conv2D( 6, 16, 5, stride=1, padding=0), nn.ReLU(), paddle.fluid.dygraph.Pool2D(2, 'max', 2)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py index f0fea2d7eb75c..e7af249cf8bc4 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_lod_tensor_to_selected_rows.py @@ -95,7 +95,7 @@ def simple_net_float32(self, is_sparse, dtype): for is_sort_sum_gradient in [True, False]: with fluid.dygraph.guard(place): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( @@ -140,7 +140,7 @@ def simple_net_float32(self, is_sparse, dtype): dy_loss_value = dy_loss.numpy() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py index afe50664ef2eb..f256e97e83795 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ocr_attention_model.py @@ -403,7 +403,7 @@ def test_while_op(self): with fluid.dygraph.guard(): fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) ocr_attention = OCRAttention() @@ -454,7 +454,7 @@ def test_while_op(self): dy_param_value[param.name] = param.numpy() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) exe = 
fluid.Executor(fluid.CPUPlace( ) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0)) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py index 7876675bcc6a1..cd019c920756f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer.py @@ -74,7 +74,7 @@ def _check_exception(self, exception_message, place=None): with fluid.dygraph.guard(place): try: - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) mlp = MLP() optimizer = self.get_optimizer_dygraph( @@ -91,7 +91,7 @@ def _check_mlp(self, place=None): ) else fluid.CUDAPlace(0) with fluid.dygraph.guard(place): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) mlp = MLP() @@ -132,7 +132,7 @@ def _check_mlp(self, place=None): dy_param_value[param.name] = param.numpy() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) if place == None: diff --git a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py index e1b7847a6e6dd..4b1e7ec5e69fb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_optimizer_v2.py @@ -74,7 +74,7 @@ def _check_exception(self, exception_message, place=None): try: paddle.disable_static() - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) mlp = MLP() optimizer = self.get_optimizer_dygraph( @@ -93,7 +93,7 @@ def _check_mlp(self, place=None): ) else fluid.CUDAPlace(0) paddle.disable_static(place) - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) mlp = MLP() @@ -142,7 +142,7 @@ def _check_mlp(self, place=None): paddle.enable_static() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) if place == None: diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py index fa23ff8e7c29f..1c183a8c2b74a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn.py @@ -226,7 +226,7 @@ def ptb_rnn_cpu_float32(self, is_sparse): traced_layer = None with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( @@ -294,7 +294,7 @@ def ptb_rnn_cpu_float32(self, is_sparse): dy_last_hidden_value = last_hidden.numpy() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) ptb_model = PtbModel( hidden_size=hidden_size, diff --git a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py index 0487f8dd9a640..e5453eed136c2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_ptb_rnn_sorted_gradient.py @@ -45,7 +45,7 @@ def ptb_rnn_sort_gradient_cpu_float32(self, is_sparse): with fluid.dygraph.guard(): 
fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to @@ -95,7 +95,7 @@ def ptb_rnn_sort_gradient_cpu_float32(self, is_sparse): dy_last_hidden_value = last_hidden.numpy() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) ptb_model = PtbModel( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py index 0076c61e58407..a89628c594de9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_reinforcement.py @@ -64,7 +64,7 @@ def test_mnist_float32(self): mask = np.array(mask_list).astype("float32") with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) policy = Policy(input_size=4) @@ -105,7 +105,7 @@ def test_mnist_float32(self): dy_param_value[param.name] = param.numpy() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py index e8a2298c17d00..2d67af82de87a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet.py @@ -251,7 +251,7 @@ def test_resnet_float32(self): traced_layer = None with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) resnet = ResNet() @@ -334,7 +334,7 @@ def test_resnet_float32(self): dy_param_value[param.name] = param.numpy() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py index 13b12da3318ca..13570d1bf71a5 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_resnet_sorted_gradient.py @@ -78,7 +78,7 @@ def test_resnet_sort_gradient_float32(self): batch_num = 10 with fluid.dygraph.guard(): fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) resnet = ResNet() @@ -137,7 +137,7 @@ def test_resnet_sort_gradient_float32(self): dy_param_value[param.name] = param.numpy() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py index 45709a358635c..6c6b164bdec68 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load.py @@ -219,7 +219,7 @@ def setUp(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to 
ptb_model = PtbModel( @@ -305,7 +305,7 @@ def testLoadAndSetVarBase(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( @@ -414,7 +414,7 @@ def testSetVariable(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( @@ -521,7 +521,7 @@ def testSetNumpy(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( @@ -711,7 +711,7 @@ def testLoadAndSetVarBaseBeforeTrain(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( @@ -802,7 +802,7 @@ def testSetNumpyBeforeTrain(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to diff --git a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py index 0335fa547616e..672ffa9d39418 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_save_load_v2.py @@ -219,7 +219,7 @@ def setUp(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( @@ -308,7 +308,7 @@ def testLoadAndSetVarBase(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( @@ -416,7 +416,7 @@ def testSetVariable(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( @@ -524,7 +524,7 @@ def testSetNumpy(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( @@ -638,7 +638,7 @@ def testSetVariableBeforeTrain(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( @@ -717,7 +717,7 @@ def testLoadAndSetVarBaseBeforeTrain(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( @@ -808,7 +808,7 @@ def testSetNumpyBeforeTrain(self): batch_num = 200 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) # TODO: marsyang1993 Change seed to ptb_model = PtbModel( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py index e47a70054be41..8f8890557ad12 100644 --- 
a/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_se_resnext.py @@ -311,7 +311,7 @@ def test_se_resnext_float32(self): batch_num = 1 epoch_num = 1 with fluid.dygraph.guard(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) se_resnext = SeResNeXt() @@ -372,7 +372,7 @@ def test_se_resnext_float32(self): dy_param_value[param.name] = param.numpy() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) exe = fluid.Executor(fluid.CPUPlace( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py index 794f59e48507e..2f2a3e5de5ef9 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_selected_rows_to_lod_tensor.py @@ -102,7 +102,7 @@ def simple_net_float(self, is_sparse, dtype): for is_sort_sum_gradient in [True, False]: traced_layer = None with fluid.dygraph.guard(place): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( @@ -146,7 +146,7 @@ def simple_net_float(self, is_sparse, dtype): dy_loss_value = dy_loss.numpy() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) simple_net = SimpleNet( diff --git a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py index 1ab37aaed2353..e114961c0cc9a 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_star_gan_with_gradient_penalty.py @@ -468,7 +468,7 @@ def build_optimizer(layer, cfg, loss=None): class DyGraphTrainModel(object): def __init__(self, cfg): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) self.generator = Generator(cfg) @@ -529,7 +529,7 @@ def create_data_layer(): shape=[None, cfg.c_dim], dtype='float32', name='label_trg') return image_real, label_org, label_trg - paddle.manual_seed(cfg.seed) + paddle.seed(cfg.seed) paddle.framework.random._manual_program_seed(cfg.seed) self.gen_program = fluid.Program() gen_startup_program = fluid.Program() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py index 9f58ef881e4e4..57da838c554bb 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_transformer_sorted_gradient.py @@ -951,7 +951,7 @@ def transformer_sort_gradient_float32(self, is_sparse): with guard(): fluid.set_flags({'FLAGS_sort_sum_gradient': True}) - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) transformer = TransFormer( ModelHyperParams.src_vocab_size, @@ -1035,7 +1035,7 @@ def transformer_sort_gradient_float32(self, is_sparse): dy_token_num_value = dy_token_num.numpy() with new_program_scope(): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) transformer = TransFormer( ModelHyperParams.src_vocab_size, 
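The hunks throughout this patch make one mechanical substitution: the global-seed entry point paddle.manual_seed becomes paddle.seed, while the program-level helper paddle.framework.random._manual_program_seed (which the static-graph suites call alongside it) is left untouched. A minimal sketch of the invariant the updated dygraph tests rely on, using only calls that appear in these diffs (paddle.seed returning the global Generator, get_state/set_state, and paddle.randint; the CUDA path uses paddle.get_cuda_rng_state/paddle.set_cuda_rng_state instead):

import numpy as np
import paddle
import paddle.fluid as fluid

fluid.enable_dygraph()

# paddle.seed sets the global seed and returns the Generator,
# just as paddle.manual_seed did before the rename.
gen = paddle.seed(12312321111)

x1 = paddle.randint(low=10, shape=[10], dtype="int32").numpy()
st = gen.get_state()   # snapshot the generator state
x2 = paddle.randint(low=10, shape=[10], dtype="int32").numpy()
gen.set_state(st)      # rewind to the snapshot
x3 = paddle.randint(low=10, shape=[10], dtype="int32").numpy()

# Restoring the state replays the same draw; this is what the
# test_generator_randint_dygraph case in these diffs asserts.
assert np.array_equal(x2, x3)

Note this is a sketch of the CPU dygraph path only; in the static-graph tests above, the seed alone is not enough for reproducibility, which is why they also pin program-level seeds via paddle.framework.random._manual_program_seed.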
diff --git a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py index c75acd7c15b1e..0c43d5693456c 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py @@ -80,7 +80,7 @@ class TestInplaceAddto(unittest.TestCase): def test_result(self): def run_program(enable_addto): np.random.seed(10) - paddle.manual_seed(10) + paddle.seed(10) paddle.framework.random._manual_program_seed(10) if fluid.core.is_compiled_with_cuda(): fluid.set_flags({"FLAGS_cudnn_deterministic": True}) diff --git a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py index c45c144e3ad44..19d0b1ea9895c 100644 --- a/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py +++ b/python/paddle/fluid/tests/unittests/test_instance_norm_op_v2.py @@ -35,22 +35,22 @@ def test_error(self): def error1d(): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') - instance_norm1d = paddle.nn.InstanceNorm1d(1) + instance_norm1d = paddle.nn.InstanceNorm1D(1) instance_norm1d(fluid.dygraph.to_variable(x_data_4)) def error2d(): x_data_3 = np.random.random(size=(2, 1, 3)).astype('float32') - instance_norm2d = paddle.nn.InstanceNorm2d(1) + instance_norm2d = paddle.nn.InstanceNorm2D(1) instance_norm2d(fluid.dygraph.to_variable(x_data_3)) def error3d(): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') - instance_norm3d = paddle.nn.BatchNorm3d(1) + instance_norm3d = paddle.nn.BatchNorm3D(1) instance_norm3d(fluid.dygraph.to_variable(x_data_4)) def weight_bias_false(): x_data_4 = np.random.random(size=(2, 1, 3, 3)).astype('float32') - instance_norm3d = paddle.nn.BatchNorm3d( + instance_norm3d = paddle.nn.BatchNorm3D( 1, weight_attr=False, bias_attr=False) with fluid.dygraph.guard(p): @@ -75,7 +75,7 @@ def compute_v1(x): def compute_v2(x): with fluid.dygraph.guard(p): - bn = paddle.nn.InstanceNorm2d(shape[1]) + bn = paddle.nn.InstanceNorm2D(shape[1]) y = bn(fluid.dygraph.to_variable(x)) return y.numpy() @@ -104,7 +104,7 @@ def compute_v1(x_np): def compute_v2(x_np): with program_guard(Program(), Program()): - ins = paddle.nn.InstanceNorm2d(shape[1]) + ins = paddle.nn.InstanceNorm2D(shape[1]) x = fluid.data(name='x', shape=x_np.shape, dtype=x_np.dtype) y = ins(x) exe.run(fluid.default_startup_program()) diff --git a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py index eaa7e711a29c7..0ace288d9d429 100644 --- a/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py +++ b/python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py @@ -37,7 +37,7 @@ def check_network_convergence(self, use_cuda=True, use_mem_opt=False, iter_num=5): - paddle.manual_seed(100) + paddle.seed(100) paddle.framework.random._manual_program_seed(100) prog = Program() startup_prog = Program() diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index 71ec1271a041e..ac9a3f06f8f3e 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -222,7 +222,7 @@ def setUp(self): # enable dygraph mode fluid.enable_dygraph() # config seed - paddle.manual_seed(SEED) + paddle.seed(SEED) 
paddle.framework.random._manual_program_seed(SEED) def train_and_save_model(self, model_path=None): @@ -370,7 +370,7 @@ def setUp(self): # enable dygraph mode fluid.enable_dygraph() # config seed - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) def test_output_spec(self): @@ -429,7 +429,7 @@ def setUp(self): # enable dygraph mode fluid.enable_dygraph() # config seed - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) # train and save base model self.train_and_save_orig_model() @@ -457,7 +457,7 @@ def setUp(self): # enable dygraph mode fluid.enable_dygraph() # config seed - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) def train_and_save(self): @@ -512,7 +512,7 @@ def setUp(self): # enable dygraph mode fluid.enable_dygraph() # config seed - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) def verify_inference_correctness(self, layer, model_path, with_label=False): diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index e3f477c1d9b5e..3908d65229afe 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -57,7 +57,7 @@ def _get_place(self, force_to_use_cpu=False): @contextlib.contextmanager def static_graph(self): with new_program_scope(): - paddle.manual_seed(self.seed) + paddle.seed(self.seed) paddle.framework.random._manual_program_seed(self.seed) yield @@ -77,7 +77,7 @@ def get_static_graph_result(self, def dynamic_graph(self, force_to_use_cpu=False): with fluid.dygraph.guard( self._get_place(force_to_use_cpu=force_to_use_cpu)): - paddle.manual_seed(self.seed) + paddle.seed(self.seed) paddle.framework.random._manual_program_seed(self.seed) yield diff --git a/python/paddle/fluid/tests/unittests/test_manual_seed.py b/python/paddle/fluid/tests/unittests/test_manual_seed.py index a1d6eb915ce78..75753dcd1e880 100644 --- a/python/paddle/fluid/tests/unittests/test_manual_seed.py +++ b/python/paddle/fluid/tests/unittests/test_manual_seed.py @@ -17,16 +17,16 @@ import paddle import paddle.fluid as fluid -from paddle.framework import manual_seed +from paddle.framework import seed from paddle.fluid.framework import Program, default_main_program, default_startup_program import numpy as np class TestManualSeed(unittest.TestCase): - def test_manual_seed(self): + def test_seed(self): fluid.enable_dygraph() - gen = paddle.manual_seed(12312321111) + gen = paddle.seed(12312321111) x = fluid.layers.gaussian_random([10], dtype="float32") st1 = gen.get_state() x1 = fluid.layers.gaussian_random([10], dtype="float32") diff --git a/python/paddle/fluid/tests/unittests/test_normal.py b/python/paddle/fluid/tests/unittests/test_normal.py index 595e0bb480051..79632817662c5 100644 --- a/python/paddle/fluid/tests/unittests/test_normal.py +++ b/python/paddle/fluid/tests/unittests/test_normal.py @@ -18,7 +18,7 @@ import copy np.random.seed(10) -paddle.manual_seed(10) +paddle.seed(10) class TestNormalAPI(unittest.TestCase): @@ -61,7 +61,8 @@ def static_api(self): if isinstance(self.mean, np.ndarray) \ and isinstance(self.std, np.ndarray): with paddle.static.program_guard(paddle.static.Program()): - mean = paddle.fluid.data('Mean', self.mean.shape, self.mean.dtype) + mean = paddle.fluid.data('Mean', self.mean.shape, + self.mean.dtype) std = paddle.fluid.data('Std', self.std.shape, self.std.dtype) out = 
paddle.normal(mean, std, self.shape) @@ -76,7 +77,8 @@ def static_api(self): return ret_all elif isinstance(self.mean, np.ndarray): with paddle.static.program_guard(paddle.static.Program()): - mean = paddle.fluid.data('Mean', self.mean.shape, self.mean.dtype) + mean = paddle.fluid.data('Mean', self.mean.shape, + self.mean.dtype) out = paddle.normal(mean, self.std, self.shape) exe = paddle.static.Executor(self.place) diff --git a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py index fee3494558604..e211a38e7ec4c 100644 --- a/python/paddle/fluid/tests/unittests/test_paddle_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_paddle_save_load.py @@ -73,7 +73,7 @@ def setUp(self): paddle.disable_static() # config seed - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) def build_and_train_model(self): diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index c1169dfc5210a..cc2490d1f1245 100644 --- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -105,7 +105,7 @@ def avg_pool1D_forward_naive(x, return out -class TestPool1d_API(unittest.TestCase): +class TestPool1D_API(unittest.TestCase): def setUp(self): np.random.seed(123) self.places = [fluid.CPUPlace()] @@ -138,7 +138,7 @@ def check_avg_dygraph_results(self, place): self.assertTrue(np.allclose(result.numpy(), result_np)) - avg_pool1d_dg = paddle.nn.layer.AvgPool1d( + avg_pool1d_dg = paddle.nn.layer.AvgPool1D( kernel_size=2, stride=None, padding=0) result = avg_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -159,7 +159,7 @@ def check_avg_dygraph_padding_results(self, place): self.assertTrue(np.allclose(result.numpy(), result_np)) - avg_pool1d_dg = paddle.nn.AvgPool1d( + avg_pool1d_dg = paddle.nn.AvgPool1D( kernel_size=2, stride=None, padding=1, count_include_pad=True) result = avg_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -190,7 +190,7 @@ def check_max_dygraph_results(self, place): self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool1d_dg = paddle.nn.layer.MaxPool1d( + max_pool1d_dg = paddle.nn.layer.MaxPool1D( kernel_size=2, stride=None, padding=0) result = max_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -207,7 +207,7 @@ def check_max_dygraph_return_index_results(self, place): self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool1d_dg = paddle.nn.layer.MaxPool1d( + max_pool1d_dg = paddle.nn.layer.MaxPool1D( kernel_size=2, stride=None, padding=0) result = max_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -248,7 +248,7 @@ def test_pool1d(self): self.check_max_dygraph_return_index_results(place) -class TestPool2dError_API(unittest.TestCase): +class TestPool2DError_API(unittest.TestCase): def test_error_api(self): def run1(): with fluid.dygraph.guard(): diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py index 91faf78418b0d..66505327c2df3 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py @@ -22,7 +22,7 @@ import paddle -class TestPool2d_API(unittest.TestCase): +class TestPool2D_API(unittest.TestCase): def setUp(self): np.random.seed(123) self.places = [fluid.CPUPlace()] @@ -63,7 
+63,7 @@ def check_avg_dygraph_results(self, place): pool_type='avg') self.assertTrue(np.allclose(result.numpy(), result_np)) - avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + avg_pool2d_dg = paddle.nn.layer.AvgPool2D( kernel_size=2, stride=2, padding=0) result = avg_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -84,7 +84,7 @@ def check_avg_dygraph_padding_results(self, place): exclusive=False) self.assertTrue(np.allclose(result.numpy(), result_np)) - avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + avg_pool2d_dg = paddle.nn.layer.AvgPool2D( kernel_size=2, stride=2, padding=1, ceil_mode=False) result = avg_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -104,7 +104,7 @@ def check_avg_dygraph_ceilmode_results(self, place): ceil_mode=True) self.assertTrue(np.allclose(result.numpy(), result_np)) - avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + avg_pool2d_dg = paddle.nn.layer.AvgPool2D( kernel_size=2, stride=2, padding=0, ceil_mode=True) result = avg_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -144,7 +144,7 @@ def check_max_dygraph_results(self, place): pool_type='max') self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool2d_dg = paddle.nn.layer.MaxPool2d( + max_pool2d_dg = paddle.nn.layer.MaxPool2D( kernel_size=2, stride=2, padding=0) result = max_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -188,7 +188,7 @@ def check_max_dygraph_padding_results(self, place): exclusive=False) self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool2d_dg = paddle.nn.layer.MaxPool2d( + max_pool2d_dg = paddle.nn.layer.MaxPool2D( kernel_size=2, stride=2, padding=1, ceil_mode=False) result = max_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -208,7 +208,7 @@ def check_max_dygraph_ceilmode_results(self, place): ceil_mode=True) self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool2d_dg = paddle.nn.layer.MaxPool2d( + max_pool2d_dg = paddle.nn.layer.MaxPool2D( kernel_size=2, stride=2, padding=0, ceil_mode=True) result = max_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -233,7 +233,7 @@ def check_max_dygraph_stride_is_none(self, place): padding_algorithm="SAME") self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool2d_dg = paddle.nn.layer.MaxPool2d( + max_pool2d_dg = paddle.nn.layer.MaxPool2D( kernel_size=2, stride=2, padding=0) result = max_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -254,7 +254,7 @@ def check_avg_dygraph_stride_is_none(self, place): padding_algorithm="SAME") self.assertTrue(np.allclose(result.numpy(), result_np)) - avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + avg_pool2d_dg = paddle.nn.layer.AvgPool2D( kernel_size=2, stride=2, padding=0) result = avg_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -279,7 +279,7 @@ def check_max_dygraph_padding(self, place): pool_type='max') self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool2d_dg = paddle.nn.layer.MaxPool2d( + max_pool2d_dg = paddle.nn.layer.MaxPool2D( kernel_size=2, stride=2, padding=0) result = max_pool2d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -304,7 +304,7 @@ def check_avg_divisor(self, place): pool_type='avg') self.assertTrue(np.allclose(result.numpy(), result_np)) - avg_pool2d_dg = paddle.nn.layer.AvgPool2d( + avg_pool2d_dg = paddle.nn.layer.AvgPool2D( kernel_size=2, stride=2, padding=0) result = avg_pool2d_dg(input) 
self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -325,7 +325,7 @@ def test_pool2d(self): self.check_max_dygraph_nhwc_results(place) -class TestPool2dError_API(unittest.TestCase): +class TestPool2DError_API(unittest.TestCase): def test_error_api(self): def run1(): with fluid.dygraph.guard(): diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 5e8828c3e9126..8553fa8b99a92 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -1018,7 +1018,7 @@ def init_shape(self): # ----- test API -class TestPool2dAPI(unittest.TestCase): +class TestPool2DAPI(unittest.TestCase): def test_api(self): x_NHWC = np.random.random([2, 5, 5, 3]).astype("float32") x_NCHW = np.random.random([2, 3, 5, 5]).astype("float32") @@ -1237,7 +1237,7 @@ def test_api(self): data_format="NHWC")) -class TestPool2dAPI_Error(unittest.TestCase): +class TestPool2DAPI_Error(unittest.TestCase): def test_api(self): input_NHWC = fluid.layers.data( name="input_NHWC", diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py index 505a1c7383841..b2700303ee477 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py @@ -25,7 +25,7 @@ from test_pool3d_op import adaptive_start_index, adaptive_end_index, pool3D_forward_naive, avg_pool3D_forward_naive, max_pool3D_forward_naive -class TestPool3d_API(unittest.TestCase): +class TestPool3D_API(unittest.TestCase): def setUp(self): np.random.seed(123) self.places = [fluid.CPUPlace()] @@ -68,7 +68,7 @@ def check_avg_dygraph_results(self, place): self.assertTrue(np.allclose(result.numpy(), result_np)) - avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + avg_pool3d_dg = paddle.nn.layer.AvgPool3D( kernel_size=2, stride=None, padding="SAME") result = avg_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -95,7 +95,7 @@ def check_avg_dygraph_padding_results(self, place): self.assertTrue(np.allclose(result.numpy(), result_np)) - avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + avg_pool3d_dg = paddle.nn.layer.AvgPool3D( kernel_size=2, stride=None, padding=1, @@ -120,7 +120,7 @@ def check_avg_dygraph_ceilmode_results(self, place): self.assertTrue(np.allclose(result.numpy(), result_np)) - avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + avg_pool3d_dg = paddle.nn.layer.AvgPool3D( kernel_size=2, stride=None, padding=0, ceil_mode=True) result = avg_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -159,7 +159,7 @@ def check_max_dygraph_results(self, place): pool_type='max') self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool3d_dg = paddle.nn.layer.MaxPool3d( + max_pool3d_dg = paddle.nn.layer.MaxPool3D( kernel_size=2, stride=None, padding=0) result = max_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -204,7 +204,7 @@ def check_max_dygraph_ceilmode_results(self, place): self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool3d_dg = paddle.nn.layer.MaxPool3d( + max_pool3d_dg = paddle.nn.layer.MaxPool3D( kernel_size=2, stride=None, padding=0, ceil_mode=True) result = max_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -225,7 +225,7 @@ def check_max_dygraph_padding_results(self, place): self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool3d_dg = paddle.nn.layer.MaxPool3d( + max_pool3d_dg = 
paddle.nn.layer.MaxPool3D( kernel_size=2, stride=None, padding=1, ceil_mode=False) result = max_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -250,7 +250,7 @@ def check_max_dygraph_stride_is_none(self, place): padding_algorithm="SAME") self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool3d_dg = paddle.nn.layer.MaxPool3d( + max_pool3d_dg = paddle.nn.layer.MaxPool3D( kernel_size=2, stride=2, padding=0) result = max_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -270,7 +270,7 @@ def check_max_dygraph_padding(self, place): pool_type='max') self.assertTrue(np.allclose(result.numpy(), result_np)) - max_pool3d_dg = paddle.nn.layer.MaxPool3d( + max_pool3d_dg = paddle.nn.layer.MaxPool3D( kernel_size=2, stride=2, padding=0) result = max_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -299,7 +299,7 @@ def check_avg_divisor(self, place): pool_type='avg') self.assertTrue(np.allclose(result.numpy(), result_np)) - avg_pool3d_dg = paddle.nn.layer.AvgPool3d( + avg_pool3d_dg = paddle.nn.layer.AvgPool3D( kernel_size=2, stride=2, padding=0) result = avg_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -327,7 +327,7 @@ def test_pool3d(self): self.check_max_dygraph_ceilmode_results(place) -class TestPool3dError_API(unittest.TestCase): +class TestPool3DError_API(unittest.TestCase): def test_error_api(self): def run1(): with fluid.dygraph.guard(): diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index eab7126c7a422..fade1691210a4 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -219,7 +219,7 @@ def avg_pool3D_forward_naive(x, return out -class TestPool3d_Op(OpTest): +class TestPool3D_Op(OpTest): def setUp(self): self.op_type = "pool3d" self.init_kernel_type() @@ -312,7 +312,7 @@ def init_adaptive(self): self.adaptive = False -class TestCase1(TestPool3d_Op): +class TestCase1(TestPool3D_Op): def init_shape(self): self.shape = [2, 3, 7, 7, 7] @@ -330,7 +330,7 @@ def init_global_pool(self): self.global_pool = False -class TestCase2(TestPool3d_Op): +class TestCase2(TestPool3D_Op): def init_shape(self): self.shape = [2, 3, 6, 7, 7] @@ -348,7 +348,7 @@ def init_global_pool(self): self.global_pool = False -class TestCase3(TestPool3d_Op): +class TestCase3(TestPool3D_Op): def init_pool_type(self): self.pool_type = "max" @@ -378,7 +378,7 @@ def init_kernel_type(self): globals()[cls_name] = TestCUDNNCase -create_test_cudnn_class(TestPool3d_Op) +create_test_cudnn_class(TestPool3D_Op) create_test_cudnn_class(TestCase1) create_test_cudnn_class(TestCase2) create_test_cudnn_class(TestCase3) @@ -405,7 +405,7 @@ def test_check_output(self): globals()[cls_name] = TestCUDNNFp16Case -create_test_cudnn_fp16_class(TestPool3d_Op) +create_test_cudnn_fp16_class(TestPool3D_Op) create_test_cudnn_fp16_class(TestCase1) create_test_cudnn_fp16_class(TestCase2) create_test_cudnn_fp16_class(TestCase3) @@ -429,7 +429,7 @@ def init_ceil_mode(self): globals()[cls_name] = TestPool3DUseCeilCase -create_test_cudnn_use_ceil_class(TestPool3d_Op) +create_test_cudnn_use_ceil_class(TestPool3D_Op) create_test_cudnn_use_ceil_class(TestCase1) @@ -480,7 +480,7 @@ def init_test_case(self): #-------test pool3d with asymmetric padding------ -class TestPool3d_Op_AsyPadding(TestPool3d_Op): +class TestPool3D_Op_AsyPadding(TestPool3D_Op): def init_test_case(self): self.ksize = [3, 4, 3] 
self.strides = [1, 1, 2] @@ -552,21 +552,21 @@ def init_shape(self): self.shape = [2, 3, 7, 7, 7] -create_test_cudnn_class(TestPool3d_Op_AsyPadding) +create_test_cudnn_class(TestPool3D_Op_AsyPadding) create_test_cudnn_class(TestCase1_AsyPadding) create_test_cudnn_class(TestCase2_AsyPadding) create_test_cudnn_class(TestCase3_AsyPadding) create_test_cudnn_class(TestCase4_AsyPadding) create_test_cudnn_class(TestCase5_AsyPadding) -create_test_cudnn_fp16_class(TestPool3d_Op_AsyPadding) +create_test_cudnn_fp16_class(TestPool3D_Op_AsyPadding) create_test_cudnn_fp16_class(TestCase1_AsyPadding) create_test_cudnn_fp16_class(TestCase2_AsyPadding) create_test_cudnn_fp16_class(TestCase3_AsyPadding) create_test_cudnn_fp16_class(TestCase4_AsyPadding) create_test_cudnn_fp16_class(TestCase5_AsyPadding) -create_test_cudnn_use_ceil_class(TestPool3d_Op_AsyPadding) +create_test_cudnn_use_ceil_class(TestPool3D_Op_AsyPadding) create_test_cudnn_use_ceil_class(TestCase1_AsyPadding) create_test_use_ceil_class(TestCase1_AsyPadding) @@ -606,7 +606,7 @@ def init_paddings(self): # ------------ test channel_last -------------- -class TestPool3d_channel_last(TestPool3d_Op): +class TestPool3D_channel_last(TestPool3D_Op): def init_data_format(self): self.data_format = "NDHWC" @@ -654,14 +654,14 @@ def init_shape(self): self.shape = [2, 7, 7, 7, 3] -create_test_cudnn_class(TestPool3d_channel_last) +create_test_cudnn_class(TestPool3D_channel_last) create_test_cudnn_class(TestCase1_channel_last) create_test_cudnn_class(TestCase2_channel_last) create_test_cudnn_class(TestCase3_channel_last) create_test_cudnn_class(TestCase4_channel_last) create_test_cudnn_class(TestCase5_channel_last) -create_test_cudnn_use_ceil_class(TestPool3d_channel_last) +create_test_cudnn_use_ceil_class(TestPool3D_channel_last) create_test_cudnn_use_ceil_class(TestCase1_channel_last) create_test_use_ceil_class(TestCase1_channel_last) @@ -716,7 +716,7 @@ def init_adaptive(self): # --- asy padding -class TestPool3d_Op_AsyPadding_channel_last(TestPool3d_Op_AsyPadding): +class TestPool3D_Op_AsyPadding_channel_last(TestPool3D_Op_AsyPadding): def init_data_format(self): self.data_format = "NDHWC" @@ -764,14 +764,14 @@ def init_shape(self): self.shape = [2, 7, 8, 6, 3] -create_test_cudnn_class(TestPool3d_Op_AsyPadding_channel_last) +create_test_cudnn_class(TestPool3D_Op_AsyPadding_channel_last) create_test_cudnn_class(TestCase1_AsyPadding_channel_last) create_test_cudnn_class(TestCase2_AsyPadding_channel_last) create_test_cudnn_class(TestCase3_AsyPadding_channel_last) create_test_cudnn_class(TestCase4_AsyPadding_channel_last) create_test_cudnn_class(TestCase5_AsyPadding_channel_last) -create_test_cudnn_use_ceil_class(TestPool3d_Op_AsyPadding_channel_last) +create_test_cudnn_use_ceil_class(TestPool3D_Op_AsyPadding_channel_last) create_test_cudnn_use_ceil_class(TestCase1_AsyPadding_channel_last) create_test_use_ceil_class(TestCase1_AsyPadding_channel_last) @@ -812,14 +812,14 @@ def init_paddings(self): globals()[cls_name] = TestPaddingSMAECase -create_test_padding_SAME_class(TestPool3d_Op) +create_test_padding_SAME_class(TestPool3D_Op) create_test_padding_SAME_class(TestCase1) create_test_padding_SAME_class(TestCase2) create_test_padding_SAME_class(TestCase3) create_test_padding_SAME_class(TestCase4) create_test_padding_SAME_class(TestCase5) -create_test_padding_SAME_class(TestPool3d_channel_last) +create_test_padding_SAME_class(TestPool3D_channel_last) create_test_padding_SAME_class(TestCase1_channel_last) create_test_padding_SAME_class(TestCase2_channel_last) 
create_test_padding_SAME_class(TestCase3_channel_last) @@ -843,14 +843,14 @@ def init_paddings(self): globals()[cls_name] = TestCUDNNPaddingSMAECase -create_test_cudnn_padding_SAME_class(TestPool3d_Op) +create_test_cudnn_padding_SAME_class(TestPool3D_Op) create_test_cudnn_padding_SAME_class(TestCase1) create_test_cudnn_padding_SAME_class(TestCase2) create_test_cudnn_padding_SAME_class(TestCase3) create_test_cudnn_padding_SAME_class(TestCase4) create_test_cudnn_padding_SAME_class(TestCase5) -create_test_cudnn_padding_SAME_class(TestPool3d_channel_last) +create_test_cudnn_padding_SAME_class(TestPool3D_channel_last) create_test_cudnn_padding_SAME_class(TestCase1_channel_last) create_test_cudnn_padding_SAME_class(TestCase2_channel_last) create_test_cudnn_padding_SAME_class(TestCase3_channel_last) @@ -869,14 +869,14 @@ def init_paddings(self): globals()[cls_name] = TestPaddingVALIDCase -create_test_padding_VALID_class(TestPool3d_Op) +create_test_padding_VALID_class(TestPool3D_Op) create_test_padding_VALID_class(TestCase1) create_test_padding_VALID_class(TestCase2) create_test_padding_VALID_class(TestCase3) create_test_padding_VALID_class(TestCase4) create_test_padding_VALID_class(TestCase5) -create_test_padding_VALID_class(TestPool3d_channel_last) +create_test_padding_VALID_class(TestPool3D_channel_last) create_test_padding_VALID_class(TestCase1_channel_last) create_test_padding_VALID_class(TestCase2_channel_last) create_test_padding_VALID_class(TestCase3_channel_last) @@ -900,14 +900,14 @@ def init_paddings(self): globals()[cls_name] = TestCUDNNPaddingVALIDCase -create_test_cudnn_padding_VALID_class(TestPool3d_Op) +create_test_cudnn_padding_VALID_class(TestPool3D_Op) create_test_cudnn_padding_VALID_class(TestCase1) create_test_cudnn_padding_VALID_class(TestCase2) create_test_cudnn_padding_VALID_class(TestCase3) create_test_cudnn_padding_VALID_class(TestCase4) create_test_cudnn_padding_VALID_class(TestCase5) -create_test_cudnn_padding_VALID_class(TestPool3d_channel_last) +create_test_cudnn_padding_VALID_class(TestPool3D_channel_last) create_test_cudnn_padding_VALID_class(TestCase1_channel_last) create_test_cudnn_padding_VALID_class(TestCase2_channel_last) create_test_cudnn_padding_VALID_class(TestCase3_channel_last) @@ -916,7 +916,7 @@ def init_paddings(self): #test API -class TestPool3dAPI(unittest.TestCase): +class TestPool3DAPI(unittest.TestCase): def test_api(self): x_NDHWC = np.random.random([2, 5, 5, 5, 3]).astype("float32") x_NCDHW = np.random.random([2, 3, 5, 5, 5]).astype("float32") @@ -1101,7 +1101,7 @@ def test_api(self): atol=1e-05) -class TestPool3dAPI_Error(unittest.TestCase): +class TestPool3DAPI_Error(unittest.TestCase): def test_api(self): input_NDHWC = fluid.layers.data( name="input_NDHWC", diff --git a/python/paddle/fluid/tests/unittests/test_py_func_op.py b/python/paddle/fluid/tests/unittests/test_py_func_op.py index 32d8f73552f71..14b0eec9cbcdd 100644 --- a/python/paddle/fluid/tests/unittests/test_py_func_op.py +++ b/python/paddle/fluid/tests/unittests/test_py_func_op.py @@ -147,7 +147,7 @@ def test_main(use_cuda, use_py_func_op, use_parallel_executor): with fluid.program_guard(fluid.Program(), fluid.Program()): with fluid.scope_guard(fluid.core.Scope()): - gen = paddle.manual_seed(1) + gen = paddle.seed(1) np.random.seed(1) img = fluid.layers.data(name='image', shape=[784], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='int64') diff --git a/python/paddle/fluid/tests/unittests/test_random_seed.py 
b/python/paddle/fluid/tests/unittests/test_random_seed.py index 343508bf619b6..2a759d5b5464c 100644 --- a/python/paddle/fluid/tests/unittests/test_random_seed.py +++ b/python/paddle/fluid/tests/unittests/test_random_seed.py @@ -35,7 +35,7 @@ def test_generator_uniform_random_dygraph(self): fluid.enable_dygraph() - gen = paddle.manual_seed(12312321111) + gen = paddle.seed(12312321111) x = fluid.layers.uniform_random([10], dtype="float32", min=0.0, max=1.0) st1 = gen.get_state() @@ -47,7 +47,7 @@ def test_generator_uniform_random_dygraph(self): x2 = fluid.layers.uniform_random( [10], dtype="float32", min=0.0, max=1.0) - paddle.manual_seed(12312321111) + paddle.seed(12312321111) x3 = fluid.layers.uniform_random( [10], dtype="float32", min=0.0, max=1.0) @@ -63,7 +63,7 @@ def test_generator_uniform_random_dygraph(self): def test_generator_uniform_random_static(self): fluid.disable_dygraph() - gen = paddle.manual_seed(123123143) + gen = paddle.seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -97,7 +97,7 @@ def test_generator_uniform_random_static(self): def test_gen_dropout_dygraph(self): fluid.enable_dygraph() - gen = paddle.manual_seed(111111111) + gen = paddle.seed(111111111) st = gen.get_state() # x = np.arange(1,101).reshape(2,50).astype("float32") x = fluid.layers.uniform_random( @@ -118,7 +118,7 @@ def test_gen_dropout_dygraph(self): def test_gen_dropout_static(self): fluid.disable_dygraph() - gen = paddle.manual_seed(123123143) + gen = paddle.seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -144,7 +144,7 @@ def test_generator_gaussian_random_dygraph(self): """Test Generator seed.""" fluid.enable_dygraph() - gen = paddle.manual_seed(12312321111) + gen = paddle.seed(12312321111) x = fluid.layers.gaussian_random([10], dtype="float32") st1 = gen.get_state() x1 = fluid.layers.gaussian_random([10], dtype="float32") @@ -165,7 +165,7 @@ def test_generator_gaussian_random_dygraph(self): def test_generator_gaussian_random_static(self): fluid.disable_dygraph() - gen = paddle.manual_seed(123123143) + gen = paddle.seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -203,7 +203,7 @@ def test_generator_randint_dygraph(self): fluid.enable_dygraph() - gen = paddle.manual_seed(12312321111) + gen = paddle.seed(12312321111) x = paddle.randint(low=10, shape=[10], dtype="int32") st1 = gen.get_state() x1 = paddle.randint(low=10, shape=[10], dtype="int32") @@ -224,7 +224,7 @@ def test_generator_randint_dygraph(self): def test_generator_uniform_random_static(self): fluid.disable_dygraph() - gen = paddle.manual_seed(123123143) + gen = paddle.seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -259,7 +259,7 @@ def test_generator_randint_dygraph(self): """Test Generator seed.""" fluid.enable_dygraph() - gen = paddle.manual_seed(12312321111) + gen = paddle.seed(12312321111) x = paddle.randint(low=1) st1 = gen.get_state() x1 = paddle.randint(low=1) @@ -278,7 +278,7 @@ def test_generator_randint_dygraph(self): def test_generator_ranint_static(self): fluid.disable_dygraph() - gen = paddle.manual_seed(123123143) + gen = paddle.seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -315,7 +315,7 @@ def test_generator_randperm_dygraph(self): fluid.enable_dygraph() - gen = paddle.manual_seed(12312321111) + gen = paddle.seed(12312321111) x = paddle.randperm(10) st1 = gen.get_state() x1 = paddle.randperm(10) @@ -337,7 +337,7 @@ def 
test_generator_randperm_static(self): fluid.disable_dygraph() - paddle.manual_seed(123123143) + paddle.seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -353,7 +353,7 @@ def test_generator_randperm_static(self): feed={}, fetch_list=[result_1, result_2]) - paddle.manual_seed(123123143) + paddle.seed(123123143) out2 = exe.run(train_program, feed={}, fetch_list=[result_1, result_2]) @@ -371,7 +371,7 @@ def test_generator_randperm_static(self): def test_generator_sampling_id_dygraph(self): """Test Generator seed.""" - gen = paddle.manual_seed(12312321111) + gen = paddle.seed(12312321111) fluid.enable_dygraph() @@ -409,7 +409,7 @@ def test_generator_randperm_static(self): fluid.disable_dygraph() - paddle.manual_seed(123123143) + paddle.seed(123123143) startup_program = fluid.Program() train_program = fluid.Program() @@ -426,7 +426,7 @@ def test_generator_randperm_static(self): feed={}, fetch_list=[result_1, result_2]) - paddle.manual_seed(123123143) + paddle.seed(123123143) out2 = exe.run(train_program, feed={}, fetch_list=[result_1, result_2]) @@ -445,7 +445,7 @@ def test_generator_randperm_static(self): def test_gen_TruncatedNormal_initializer(self): fluid.disable_dygraph() - gen = paddle.manual_seed(123123143) + gen = paddle.seed(123123143) cur_state = gen.get_state() startup_program = fluid.Program() diff --git a/python/paddle/fluid/tests/unittests/test_regularizer.py b/python/paddle/fluid/tests/unittests/test_regularizer.py index 167a8a017c24a..04c6e45625724 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer.py @@ -169,7 +169,7 @@ def run_program(self, place, feed_list): return param_sum def check_l2decay_regularizer(self, place, model): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() @@ -189,7 +189,7 @@ def check_l2decay_regularizer(self, place, model): return param_sum def check_l2decay(self, place, model): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() @@ -246,7 +246,7 @@ def test_repeated_regularization(self): with fluid.dygraph.guard(): input = fluid.dygraph.to_variable( np.random.randn(3, 2).astype('float32')) - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) linear1 = fluid.dygraph.Linear( diff --git a/python/paddle/fluid/tests/unittests/test_regularizer_api.py b/python/paddle/fluid/tests/unittests/test_regularizer_api.py index 76186d2e39fea..e00a97aaa17f4 100644 --- a/python/paddle/fluid/tests/unittests/test_regularizer_api.py +++ b/python/paddle/fluid/tests/unittests/test_regularizer_api.py @@ -94,7 +94,7 @@ def run_program(self, place, feed_list): return param_sum def check_l2decay_regularizer(self, place, model): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() @@ -114,7 +114,7 @@ def check_l2decay_regularizer(self, place, model): return param_sum def check_l2decay(self, place, model): - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) main_prog = fluid.framework.Program() startup_prog = fluid.framework.Program() @@ -171,7 +171,7 @@ def test_repeated_regularization(self): with fluid.dygraph.guard(): input = fluid.dygraph.to_variable( 
np.random.randn(3, 2).astype('float32')) - paddle.manual_seed(1) + paddle.seed(1) paddle.framework.random._manual_program_seed(1) linear1 = fluid.dygraph.Linear( diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py index 3e1dd4ef57320..de94e0b0fcd2d 100644 --- a/python/paddle/fluid/tests/unittests/test_retain_graph.py +++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py @@ -20,13 +20,13 @@ paddle.disable_static() SEED = 2020 np.random.seed(SEED) -paddle.manual_seed(SEED) +paddle.seed(SEED) class Generator(fluid.dygraph.Layer): def __init__(self): super(Generator, self).__init__() - self.conv1 = paddle.nn.Conv2d(3, 3, 3, padding=1) + self.conv1 = paddle.nn.Conv2D(3, 3, 3, padding=1) def forward(self, x): x = self.conv1(x) @@ -37,7 +37,7 @@ def forward(self, x): class Discriminator(fluid.dygraph.Layer): def __init__(self): super(Discriminator, self).__init__() - self.convd = paddle.nn.Conv2d(6, 3, 1) + self.convd = paddle.nn.Conv2D(6, 3, 1) def forward(self, x): x = self.convd(x) diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index 066d0a37e1361..304e7cd9a5c32 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -617,7 +617,7 @@ def _calc_output(self, place, mode="test", dygraph=True): fluid.enable_dygraph(place) else: fluid.disable_dygraph() - gen = paddle.manual_seed(self._random_seed) + gen = paddle.seed(self._random_seed) gen._is_init_py = False paddle.framework.random._manual_program_seed(self._random_seed) scope = fluid.core.Scope() diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index 1c11e831b0ad3..bfd22dbe1cee6 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -228,12 +228,12 @@ def test_convert(self): with program_guard(Program(), Program()): compare_model = paddle.nn.Sequential( - paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5)) + paddle.nn.Conv2D(3, 5, 3), paddle.nn.BatchNorm2D(5)) model = paddle.nn.Sequential( - paddle.nn.Conv2d(3, 5, 3), paddle.nn.BatchNorm2d(5)) + paddle.nn.Conv2D(3, 5, 3), paddle.nn.BatchNorm2D(5)) model = paddle.nn.SyncBatchNorm.convert_sync_batchnorm(model) for idx, sublayer in enumerate(compare_model.sublayers()): - if isinstance(sublayer, paddle.nn.BatchNorm2d): + if isinstance(sublayer, paddle.nn.BatchNorm2D): self.assertEqual( isinstance(model[idx], paddle.nn.SyncBatchNorm), True) diff --git a/python/paddle/fluid/tests/unittests/test_transformer_api.py b/python/paddle/fluid/tests/unittests/test_transformer_api.py index 3133aad0f4853..23df03da1e5bc 100644 --- a/python/paddle/fluid/tests/unittests/test_transformer_api.py +++ b/python/paddle/fluid/tests/unittests/test_transformer_api.py @@ -211,7 +211,7 @@ def ffn(src, encoder_layer, ffn_fc1_act="relu"): class TestTransformer(unittest.TestCase): def test_multi_head_attention(self): def multihead_attention_test_helper(self_attention, cache): - paddle.manual_seed(2020) + paddle.seed(2020) paddle.framework.random._manual_program_seed(2020) # self_attention|cross_attention, cache|No cache with fluid.dygraph.guard(fluid.CPUPlace()): @@ -275,7 +275,7 @@ def multihead_attention_test_helper(self_attention, cache): def test_transformer_encoder_layer(self): with 
fluid.dygraph.guard(fluid.CPUPlace()): - paddle.framework.manual_seed(2020) + paddle.framework.seed(2020) paddle.framework.random._manual_program_seed(2020) ffn_fc1_act = "relu" @@ -320,7 +320,7 @@ def test_transformer_encoder_layer(self): def test_transformer_decoder_layer(self): with fluid.dygraph.guard(fluid.CPUPlace()): - paddle.framework.manual_seed(2020) + paddle.framework.seed(2020) activation = "relu" normalize_before = False batch_size, d_model, n_head, dim_feedforward, dropout, attn_dropout, act_dropout, source_length, target_length = generate_basic_params( diff --git a/python/paddle/fluid/tests/unittests/test_translated_layer.py b/python/paddle/fluid/tests/unittests/test_translated_layer.py index e5dc279750d3d..d0b361d6f2c63 100644 --- a/python/paddle/fluid/tests/unittests/test_translated_layer.py +++ b/python/paddle/fluid/tests/unittests/test_translated_layer.py @@ -77,7 +77,7 @@ def setUp(self): paddle.disable_static(place) # config seed - paddle.manual_seed(SEED) + paddle.seed(SEED) paddle.framework.random._manual_program_seed(SEED) # create network diff --git a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py index 5ecf25c53b794..6de36c02bee05 100644 --- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py +++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py @@ -235,7 +235,7 @@ def test_check_output(self): def check_with_place(self, place): scope = core.Scope() out = scope.var("X").get_selected_rows() - paddle.manual_seed(10) + paddle.seed(10) op = Operator( "uniform_random", Out="X", @@ -256,7 +256,7 @@ class TestUniformRandomOpSelectedRowsWithDiagInit( def check_with_place(self, place): scope = core.Scope() out = scope.var("X").get_selected_rows() - paddle.manual_seed(10) + paddle.seed(10) op = Operator( "uniform_random", Out="X", @@ -277,7 +277,7 @@ def check_with_place(self, place): class TestUniformRandomOpApi(unittest.TestCase): def test_api(self): - paddle.manual_seed(10) + paddle.seed(10) x = fluid.layers.data('x', shape=[16], dtype='float32', lod_level=1) y = fluid.layers.fc(x, size=16, @@ -350,7 +350,7 @@ def test_attr_tensor_int32_API(self): class TestUniformRandomOp_API_seed(unittest.TestCase): def test_attr_tensor_API(self): _seed = 10 - gen = paddle.manual_seed(_seed) + gen = paddle.seed(_seed) gen._is_init_py = False startup_program = fluid.Program() train_program = fluid.Program() @@ -392,7 +392,7 @@ def check_with_place(self, place): out = scope.var("X").get_selected_rows() shape_tensor = scope.var("Shape").get_tensor() shape_tensor.set(np.array([1000, 784]).astype("int64"), place) - paddle.manual_seed(10) + paddle.seed(10) op = Operator( "uniform_random", ShapeTensor="Shape", @@ -426,7 +426,7 @@ def check_with_place(self, place): shape_1.set(np.array([1000]).astype("int64"), place) shape_2 = scope.var("shape2").get_tensor() shape_2.set(np.array([784]).astype("int64"), place) - paddle.manual_seed(10) + paddle.seed(10) op = Operator( "uniform_random", ShapeTensorList=["shape1", "shape2"], diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 6d4258a426d05..2df24b00797c1 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -416,7 +416,7 @@ def _assert_to_static(self, var_base, static_var, is_param=False): def test_tensor_str(self): paddle.enable_static() paddle.disable_static(paddle.CPUPlace()) - 
paddle.manual_seed(10) + paddle.seed(10) a = paddle.rand([10, 20]) paddle.set_printoptions(4, 100, 3) a_str = str(a) diff --git a/python/paddle/fluid/tests/unittests/test_var_conv_2d.py b/python/paddle/fluid/tests/unittests/test_var_conv_2d.py index db5debdb43222..4e23b20581122 100644 --- a/python/paddle/fluid/tests/unittests/test_var_conv_2d.py +++ b/python/paddle/fluid/tests/unittests/test_var_conv_2d.py @@ -19,7 +19,7 @@ from op_test import OpTest, skip_check_grad_ci -class TestVarConv2dOp(OpTest): +class TestVarConv2DOp(OpTest): def setUp(self): self.init_op_type() self.set_data() @@ -179,7 +179,7 @@ def test_check_grad(self): ['X'], 'Out', max_relative_error=0.005, check_dygraph=False) -class TestVarConv2dOpCase1(TestVarConv2dOp): +class TestVarConv2DOpCase1(TestVarConv2DOp): def set_data(self): # set in_ch 1 input_channel = 1 @@ -192,7 +192,7 @@ def set_data(self): col) -class TestVarConv2dOpCase2(TestVarConv2dOp): +class TestVarConv2DOpCase2(TestVarConv2DOp): def set_data(self): # set out_ch 1 input_channel = 2 @@ -205,7 +205,7 @@ def set_data(self): col) -class TestVarConv2dOpCase3(TestVarConv2dOp): +class TestVarConv2DOpCase3(TestVarConv2DOp): def set_data(self): # set batch 1 input_channel = 2 @@ -218,7 +218,7 @@ def set_data(self): col) -class TestVarConv2dOpCase4(TestVarConv2dOp): +class TestVarConv2DOpCase4(TestVarConv2DOp): def set_data(self): # set filter size very large input_channel = 3 @@ -231,7 +231,7 @@ def set_data(self): col) -class TestVarConv2dOpCase5(TestVarConv2dOp): +class TestVarConv2DOpCase5(TestVarConv2DOp): def set_data(self): # set input very small input_channel = 50 @@ -247,7 +247,7 @@ def set_data(self): @skip_check_grad_ci( reason="[skip shape check] Use shape of input_channel, row and col all is 1 to test special LoDTensor." 
) -class TestVarConv2dOpCase6(TestVarConv2dOp): +class TestVarConv2DOpCase6(TestVarConv2DOp): def set_data(self): input_channel = 1 output_channel = 3 @@ -259,7 +259,7 @@ def set_data(self): col) -class TestVarConv2dOpCase7(TestVarConv2dOp): +class TestVarConv2DOpCase7(TestVarConv2DOp): def set_data(self): input_channel = 2 output_channel = 3 @@ -271,7 +271,7 @@ def set_data(self): col) -class TestVarConv2dApi(unittest.TestCase): +class TestVarConv2DApi(unittest.TestCase): def test_api(self): import paddle.fluid as fluid diff --git a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py index f826448c59664..aaa4f636b0951 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_conv2d_op_xpu.py @@ -159,7 +159,7 @@ def init_paddings(self): globals()[cls_name] = TestPaddingVALIDCase -class TestConv2dOp(OpTest): +class TestConv2DOp(OpTest): def setUp(self): self.op_type = "conv2d" self.use_cudnn = False @@ -274,7 +274,7 @@ def init_kernel_type(self): pass -class TestWithPad(TestConv2dOp): +class TestWithPad(TestConv2DOp): def init_test_case(self): self.pad = [1, 1] self.stride = [1, 1] @@ -284,7 +284,7 @@ def init_test_case(self): self.filter_size = [6, f_c, 3, 3] -class TestWithStride(TestConv2dOp): +class TestWithStride(TestConv2DOp): def init_test_case(self): self.pad = [1, 1] self.stride = [2, 2] @@ -294,7 +294,7 @@ def init_test_case(self): self.filter_size = [6, f_c, 3, 3] -class TestWithGroup(TestConv2dOp): +class TestWithGroup(TestConv2DOp): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] @@ -305,7 +305,7 @@ def init_test_case(self): self.filter_size = [18, f_c, 3, 3] -class TestWith1x1(TestConv2dOp): +class TestWith1x1(TestConv2DOp): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] @@ -318,7 +318,7 @@ def init_group(self): self.groups = 3 -class TestWithDilation(TestConv2dOp): +class TestWithDilation(TestConv2DOp): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] @@ -334,7 +334,7 @@ def init_group(self): self.groups = 3 -class TestWithInput1x1Filter1x1(TestConv2dOp): +class TestWithInput1x1Filter1x1(TestConv2DOp): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 1] @@ -356,7 +356,7 @@ def init_group(self): # ---- test asymmetric padding ---- -class TestConv2dOp_v2(OpTest): +class TestConv2DOp_v2(OpTest): def setUp(self): self.op_type = "conv2d" self.use_cudnn = False @@ -482,13 +482,13 @@ def init_test_case_2(self): pass -class TestConv2dOp_AsyPadding(TestConv2dOp_v2): +class TestConv2DOp_AsyPadding(TestConv2DOp_v2): def init_paddings(self): self.pad = [0, 0, 1, 2] self.padding_algorithm = "EXPLICIT" -class TestWithPad_AsyPadding(TestConv2dOp_v2): +class TestWithPad_AsyPadding(TestConv2DOp_v2): def init_test_case(self): self.stride = [1, 1] self.input_size = [2, 3, 5, 5] # NCHW @@ -501,7 +501,7 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -class TestWithStride_AsyPadding(TestConv2dOp_v2): +class TestWithStride_AsyPadding(TestConv2DOp_v2): def init_test_case(self): self.stride = [2, 2] self.input_size = [2, 3, 6, 6] # NCHW @@ -514,7 +514,7 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -class TestWithGroup_AsyPadding(TestConv2dOp_v2): +class TestWithGroup_AsyPadding(TestConv2DOp_v2): def init_test_case(self): self.pad = [0, 0] self.stride = [1, 2] @@ -525,7 +525,7 @@ def init_test_case(self): self.filter_size = [24, f_c, 4, 3] -class 
TestWith1x1_AsyPadding(TestConv2dOp_v2): +class TestWith1x1_AsyPadding(TestConv2DOp_v2): def init_test_case(self): self.stride = [1, 1] self.input_size = [2, 3, 5, 5] # NCHW @@ -541,7 +541,7 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -class TestWithDilation_AsyPadding(TestConv2dOp_v2): +class TestWithDilation_AsyPadding(TestConv2DOp_v2): def init_test_case(self): self.stride = [1, 1] self.input_size = [2, 3, 10, 10] # NCHW @@ -560,7 +560,7 @@ def init_paddings(self): self.padding_algorithm = "EXPLICIT" -class TestWithInput1x1Filter1x1_AsyPadding(TestConv2dOp_v2): +class TestWithInput1x1Filter1x1_AsyPadding(TestConv2DOp_v2): def init_test_case(self): self.stride = [1, 1] self.input_size = [40, 3, 1, 1] # NCHW @@ -577,20 +577,20 @@ def init_paddings(self): #---------- test SAME VALID ----------- -create_test_padding_SAME_class(TestConv2dOp_AsyPadding) +create_test_padding_SAME_class(TestConv2DOp_AsyPadding) create_test_padding_SAME_class(TestWithPad_AsyPadding) create_test_padding_SAME_class(TestWithStride_AsyPadding) create_test_padding_SAME_class(TestWithGroup_AsyPadding) create_test_padding_SAME_class(TestWithInput1x1Filter1x1_AsyPadding) -create_test_padding_VALID_class(TestConv2dOp_AsyPadding) +create_test_padding_VALID_class(TestConv2DOp_AsyPadding) create_test_padding_VALID_class(TestWithPad_AsyPadding) create_test_padding_VALID_class(TestWithStride_AsyPadding) create_test_padding_VALID_class(TestWithGroup_AsyPadding) create_test_padding_VALID_class(TestWithInput1x1Filter1x1_AsyPadding) # ------------ test channel last --------- -create_test_channel_last_class(TestConv2dOp_AsyPadding) +create_test_channel_last_class(TestConv2DOp_AsyPadding) create_test_channel_last_class(TestWithPad_AsyPadding) create_test_channel_last_class(TestWithGroup_AsyPadding) create_test_channel_last_class(TestWith1x1_AsyPadding) diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index e52d9da99c3b9..3d06b4ab911ac 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -14,9 +14,8 @@ # TODO: import framework api under this directory __all__ = [ - 'create_parameter', 'ParamAttr', - 'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', 'get_default_dtype', - 'set_default_dtype' + 'create_parameter', 'ParamAttr', 'CPUPlace', 'CUDAPlace', 'CUDAPinnedPlace', + 'get_default_dtype', 'set_default_dtype' ] __all__ += [ @@ -25,7 +24,7 @@ ] from . import random -from .random import manual_seed +from .random import seed from .framework import get_default_dtype from .framework import set_default_dtype diff --git a/python/paddle/framework/random.py b/python/paddle/framework/random.py index ba2cf603d4a69..1624a069a51ec 100644 --- a/python/paddle/framework/random.py +++ b/python/paddle/framework/random.py @@ -16,10 +16,10 @@ import paddle.fluid as fluid from paddle.fluid import core -__all__ = ['manual_seed', 'get_cuda_rng_state', 'set_cuda_rng_state'] +__all__ = ['seed', 'get_cuda_rng_state', 'set_cuda_rng_state'] -def manual_seed(seed): +def seed(seed): """ Sets the seed for global default generator, which manages the random number generation. @@ -34,7 +34,7 @@ def manual_seed(seed): .. code-block:: python import paddle - gen = paddle.manual_seed(102) + gen = paddle.seed(102) """ #TODO(zhiqiu): 1. remove program.random_seed when all random-related op upgrade @@ -109,7 +109,7 @@ def _manual_program_seed(seed): """ Sets global seed for generating random numbers. - NOTE(zhiqiu): This is the original implemention of manual_seed. 
Keeps it temporally + NOTE(zhiqiu): This is the original implementation of seed. Keeps it temporarily since the CUDA generator is not developed, so we need it in the unittest. Args: diff --git a/python/paddle/hapi/model_summary.py b/python/paddle/hapi/model_summary.py index 30b22a2f32c34..c6288ea40c59e 100644 --- a/python/paddle/hapi/model_summary.py +++ b/python/paddle/hapi/model_summary.py @@ -51,14 +51,14 @@ def __init__(self, num_classes=10): super(LeNet, self).__init__() self.num_classes = num_classes self.features = nn.Sequential( - nn.Conv2d( + nn.Conv2D( 1, 6, 3, stride=1, padding=1), nn.ReLU(), - nn.MaxPool2d(2, 2), - nn.Conv2d( + nn.MaxPool2D(2, 2), + nn.Conv2D( 6, 16, 5, stride=1, padding=0), nn.ReLU(), - nn.MaxPool2d(2, 2)) + nn.MaxPool2D(2, 2)) if num_classes > 0: self.fc = nn.Sequential( diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index 1d626c38c21bd..e53ba753a9bda 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -83,29 +83,29 @@ from .layer.common import Upsample #DEFINE_ALIAS from .layer.common import Bilinear #DEFINE_ALIAS from .layer.common import Dropout #DEFINE_ALIAS -from .layer.common import Dropout2d #DEFINE_ALIAS -from .layer.common import Dropout3d #DEFINE_ALIAS +from .layer.common import Dropout2D #DEFINE_ALIAS +from .layer.common import Dropout3D #DEFINE_ALIAS from .layer.common import AlphaDropout #DEFINE_ALIAS -from .layer.pooling import AvgPool1d #DEFINE_ALIAS -from .layer.pooling import AvgPool2d #DEFINE_ALIAS -from .layer.pooling import AvgPool3d #DEFINE_ALIAS -from .layer.pooling import MaxPool1d #DEFINE_ALIAS -from .layer.pooling import MaxPool2d #DEFINE_ALIAS -from .layer.pooling import MaxPool3d #DEFINE_ALIAS -from .layer.pooling import AdaptiveAvgPool1d #DEFINE_ALIAS -from .layer.pooling import AdaptiveAvgPool2d #DEFINE_ALIAS -from .layer.pooling import AdaptiveAvgPool3d #DEFINE_ALIAS +from .layer.pooling import AvgPool1D #DEFINE_ALIAS +from .layer.pooling import AvgPool2D #DEFINE_ALIAS +from .layer.pooling import AvgPool3D #DEFINE_ALIAS +from .layer.pooling import MaxPool1D #DEFINE_ALIAS +from .layer.pooling import MaxPool2D #DEFINE_ALIAS +from .layer.pooling import MaxPool3D #DEFINE_ALIAS +from .layer.pooling import AdaptiveAvgPool1D #DEFINE_ALIAS +from .layer.pooling import AdaptiveAvgPool2D #DEFINE_ALIAS +from .layer.pooling import AdaptiveAvgPool3D #DEFINE_ALIAS -from .layer.pooling import AdaptiveMaxPool1d #DEFINE_ALIAS -from .layer.pooling import AdaptiveMaxPool2d #DEFINE_ALIAS -from .layer.pooling import AdaptiveMaxPool3d #DEFINE_ALIAS -from .layer.conv import Conv1d #DEFINE_ALIAS -from .layer.conv import Conv2d #DEFINE_ALIAS -from .layer.conv import Conv3d #DEFINE_ALIAS -from .layer.conv import ConvTranspose1d #DEFINE_ALIAS -from .layer.conv import ConvTranspose2d #DEFINE_ALIAS -from .layer.conv import ConvTranspose3d #DEFINE_ALIAS +from .layer.pooling import AdaptiveMaxPool1D #DEFINE_ALIAS +from .layer.pooling import AdaptiveMaxPool2D #DEFINE_ALIAS +from .layer.pooling import AdaptiveMaxPool3D #DEFINE_ALIAS +from .layer.conv import Conv1D #DEFINE_ALIAS +from .layer.conv import Conv2D #DEFINE_ALIAS +from .layer.conv import Conv3D #DEFINE_ALIAS +from .layer.conv import Conv1DTranspose #DEFINE_ALIAS +from .layer.conv import Conv2DTranspose #DEFINE_ALIAS +from .layer.conv import Conv3DTranspose #DEFINE_ALIAS # from .layer.conv import TreeConv #DEFINE_ALIAS # from .layer.conv import Conv1D #DEFINE_ALIAS from .layer.extension import RowConv #DEFINE_ALIAS @@ -127,12 +127,12 @@ from .layer.norm 
import GroupNorm #DEFINE_ALIAS from .layer.norm import LayerNorm #DEFINE_ALIAS from .layer.norm import SpectralNorm #DEFINE_ALIAS -from .layer.norm import InstanceNorm1d #DEFINE_ALIAS -from .layer.norm import InstanceNorm2d #DEFINE_ALIAS -from .layer.norm import InstanceNorm3d #DEFINE_ALIAS -from .layer.norm import BatchNorm1d #DEFINE_ALIAS -from .layer.norm import BatchNorm2d #DEFINE_ALIAS -from .layer.norm import BatchNorm3d #DEFINE_ALIAS +from .layer.norm import InstanceNorm1D #DEFINE_ALIAS +from .layer.norm import InstanceNorm2D #DEFINE_ALIAS +from .layer.norm import InstanceNorm3D #DEFINE_ALIAS +from .layer.norm import BatchNorm1D #DEFINE_ALIAS +from .layer.norm import BatchNorm2D #DEFINE_ALIAS +from .layer.norm import BatchNorm3D #DEFINE_ALIAS from .layer.norm import LocalResponseNorm #DEFINE_ALIAS from .layer.rnn import RNNCellBase #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index d2e4ee2ac9d11..03dd40fb140cf 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -405,7 +405,7 @@ def conv2d(x, points. If dilation is a tuple, it must contain two integers, (dilation_height, dilation_width). Otherwise, dilation_height = dilation_width = dilation. Default: dilation = 1. - groups (int): The groups number of the Conv2d Layer. According to grouped + groups (int): The groups number of the Conv2D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only @@ -896,7 +896,7 @@ def conv_transpose2d(x, Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side of each dimension in the output shape. Default: 0. - groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by + groups(int, optional): The groups number of the Conv2D transpose layer. Inspired by grouped convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the @@ -1122,7 +1122,7 @@ def conv3d(x, If dilation is a tuple, it must contain three integers, (dilation_depth, dilation_height, dilation_width). Otherwise, dilation_depth = dilation_height = dilation_width = dilation. Default: dilation = 1. - groups (int): The groups number of the Conv3d Layer. According to grouped + groups (int): The groups number of the Conv3D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only @@ -1340,7 +1340,7 @@ def conv_transpose3d(x, Default: padding = 0. output_padding(int|list|tuple, optional): Additional size added to one side of each dimension in the output shape. Default: 0. - groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by + groups(int, optional): The groups number of the Conv3D transpose layer. 
Inspired by grouped convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the diff --git a/python/paddle/nn/functional/norm.py b/python/paddle/nn/functional/norm.py index 9b78368259127..0a1547bebbb31 100644 --- a/python/paddle/nn/functional/norm.py +++ b/python/paddle/nn/functional/norm.py @@ -127,7 +127,7 @@ def batch_norm(x, """ Applies Batch Normalization as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . - nn.functional.batch_norm is uesd for nn.BatchNorm1d, nn.BatchNorm2d, nn.BatchNorm3d. Please use above API for BatchNorm. + nn.functional.batch_norm is used for nn.BatchNorm1D, nn.BatchNorm2D, nn.BatchNorm3D. Please use the above API for BatchNorm. Parameters: x(Tesnor): input value. It's data type should be float32, float64. @@ -338,7 +338,7 @@ def instance_norm(x, data_format="NCHW", name=None): """ - See more detail in nn.layer.InstanceNorm2d. + See more details in nn.layer.InstanceNorm2D. Parameters: x(Tensor): Input Tensor. It's data type should be float32, float64. diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 1defed3362c1c..801290e99572b 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -53,27 +53,27 @@ from .common import Flatten #DEFINE_ALIAS from .common import Upsample #DEFINE_ALIAS from .common import Dropout #DEFINE_ALIAS -from .common import Dropout2d #DEFINE_ALIAS -from .common import Dropout3d #DEFINE_ALIAS +from .common import Dropout2D #DEFINE_ALIAS +from .common import Dropout3D #DEFINE_ALIAS from .common import AlphaDropout #DEFINE_ALIAS -from .pooling import AvgPool1d #DEFINE_ALIAS -from .pooling import AvgPool2d #DEFINE_ALIAS -from .pooling import AvgPool3d #DEFINE_ALIAS -from .pooling import MaxPool1d #DEFINE_ALIAS -from .pooling import MaxPool2d #DEFINE_ALIAS -from .pooling import MaxPool3d #DEFINE_ALIAS -from .pooling import AdaptiveAvgPool1d #DEFINE_ALIAS -from .pooling import AdaptiveAvgPool2d #DEFINE_ALIAS -from .pooling import AdaptiveAvgPool3d #DEFINE_ALIAS -from .pooling import AdaptiveMaxPool1d #DEFINE_ALIAS -from .pooling import AdaptiveMaxPool2d #DEFINE_ALIAS -from .pooling import AdaptiveMaxPool3d #DEFINE_ALIAS -from .conv import Conv1d #DEFINE_ALIAS -from .conv import Conv2d #DEFINE_ALIAS -from .conv import Conv3d #DEFINE_ALIAS -from .conv import ConvTranspose1d #DEFINE_ALIAS -from .conv import ConvTranspose2d #DEFINE_ALIAS -from .conv import ConvTranspose3d #DEFINE_ALIAS +from .pooling import AvgPool1D #DEFINE_ALIAS +from .pooling import AvgPool2D #DEFINE_ALIAS +from .pooling import AvgPool3D #DEFINE_ALIAS +from .pooling import MaxPool1D #DEFINE_ALIAS +from .pooling import MaxPool2D #DEFINE_ALIAS +from .pooling import MaxPool3D #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool1D #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool2D #DEFINE_ALIAS +from .pooling import AdaptiveAvgPool3D #DEFINE_ALIAS +from .pooling import AdaptiveMaxPool1D #DEFINE_ALIAS +from .pooling import AdaptiveMaxPool2D #DEFINE_ALIAS +from .pooling import AdaptiveMaxPool3D #DEFINE_ALIAS +from .conv import Conv1D #DEFINE_ALIAS +from .conv import Conv2D #DEFINE_ALIAS +from .conv import Conv3D #DEFINE_ALIAS +from .conv import Conv1DTranspose #DEFINE_ALIAS +from .conv import Conv2DTranspose #DEFINE_ALIAS +from .conv import Conv3DTranspose #DEFINE_ALIAS # from .conv import TreeConv #DEFINE_ALIAS # from 
.conv import Conv1D #DEFINE_ALIAS from .extension import RowConv #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index 71bddefdb13e7..ad8263e48356c 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -32,8 +32,8 @@ 'Pad3D', 'CosineSimilarity', 'Dropout', - 'Dropout2d', - 'Dropout3d', + 'Dropout2D', + 'Dropout3D', 'Bilinear', 'AlphaDropout', ] @@ -538,12 +538,12 @@ def forward(self, input): return out -class Dropout2d(layers.Layer): +class Dropout2D(layers.Layer): """ Randomly zero out entire channels (in the batched input 4d tensor with the shape `NCHW` , a channel is a 2D feature map with the shape `HW`). Each channel will be zeroed out independently on every forward call with probability `p` using samples from a Bernoulli distribution. - Dropout2d will help promote independence between feature maps as described in the paper: + Dropout2D will help promote independence between feature maps as described in the paper: `Efficient Object Localization Using Convolutional Networks `_ See ``paddle.nn.functional.dropout2d`` for more details. @@ -570,7 +570,7 @@ class Dropout2d(layers.Layer): paddle.disable_static() x = np.random.random(size=(2, 3, 4, 5)).astype('float32') x = paddle.to_tensor(x) - m = paddle.nn.Dropout2d(p=0.5) + m = paddle.nn.Dropout2D(p=0.5) y_train = m(x) m.eval() # switch the model to test phase y_test = m(x) @@ -580,7 +580,7 @@ class Dropout2d(layers.Layer): """ def __init__(self, p=0.5, data_format='NCHW', name=None): - super(Dropout2d, self).__init__() + super(Dropout2D, self).__init__() self.p = p self.data_format = data_format @@ -596,12 +596,12 @@ def forward(self, input): return out -class Dropout3d(layers.Layer): +class Dropout3D(layers.Layer): """ Randomly zero out entire channels (in the batched input 5d tensor with the shape `NCDHW` , a channel is a 3D feature map with the shape `DHW` ). Each channel will be zeroed out independently on every forward call with probability `p` using samples from a Bernoulli distribution. - Dropout3d will help promote independence between feature maps as described in the paper: + Dropout3D will help promote independence between feature maps as described in the paper: `Efficient Object Localization Using Convolutional Networks `_ See ``paddle.nn.functional.dropout3d`` for more details. 
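The train/eval switch shown in the ``Dropout2D``/``Dropout3D`` docstring examples above can be checked end to end with a short sketch (illustrative only, not part of this patch; it assumes the renamed 2.0-style API introduced here, and which channels get zeroed is random):

    import numpy as np
    import paddle

    paddle.disable_static()
    x = paddle.to_tensor(np.ones((1, 4, 2, 2), dtype='float32'))
    m = paddle.nn.Dropout2D(p=0.5)
    y_train = m(x)   # train phase: each of the 4 channels is either all zeros
                     # or all scaled by 1/(1-p) = 2.0, never partially dropped
    m.eval()         # test phase: dropout becomes the identity
    y_test = m(x)
    print(y_train.numpy()[0, :, 0, 0])             # per-channel mask, e.g. [2. 0. 0. 2.]
    print(np.allclose(y_test.numpy(), x.numpy()))  # True
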
@@ -628,7 +628,7 @@ class Dropout3d(layers.Layer): paddle.disable_static() x = np.random.random(size=(2, 3, 4, 5, 6)).astype('float32') x = paddle.to_tensor(x) - m = paddle.nn.Dropout3d(p=0.5) + m = paddle.nn.Dropout3D(p=0.5) y_train = m(x) m.eval() # switch the model to test phase y_test = m(x) @@ -638,7 +638,7 @@ class Dropout3d(layers.Layer): """ def __init__(self, p=0.5, data_format='NCDHW', name=None): - super(Dropout3d, self).__init__() + super(Dropout3D, self).__init__() self.p = p self.data_format = data_format diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index baa89798b7fc3..51c466d113f02 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -15,12 +15,12 @@ # TODO: define classes of convolutional neural network __all__ = [ - 'Conv1d', - 'Conv2d', - 'Conv3d', - 'ConvTranspose1d', - 'ConvTranspose2d', - 'ConvTranspose3d', + 'Conv1D', + 'Conv2D', + 'Conv3D', + 'Conv1DTranspose', + 'Conv2DTranspose', + 'Conv3DTranspose', ] import numpy as np @@ -113,9 +113,9 @@ def __init__(self, attr=self._bias_attr, shape=[self._out_channels], is_bias=True) -class Conv1d(_ConvNd): +class Conv1D(_ConvNd): """ - This interface is used to construct a callable object of the ``Conv1d`` class. + This interface is used to construct a callable object of the ``Conv1D`` class. For more details, refer to code examples. The convolution1D layer calculates the output based on the input, filter and stride, padding, dilation, groups parameters. Input and @@ -194,7 +194,7 @@ class Conv1d(_ConvNd): Examples: .. code-block:: python import paddle - from paddle.nn import Conv1d + from paddle.nn import Conv1D import numpy as np x = np.array([[[4, 8, 1, 9], [7, 2, 0, 9], @@ -208,7 +208,7 @@ class Conv1d(_ConvNd): [5, 6, 8]]]).astype(np.float32) paddle.disable_static() x_t = paddle.to_tensor(x) - conv = Conv1d(3, 2, 3) + conv = Conv1D(3, 2, 3) conv.weight.set_value(w) y_t = conv(x_t) y_np = y_t.numpy() @@ -229,7 +229,7 @@ def __init__(self, weight_attr=None, bias_attr=None, data_format="NCL"): - super(Conv1d, self).__init__( + super(Conv1D, self).__init__( in_channels, out_channels, kernel_size, @@ -266,9 +266,9 @@ def forward(self, x): return out -class ConvTranspose1d(_ConvNd): +class Conv1DTranspose(_ConvNd): """ - This interface is used to construct a callable object of the ``ConvTranspose1d`` class. + This interface is used to construct a callable object of the ``Conv1DTranspose`` class. For more details, refer to code examples. The 1-D convolution transpose layer calculates the output based on the input, filter, and dilation, stride, padding. Input(Input) and output(Output) @@ -340,7 +340,7 @@ class ConvTranspose1d(_ConvNd): `[pad]` or `[pad_left, pad_right]`. Default: padding = 0. output_padding(int|list|tuple, optional): The count of zeros to be added to tail of each dimension. If it is a tuple, it must contain one integer. Default: 0. - groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by + groups(int, optional): The groups number of the Conv2D transpose layer. Inspired by grouped convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the @@ -379,7 +379,7 @@ class ConvTranspose1d(_ConvNd): .. 
code-block:: python import paddle - from paddle.nn import ConvTranspose1d + from paddle.nn import Conv1DTranspose import numpy as np paddle.disable_static() @@ -390,7 +390,7 @@ class ConvTranspose1d(_ConvNd): y=np.array([[[7, 0]], [[4, 2]]]).astype(np.float32) x_t = paddle.to_tensor(x) - conv = ConvTranspose1d(2, 1, 2) + conv = Conv1DTranspose(2, 1, 2) conv.weight.set_value(y) y_t = conv(x_t) y_np = y_t.numpy() @@ -411,7 +411,7 @@ def __init__(self, weight_attr=None, bias_attr=None, data_format="NCL"): - super(ConvTranspose1d, self).__init__( + super(Conv1DTranspose, self).__init__( in_channels, out_channels, kernel_size, @@ -441,9 +441,9 @@ def forward(self, x, output_size=None): return out -class Conv2d(_ConvNd): +class Conv2D(_ConvNd): """ - This interface is used to construct a callable object of the ``Conv2d`` class. + This interface is used to construct a callable object of the ``Conv2D`` class. For more details, refer to code examples. The convolution2D layer calculates the output based on the input, filter and strides, paddings, dilations, groups parameters. Input and @@ -491,7 +491,7 @@ class Conv2d(_ConvNd): dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups(int, optional): The groups number of the Conv3d Layer. According to grouped + groups(int, optional): The groups number of the Conv3D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only @@ -536,10 +536,12 @@ class Conv2d(_ConvNd): import paddle import paddle.nn as nn - + + paddle.disable_static() + x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) - conv = nn.Conv2d(4, 6, (3, 3)) + conv = nn.Conv2D(4, 6, (3, 3)) y_var = conv(x_var) y_np = y_var.numpy() print(y_np.shape) @@ -558,7 +560,7 @@ def __init__(self, weight_attr=None, bias_attr=None, data_format="NCHW"): - super(Conv2d, self).__init__( + super(Conv2D, self).__init__( in_channels, out_channels, kernel_size, @@ -600,9 +602,9 @@ def forward(self, x): return out -class ConvTranspose2d(_ConvNd): +class Conv2DTranspose(_ConvNd): """ - This interface is used to construct a callable object of the ``ConvTranspose2d`` class. + This interface is used to construct a callable object of the ``Conv2DTranspose`` class. For more details, refer to code examples. The convolution2D transpose layer calculates the output based on the input, filter, and dilations, strides, paddings. Input and output @@ -653,7 +655,7 @@ class ConvTranspose2d(_ConvNd): dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain two integers, (dilation_H, dilation_W). Otherwise, the dilation_H = dilation_W = dilation. Default: 1. - groups(int, optional): The groups number of the Conv2d transpose layer. Inspired by + groups(int, optional): The groups number of the Conv2D transpose layer. Inspired by grouped convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the @@ -701,10 +703,12 @@ class ConvTranspose2d(_ConvNd): import paddle import paddle.nn as nn + + paddle.disable_static() x_var = paddle.uniform((2, 4, 8, 8), dtype='float32', min=-1., max=1.) 
- conv = nn.ConvTranspose2d(4, 6, (3, 3)) + conv = nn.Conv2DTranspose(4, 6, (3, 3)) y_var = conv(x_var) y_np = y_var.numpy() print(y_np.shape) @@ -723,7 +727,7 @@ def __init__(self, weight_attr=None, bias_attr=None, data_format="NCHW"): - super(ConvTranspose2d, self).__init__( + super(Conv2DTranspose, self).__init__( in_channels, out_channels, kernel_size, @@ -758,7 +762,7 @@ def forward(self, x, output_size=None): return out -class Conv3d(_ConvNd): +class Conv3D(_ConvNd): """ **Convlution3d Layer** The convolution3d layer calculates the output based on the input, filter @@ -802,7 +806,7 @@ class Conv3d(_ConvNd): dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups(int, optional): The groups number of the Conv3d Layer. According to grouped + groups(int, optional): The groups number of the Conv3D Layer. According to grouped convolution in Alex Krizhevsky's Deep CNN paper: when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the filters is only @@ -853,10 +857,12 @@ class Conv3d(_ConvNd): import paddle import paddle.nn as nn + + paddle.disable_static() x_var = paddle.uniform((2, 4, 8, 8, 8), dtype='float32', min=-1., max=1.) - conv = nn.Conv3d(4, 6, (3, 3, 3)) + conv = nn.Conv3D(4, 6, (3, 3, 3)) y_var = conv(x_var) y_np = y_var.numpy() print(y_np.shape) @@ -875,7 +881,7 @@ def __init__(self, weight_attr=None, bias_attr=None, data_format="NCDHW"): - super(Conv3d, self).__init__( + super(Conv3D, self).__init__( in_channels, out_channels, kernel_size, @@ -917,7 +923,7 @@ def forward(self, x): return out -class ConvTranspose3d(_ConvNd): +class Conv3DTranspose(_ConvNd): """ **Convlution3D transpose layer** The convolution3D transpose layer calculates the output based on the input, @@ -981,7 +987,7 @@ class ConvTranspose3d(_ConvNd): dilation(int|list|tuple, optional): The dilation size. If dilation is a tuple, it must contain three integers, (dilation_D, dilation_H, dilation_W). Otherwise, the dilation_D = dilation_H = dilation_W = dilation. The default value is 1. - groups(int, optional): The groups number of the Conv3d transpose layer. Inspired by + groups(int, optional): The groups number of the Conv3D transpose layer. Inspired by grouped convolution in Alex Krizhevsky's Deep CNN paper, in which when group=2, the first half of the filters is only connected to the first half of the input channels, while the second half of the @@ -1035,10 +1041,12 @@ class ConvTranspose3d(_ConvNd): import paddle import paddle.nn as nn + + paddle.disable_static() x_var = paddle.uniform((2, 4, 8, 8, 8), dtype='float32', min=-1., max=1.) - conv = nn.ConvTranspose3d(4, 6, (3, 3, 3)) + conv = nn.Conv3DTranspose(4, 6, (3, 3, 3)) y_var = conv(x_var) y_np = y_var.numpy() print(y_np.shape) @@ -1057,7 +1065,7 @@ def __init__(self, weight_attr=None, bias_attr=None, data_format="NCDHW"): - super(ConvTranspose3d, self).__init__( + super(Conv3DTranspose, self).__init__( in_channels, out_channels, kernel_size, diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index ad8dc9b64e78a..a996844c8f5a8 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -54,17 +54,17 @@ from .. 
import functional as F __all__ = [ - 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'BatchNorm1d', - 'BatchNorm2d', 'BatchNorm3d', 'InstanceNorm1d', 'InstanceNorm2d', - 'InstanceNorm3d', 'SyncBatchNorm', 'LocalResponseNorm' + 'BatchNorm', 'GroupNorm', 'LayerNorm', 'SpectralNorm', 'BatchNorm1D', + 'BatchNorm2D', 'BatchNorm3D', 'InstanceNorm1D', 'InstanceNorm2D', + 'InstanceNorm3D', 'SyncBatchNorm', 'LocalResponseNorm' ] class _InstanceNormBase(layers.Layer): """ - This class is based class for InstanceNorm1d, 2d, 3d. + This class is the base class for InstanceNorm1D, InstanceNorm2D and InstanceNorm3D. - See InstaceNorm1d, InstanceNorm2d or InstanceNorm3d for more details. + See InstanceNorm1D, InstanceNorm2D or InstanceNorm3D for more details. """ def __init__(self, @@ -109,7 +109,7 @@ def forward(self, input): input, weight=self.scale, bias=self.bias, eps=self._epsilon) -class InstanceNorm1d(_InstanceNormBase): +class InstanceNorm1D(_InstanceNormBase): """ Applies Instance Normalization over a 3D input (a mini-batch of 1D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . @@ -174,7 +174,7 @@ class InstanceNorm1d(_InstanceNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 2, 3)).astype('float32') x = paddle.to_tensor(x_data) - instance_norm = paddle.nn.InstanceNorm1d(2) + instance_norm = paddle.nn.InstanceNorm1D(2) instance_norm_out = instance_norm(x) print(instance_norm_out.numpy()) @@ -187,7 +187,7 @@ def _check_input_dim(self, input): len(input.shape))) -class InstanceNorm2d(_InstanceNormBase): +class InstanceNorm2D(_InstanceNormBase): """ Applies Instance Normalization over a 4D input (a mini-batch of 2D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . @@ -251,7 +251,7 @@ class InstanceNorm2d(_InstanceNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 2, 2, 3)).astype('float32') x = paddle.to_tensor(x_data) - instance_norm = paddle.nn.InstanceNorm2d(2) + instance_norm = paddle.nn.InstanceNorm2D(2) instance_norm_out = instance_norm(x) print(instance_norm_out.numpy()) @@ -263,7 +263,7 @@ def _check_input_dim(self, input): len(input.shape))) -class InstanceNorm3d(_InstanceNormBase): +class InstanceNorm3D(_InstanceNormBase): """ Applies Instance Normalization over a 5D input (a mini-batch of 3D inputs with additional channel dimension) as described in the paper Instance Normalization: The Missing Ingredient for Fast Stylization . @@ -327,7 +327,7 @@ class InstanceNorm3d(_InstanceNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 2, 2, 2, 3)).astype('float32') x = paddle.to_tensor(x_data) - instance_norm = paddle.nn.InstanceNorm3d(2) + instance_norm = paddle.nn.InstanceNorm3D(2) instance_norm_out = instance_norm(x) print(instance_norm_out.numpy()) @@ -671,7 +671,7 @@ def forward(self, input): data_format=self._data_format) -class BatchNorm1d(_BatchNormBase): +class BatchNorm1D(_BatchNormBase): """ Applies Batch Normalization over a 2D or 3D input (a mini-batch of 1D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift .
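As a quick sanity check on the renamed layer, the following sketch compares BatchNorm1D against the plain normalization formula; it assumes the default epsilon of 1e-5 and the default unit scale / zero shift:

import numpy as np
import paddle

paddle.disable_static()
np.random.seed(123)
x_np = np.random.random(size=(2, 1, 3)).astype('float32')
bn = paddle.nn.BatchNorm1D(1)  # formerly paddle.nn.BatchNorm1d
y = bn(paddle.to_tensor(x_np)).numpy()

# in training mode the layer normalizes with batch statistics, per channel
mean = x_np.mean(axis=(0, 2), keepdims=True)
var = x_np.var(axis=(0, 2), keepdims=True)
ref = (x_np - mean) / np.sqrt(var + 1e-5)
print(np.allclose(y, ref, atol=1e-5))  # expected True under these defaults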
@@ -747,7 +747,7 @@ class BatchNorm1d(_BatchNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 1, 3)).astype('float32') x = paddle.to_tensor(x_data) - batch_norm = paddle.nn.BatchNorm1d(1) + batch_norm = paddle.nn.BatchNorm1D(1) batch_norm_out = batch_norm(x) print(batch_norm_out.numpy()) @@ -768,7 +768,7 @@ def _check_input_dim(self, input): len(input.shape))) -class BatchNorm2d(_BatchNormBase): +class BatchNorm2D(_BatchNormBase): """ Applies Batch Normalization over a 4D input (a mini-batch of 2D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . @@ -843,7 +843,7 @@ class BatchNorm2d(_BatchNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 1, 2, 3)).astype('float32') x = paddle.to_tensor(x_data) - batch_norm = paddle.nn.BatchNorm2d(1) + batch_norm = paddle.nn.BatchNorm2D(1) batch_norm_out = batch_norm(x) print(batch_norm_out.numpy()) @@ -863,7 +863,7 @@ def _check_input_dim(self, input): len(input.shape))) -class BatchNorm3d(_BatchNormBase): +class BatchNorm3D(_BatchNormBase): """ Applies Batch Normalization over a 5D input (a mini-batch of 3D inputswith additional channel dimension) as described in the paper Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift . @@ -938,7 +938,7 @@ class BatchNorm3d(_BatchNormBase): np.random.seed(123) x_data = np.random.random(size=(2, 1, 2, 2, 3)).astype('float32') x = paddle.to_tensor(x_data) - batch_norm = paddle.nn.BatchNorm3d(1) + batch_norm = paddle.nn.BatchNorm3D(1) batch_norm_out = batch_norm(x) print(batch_norm_out.numpy()) @@ -1141,7 +1141,7 @@ def convert_sync_batchnorm(cls, layer): import paddle.nn as nn paddle.disable_static() - model = nn.Sequential(nn.Conv2d(3, 5, 3), nn.BatchNorm2d(5)) + model = nn.Sequential(nn.Conv2D(3, 5, 3), nn.BatchNorm2D(5)) sync_model = nn.SyncBatchNorm.convert_sync_batchnorm(model) """ diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 129dae93b3832..9e544cb02e70e 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -17,22 +17,22 @@ from .. import functional as F __all__ = [ - 'AvgPool1d', - 'AvgPool2d', - 'AvgPool3d', - 'MaxPool1d', - 'MaxPool2d', - 'MaxPool3d', - 'AdaptiveAvgPool1d', - 'AdaptiveAvgPool2d', - 'AdaptiveAvgPool3d', - 'AdaptiveMaxPool1d', - 'AdaptiveMaxPool2d', - 'AdaptiveMaxPool3d', + 'AvgPool1D', + 'AvgPool2D', + 'AvgPool3D', + 'MaxPool1D', + 'MaxPool2D', + 'MaxPool3D', + 'AdaptiveAvgPool1D', + 'AdaptiveAvgPool2D', + 'AdaptiveAvgPool3D', + 'AdaptiveMaxPool1D', + 'AdaptiveMaxPool2D', + 'AdaptiveMaxPool3D', ] -class AvgPool1d(layers.Layer): +class AvgPool1D(layers.Layer): """ This operation applies a 1D average pooling over an input signal composed of several input planes, based on the input, output_size, return_indices parameters. 
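A minimal sketch of the renamed 1-D pooling layer, small enough to verify by hand (non-overlapping windows, default zero padding):

import numpy as np
import paddle

paddle.disable_static()
data = paddle.to_tensor(np.arange(8, dtype='float32').reshape(1, 1, 8))
pool = paddle.nn.AvgPool1D(kernel_size=2, stride=2)  # formerly nn.AvgPool1d
print(pool(data).numpy())  # [[[0.5 2.5 4.5 6.5]]] -- the mean of each pair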
@@ -93,8 +93,8 @@ class AvgPool1d(layers.Layer): paddle.disable_static() data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AvgPool1d = nn.AvgPool1d(kernel_size=2, stride=2, padding=0) - pool_out = AvgPool1d(data) + AvgPool1D = nn.AvgPool1D(kernel_size=2, stride=2, padding=0) + pool_out = AvgPool1D(data) # pool_out shape: [1, 3, 16] """ @@ -106,7 +106,7 @@ def __init__(self, count_include_pad=True, ceil_mode=False, name=None): - super(AvgPool1d, self).__init__() + super(AvgPool1D, self).__init__() self.kernel_size = kernel_size self.stride = stride self.padding = padding @@ -120,7 +120,7 @@ def forward(self, x): return out -class AvgPool2d(layers.Layer): +class AvgPool2D(layers.Layer): """ This operation applies 2D average pooling over input features based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are @@ -185,7 +185,7 @@ class AvgPool2d(layers.Layer): # max pool2d input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - AvgPool2d = nn.AvgPool2d(kernel_size=2, + AvgPool2D = nn.AvgPool2D(kernel_size=2, stride=2, padding=0) - output = AvgPoo2d(input) + output = AvgPool2D(input) # output.shape [1, 3, 16, 16] @@ -201,7 +201,7 @@ def __init__(self, divisor_override=None, data_format="NCHW", name=None): - super(AvgPool2d, self).__init__() + super(AvgPool2D, self).__init__() self.ksize = kernel_size self.stride = stride self.padding = padding @@ -224,7 +224,7 @@ def forward(self, x): name=self.name) -class AvgPool3d(layers.Layer): +class AvgPool3D(layers.Layer): """ This operation applies 3D max pooling over input features based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are @@ -277,9 +277,9 @@ class AvgPool3d(layers.Layer): # avg pool3d input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) - AvgPool3d = nn.AvgPool3d(kernel_size=2, + AvgPool3D = nn.AvgPool3D(kernel_size=2, stride=2, padding=0) - output = AvgPool3d(input) + output = AvgPool3D(input) # output.shape [1, 2, 3, 16, 16] """ @@ -293,7 +293,7 @@ def __init__(self, divisor_override=None, data_format="NCDHW", name=None): - super(AvgPool3d, self).__init__() + super(AvgPool3D, self).__init__() self.ksize = kernel_size self.stride = stride self.padding = padding @@ -316,7 +316,7 @@ def forward(self, x): name=self.name) -class MaxPool1d(layers.Layer): +class MaxPool1D(layers.Layer): """ Applies a 1D max pooling over an input signal composed of several input planes based on the input, output_size, return_indices parameters.
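For the max-pooling variants the rename is mechanical as well; a small sketch with return_indices=True, where the reported indices should be the per-channel argmax positions along the length axis (exact layout assumed from the docstrings):

import numpy as np
import paddle

paddle.disable_static()
data = paddle.to_tensor(np.array([[[1., 3., 2., 5., 4., 0.]]], dtype='float32'))
pool = paddle.nn.MaxPool1D(kernel_size=2, stride=2, return_indices=True)  # formerly nn.MaxPool1d
out, indices = pool(data)
print(out.numpy())      # [[[3. 5. 4.]]]
print(indices.numpy())  # e.g. [[[1 3 4]]] -- where each window's maximum sits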
@@ -373,12 +373,12 @@ class MaxPool1d(layers.Layer): paddle.disable_static() data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0) - pool_out = MaxPool1d(data) + MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0) + pool_out = MaxPool1D(data) # pool_out shape: [1, 3, 16] - MaxPool1d = nn.MaxPool1d(kernel_size=2, stride=2, padding=0, return_indices=True) - pool_out, indices = MaxPool1d(data) + MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0, return_indices=True) + pool_out, indices = MaxPool1D(data) # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ @@ -390,7 +390,7 @@ def __init__(self, return_indices=False, ceil_mode=False, name=None): - super(MaxPool1d, self).__init__() + super(MaxPool1D, self).__init__() self.kernel_size = kernel_size self.stride = stride self.padding = padding @@ -404,7 +404,7 @@ def forward(self, input): return out -class MaxPool2d(layers.Layer): +class MaxPool2D(layers.Layer): """ This operation applies 2D max pooling over input feature based on the input, and kernel_size, stride, padding parameters. Input(X) and Output(Out) are @@ -468,14 +468,14 @@ class MaxPool2d(layers.Layer): # max pool2d input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32]).astype(np.float32)) - MaxPool2d = nn.MaxPool2d(kernel_size=2, + MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0) - output = MaxPool2d(input) + output = MaxPool2D(input) # output.shape [1, 3, 16, 16] # for return_indices=True - MaxPool2d = nn.MaxPool2d(kernel_size=2,stride=2, padding=0, return_indices=True) - output, max_indices = MaxPool2d(input) + MaxPool2D = nn.MaxPool2D(kernel_size=2,stride=2, padding=0, return_indices=True) + output, max_indices = MaxPool2D(input) # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], """ @@ -487,7 +487,7 @@ def __init__(self, ceil_mode=False, data_format="NCHW", name=None): - super(MaxPool2d, self).__init__() + super(MaxPool2D, self).__init__() self.ksize = kernel_size self.stride = stride self.padding = padding @@ -507,7 +507,7 @@ def forward(self, x): name=self.name) -class MaxPool3d(layers.Layer): +class MaxPool3D(layers.Layer): """ This operation applies 3D max pooling over input features based on the input, and kernel_size, stride, padding parameters. 
Input(X) and Output(Out) are @@ -559,14 +559,14 @@ class MaxPool3d(layers.Layer): # max pool3d input = paddle.to_tensor(np.random.uniform(-1, 1, [1, 2, 3, 32, 32]).astype(np.float32)) - MaxPool3d = nn.MaxPool3d(kernel_size=2, + MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0) - output = MaxPool3d(input) + output = MaxPool3D(input) # output.shape [1, 2, 3, 16, 16] # for return_indices=True - MaxPool3d = nn.MaxPool3d(kernel_size=2,stride=2, padding=0, return_indices=True) - output, max_indices = MaxPool3d(input) + MaxPool3D = nn.MaxPool3D(kernel_size=2,stride=2, padding=0, return_indices=True) + output, max_indices = MaxPool3D(input) # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16], """ @@ -578,7 +578,7 @@ def __init__(self, ceil_mode=False, data_format="NCDHW", name=None): - super(MaxPool3d, self).__init__() + super(MaxPool3D, self).__init__() self.ksize = kernel_size self.stride = stride self.padding = padding @@ -598,7 +598,7 @@ def forward(self, x): name=self.name) -class AdaptiveAvgPool1d(layers.Layer): +class AdaptiveAvgPool1D(layers.Layer): """ This operation applies a 1D adaptive average pooling over an input signal composed @@ -653,13 +653,13 @@ class AdaptiveAvgPool1d(layers.Layer): paddle.disable_static() data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AdaptiveAvgPool1d = nn.AdaptiveAvgPool1d(output_size=16) - pool_out = AdaptiveAvgPool1d(data) + AdaptiveAvgPool1D = nn.AdaptiveAvgPool1D(output_size=16) + pool_out = AdaptiveAvgPool1D(data) # pool_out shape: [1, 3, 16] """ def __init__(self, output_size, name=None): - super(AdaptiveAvgPool1d, self).__init__() + super(AdaptiveAvgPool1D, self).__init__() self.output_size = output_size self.name = name @@ -667,7 +667,7 @@ def forward(self, input): return F.adaptive_avg_pool1d(input, self.output_size, self.name) -class AdaptiveAvgPool2d(layers.Layer): +class AdaptiveAvgPool2D(layers.Layer): """ This operation applies 2D adaptive avg pooling on input tensor. The h and w dimensions @@ -704,7 +704,7 @@ class AdaptiveAvgPool2d(layers.Layer): output (Tensor): The output tensor of adaptive avg pool2d operator, which is a 4-D tensor. The data type is same as input x. Returns: - A callable object of AdaptiveAvgPool2d. + A callable object of AdaptiveAvgPool2D. Examples: .. code-block:: python @@ -730,13 +730,13 @@ class AdaptiveAvgPool2d(layers.Layer): input_data = np.random.rand(2, 3, 32, 32) x = paddle.to_tensor(input_data) # x.shape is [2, 3, 32, 32] - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2d(output_size=3) + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool2D(output_size=3) pool_out = adaptive_avg_pool(x = x) # pool_out.shape is [2, 3, 3, 3] """ def __init__(self, output_size, data_format="NCHW", name=None): - super(AdaptiveAvgPool2d, self).__init__() + super(AdaptiveAvgPool2D, self).__init__() self._output_size = output_size self._data_format = data_format self._name = name @@ -749,7 +749,7 @@ def forward(self, x): name=self._name) -class AdaptiveAvgPool3d(layers.Layer): +class AdaptiveAvgPool3D(layers.Layer): """ This operation applies 3D adaptive avg pooling on input tensor. The h and w dimensions @@ -789,7 +789,7 @@ class AdaptiveAvgPool3d(layers.Layer): output (Tensor): The output tensor of adaptive avg pool3d operator, which is a 5-D tensor. The data type is same as input x. Returns: - A callable object of AdaptiveAvgPool3d. + A callable object of AdaptiveAvgPool3D. Examples: .. 
code-block:: python @@ -818,13 +818,13 @@ class AdaptiveAvgPool3d(layers.Layer): input_data = np.random.rand(2, 3, 8, 32, 32) x = paddle.to_tensor(input_data) # x.shape is [2, 3, 8, 32, 32] - adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3d(output_size=3) + adaptive_avg_pool = paddle.nn.AdaptiveAvgPool3D(output_size=3) pool_out = adaptive_avg_pool(x = x) # pool_out = [2, 3, 3, 3, 3] """ def __init__(self, output_size, data_format="NCDHW", name=None): - super(AdaptiveAvgPool3d, self).__init__() + super(AdaptiveAvgPool3D, self).__init__() self._output_size = output_size self._data_format = data_format self._name = name @@ -837,7 +837,7 @@ def forward(self, x): name=self._name) -class AdaptiveMaxPool1d(layers.Layer): +class AdaptiveMaxPool1D(layers.Layer): """ This operation applies a 1D adaptive max pooling over an input signal composed @@ -894,19 +894,19 @@ class AdaptiveMaxPool1d(layers.Layer): paddle.disable_static() data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) - AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16) - pool_out = AdaptiveMaxPool1d(data) + AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16) + pool_out = AdaptiveMaxPool1D(data) # pool_out shape: [1, 3, 16] # for return_indices = true - AdaptiveMaxPool1d = nn.AdaptiveMaxPool1d(output_size=16, return_indices=True) - pool_out, indices = AdaptiveMaxPool1d(data) + AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16, return_indices=True) + pool_out, indices = AdaptiveMaxPool1D(data) # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ def __init__(self, output_size, return_indices=False, name=None): - super(AdaptiveMaxPool1d, self).__init__() + super(AdaptiveMaxPool1D, self).__init__() self.output_size = output_size self.return_indices = return_indices self.name = name @@ -916,7 +916,7 @@ def forward(self, input): self.return_indices, self.name) -class AdaptiveMaxPool2d(layers.Layer): +class AdaptiveMaxPool2D(layers.Layer): """ This operation applies 2D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. @@ -941,7 +941,7 @@ class AdaptiveMaxPool2d(layers.Layer): output (Tensor): The output tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type is same as input x. Returns: - A callable object of AdaptiveMaxPool2d. + A callable object of AdaptiveMaxPool2D. Examples: .. code-block:: python @@ -965,12 +965,12 @@ class AdaptiveMaxPool2d(layers.Layer): paddle.disable_static() input_data = np.random.rand(2, 3, 32, 32) x = paddle.to_tensor(input_data) - adaptive_max_pool = paddle.nn.AdaptiveMaxPool2d(output_size=3, return_indices=True) + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=3, return_indices=True) pool_out, indices = adaptive_max_pool(x = x) """ def __init__(self, output_size, return_indices=False, name=None): - super(AdaptiveMaxPool2d, self).__init__() + super(AdaptiveMaxPool2D, self).__init__() self._output_size = output_size self._return_indices = return_indices self._name = name @@ -983,7 +983,7 @@ def forward(self, x): name=self._name) -class AdaptiveMaxPool3d(layers.Layer): +class AdaptiveMaxPool3D(layers.Layer): """ This operation applies 3D adaptive max pooling on input tensor. The h and w dimensions of the output tensor are determined by the parameter output_size. The difference between adaptive pooling and pooling is adaptive one focus on the output size. 
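Unlike the fixed-kernel layers above, the adaptive variants take only an output_size and derive the window extents from the input shape; a minimal sketch with the renamed class:

import numpy as np
import paddle

paddle.disable_static()
x = paddle.to_tensor(np.random.rand(2, 3, 8, 32, 32).astype('float32'))
pool = paddle.nn.AdaptiveMaxPool3D(output_size=4)  # formerly nn.AdaptiveMaxPool3d
out = pool(x)
print(out.shape)  # [2, 3, 4, 4, 4], regardless of the 8x32x32 input extents

This is why the adaptive classes expose no kernel_size or stride: each output cell pools over a window computed from the ratio of input to output size in that dimension.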
@@ -1010,7 +1010,7 @@ class AdaptiveMaxPool3d(layers.Layer): x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. output (Tensor): The output tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type is same as input x. Returns: - A callable object of AdaptiveMaxPool3d. + A callable object of AdaptiveMaxPool3D. Examples: .. code-block:: python @@ -1037,17 +1037,17 @@ class AdaptiveMaxPool3d(layers.Layer): paddle.disable_static() input_data = np.random.rand(2, 3, 8, 32, 32) x = paddle.to_tensor(input_data) - pool = paddle.nn.AdaptiveMaxPool3d(output_size=4) + pool = paddle.nn.AdaptiveMaxPool3D(output_size=4) out = pool(x) # out shape: [2, 3, 4, 4, 4] - pool = paddle.nn.AdaptiveMaxPool3d(output_size=3, return_indices=True) + pool = paddle.nn.AdaptiveMaxPool3D(output_size=3, return_indices=True) out, indices = pool(x) # out shape: [2, 3, 4, 4, 4], indices shape: [2, 3, 4, 4, 4] """ def __init__(self, output_size, return_indices=False, name=None): - super(AdaptiveMaxPool3d, self).__init__() + super(AdaptiveMaxPool3D, self).__init__() self._output_size = output_size self._return_indices = return_indices self._name = name diff --git a/python/paddle/regularizer.py b/python/paddle/regularizer.py index 5cbb86bfef2b4..a1ab329169af2 100644 --- a/python/paddle/regularizer.py +++ b/python/paddle/regularizer.py @@ -61,11 +61,11 @@ class L1Decay(fluid.regularizer.L1Decay): # Example2: set Regularizer in parameters # Set L1 regularization in parameters. # Global regularizer does not take effect on my_conv2d for this case. - from paddle.nn import Conv2d + from paddle.nn import Conv2D from paddle import ParamAttr from paddle.regularizer import L2Decay - my_conv2d = Conv2d( + my_conv2d = Conv2D( in_channels=10, out_channels=10, kernel_size=1, @@ -123,11 +123,11 @@ class L2Decay(fluid.regularizer.L2Decay): # Example2: set Regularizer in parameters # Set L2 regularization in parameters. # Global regularizer does not take effect on my_conv2d for this case. - from paddle.nn import Conv2d + from paddle.nn import Conv2D from paddle import ParamAttr from paddle.regularizer import L2Decay - my_conv2d = Conv2d( + my_conv2d = Conv2D( in_channels=10, out_channels=10, kernel_size=1, diff --git a/python/paddle/tensor/random.py b/python/paddle/tensor/random.py index 3a0435e776eac..7e4d3d7bf9279 100644 --- a/python/paddle/tensor/random.py +++ b/python/paddle/tensor/random.py @@ -59,13 +59,13 @@ def bernoulli(x, name=None): import paddle - paddle.manual_seed(100) # on CPU device + paddle.seed(100) # on CPU device x = paddle.rand([2,3]) print(x.numpy()) # [[0.5535528 0.20714243 0.01162981] # [0.51577556 0.36369765 0.2609165 ]] - paddle.manual_seed(200) # on CPU device + paddle.seed(200) # on CPU device out = paddle.bernoulli(x) print(out.numpy()) # [[0. 0. 0.] 
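The paddle.manual_seed call sites above are renamed to paddle.seed; a minimal reproducibility sketch under that assumption:

import paddle

paddle.seed(100)  # formerly paddle.manual_seed(100)
a = paddle.rand([2, 3])
paddle.seed(100)
b = paddle.rand([2, 3])
print((a.numpy() == b.numpy()).all())  # True: same seed and device give the same draws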
@@ -110,13 +110,13 @@ def multinomial(x, num_samples=1, replacement=False, name=None): import paddle - paddle.manual_seed(100) # on CPU device + paddle.seed(100) # on CPU device x = paddle.rand([2,4]) print(x.numpy()) # [[0.5535528 0.20714243 0.01162981 0.51577556] # [0.36369765 0.2609165 0.18905126 0.5621971 ]] - paddle.manual_seed(200) # on CPU device + paddle.seed(200) # on CPU device out1 = paddle.multinomial(x, num_samples=5, replacement=True) print(out1.numpy()) # [[3 3 0 0 0] @@ -126,7 +126,7 @@ def multinomial(x, num_samples=1, replacement=False, name=None): # InvalidArgumentError: When replacement is False, number of samples # should be less than non-zero categories - paddle.manual_seed(300) # on CPU device + paddle.seed(300) # on CPU device out3 = paddle.multinomial(x, num_samples=3) print(out3.numpy()) # [[3 0 1] diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index 0da110146a8e0..c56c1baa7a8e9 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -52,7 +52,7 @@ def set_printoptions(precision=None, import paddle - paddle.manual_seed(10) + paddle.seed(10) a = paddle.rand([10, 20]) paddle.set_printoptions(4, 100, 3) print(a) diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index bcb910a5ada8a..3513f06234047 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -25,7 +25,7 @@ import paddle from paddle import fluid from paddle import to_tensor -from paddle.nn import Conv2d, Linear, ReLU, Sequential, Softmax +from paddle.nn import Conv2D, Linear, ReLU, Sequential, Softmax from paddle import Model from paddle.static import InputSpec @@ -44,11 +44,11 @@ def __init__(self, num_classes=10): super(LeNetDygraph, self).__init__() self.num_classes = num_classes self.features = Sequential( - Conv2d( + Conv2D( 1, 6, 3, stride=1, padding=1), ReLU(), paddle.fluid.dygraph.Pool2D(2, 'max', 2), - Conv2d( + Conv2D( 6, 16, 5, stride=1, padding=0), ReLU(), paddle.fluid.dygraph.Pool2D(2, 'max', 2)) @@ -142,7 +142,7 @@ def setUpClass(cls): cls.test_dataset, places=cls.device, batch_size=64) seed = 333 - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) dy_lenet = LeNetDygraph() @@ -194,7 +194,7 @@ def test_prepare_context(self): def fit(self, dynamic, num_replicas=None, rank=None): fluid.enable_dygraph(self.device) if dynamic else None seed = 333 - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) net = LeNet() @@ -306,7 +306,7 @@ def __len__(self): class TestModelFunction(unittest.TestCase): def set_seed(self, seed=1024): - paddle.manual_seed(seed) + paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) def test_train_batch(self, dynamic=True): diff --git a/python/paddle/vision/models/lenet.py b/python/paddle/vision/models/lenet.py index b30d5992f9adf..119be85db54b9 100644 --- a/python/paddle/vision/models/lenet.py +++ b/python/paddle/vision/models/lenet.py @@ -38,14 +38,14 @@ def __init__(self, num_classes=10): super(LeNet, self).__init__() self.num_classes = num_classes self.features = nn.Sequential( - nn.Conv2d( + nn.Conv2D( 1, 6, 3, stride=1, padding=1), nn.ReLU(), - nn.MaxPool2d(2, 2), - nn.Conv2d( + nn.MaxPool2D(2, 2), + nn.Conv2D( 6, 16, 5, stride=1, padding=0), nn.ReLU(), - nn.MaxPool2d(2, 2)) + nn.MaxPool2D(2, 2)) if num_classes > 0: self.fc = nn.Sequential( diff --git a/python/paddle/vision/models/mobilenetv1.py 
b/python/paddle/vision/models/mobilenetv1.py index 4e6030bd14bf9..22d177248e8b3 100644 --- a/python/paddle/vision/models/mobilenetv1.py +++ b/python/paddle/vision/models/mobilenetv1.py @@ -36,7 +36,7 @@ def __init__(self, num_groups=1): super(ConvBNLayer, self).__init__() - self._conv = nn.Conv2d( + self._conv = nn.Conv2D( in_channels, out_channels, kernel_size, @@ -45,7 +45,7 @@ def __init__(self, groups=num_groups, bias_attr=False) - self._norm_layer = nn.BatchNorm2d(out_channels) + self._norm_layer = nn.BatchNorm2D(out_channels) self._act = nn.ReLU() def forward(self, x): @@ -214,7 +214,7 @@ def __init__(self, scale=1.0, num_classes=1000, with_pool=True): self.dwsl.append(dws6) if with_pool: - self.pool2d_avg = nn.AdaptiveAvgPool2d(1) + self.pool2d_avg = nn.AdaptiveAvgPool2D(1) if num_classes > 0: self.fc = nn.Linear(int(1024 * scale), num_classes) diff --git a/python/paddle/vision/models/mobilenetv2.py b/python/paddle/vision/models/mobilenetv2.py index 0f4dc22f679df..f1cbaab1f90ac 100644 --- a/python/paddle/vision/models/mobilenetv2.py +++ b/python/paddle/vision/models/mobilenetv2.py @@ -46,11 +46,11 @@ def __init__(self, kernel_size=3, stride=1, groups=1, - norm_layer=nn.BatchNorm2d): + norm_layer=nn.BatchNorm2D): padding = (kernel_size - 1) // 2 super(ConvBNReLU, self).__init__( - nn.Conv2d( + nn.Conv2D( in_planes, out_planes, kernel_size, @@ -68,7 +68,7 @@ def __init__(self, oup, stride, expand_ratio, - norm_layer=nn.BatchNorm2d): + norm_layer=nn.BatchNorm2D): super(InvertedResidual, self).__init__() self.stride = stride assert stride in [1, 2] @@ -88,7 +88,7 @@ def __init__(self, stride=stride, groups=hidden_dim, norm_layer=norm_layer), - nn.Conv2d( + nn.Conv2D( hidden_dim, oup, 1, 1, 0, bias_attr=False), norm_layer(oup), ]) @@ -127,7 +127,7 @@ def __init__(self, scale=1.0, num_classes=1000, with_pool=True): block = InvertedResidual round_nearest = 8 - norm_layer = nn.BatchNorm2d + norm_layer = nn.BatchNorm2D inverted_residual_setting = [ [1, 16, 1, 1], [6, 24, 2, 2], @@ -169,7 +169,7 @@ def __init__(self, scale=1.0, num_classes=1000, with_pool=True): self.features = nn.Sequential(*features) if with_pool: - self.pool2d_avg = nn.AdaptiveAvgPool2d(1) + self.pool2d_avg = nn.AdaptiveAvgPool2D(1) if self.num_classes > 0: self.classifier = nn.Sequential( diff --git a/python/paddle/vision/models/resnet.py b/python/paddle/vision/models/resnet.py index 3ae01b6fd7d76..8cf797f1719e9 100644 --- a/python/paddle/vision/models/resnet.py +++ b/python/paddle/vision/models/resnet.py @@ -52,17 +52,17 @@ def __init__(self, norm_layer=None): super(BasicBlock, self).__init__() if norm_layer is None: - norm_layer = nn.BatchNorm2d + norm_layer = nn.BatchNorm2D if dilation > 1: raise NotImplementedError( "Dilation > 1 not supported in BasicBlock") - self.conv1 = nn.Conv2d( + self.conv1 = nn.Conv2D( inplanes, planes, 3, padding=1, stride=stride, bias_attr=False) self.bn1 = norm_layer(planes) self.relu = nn.ReLU() - self.conv2 = nn.Conv2d(planes, planes, 3, padding=1, bias_attr=False) + self.conv2 = nn.Conv2D(planes, planes, 3, padding=1, bias_attr=False) self.bn2 = norm_layer(planes) self.downsample = downsample self.stride = stride @@ -101,13 +101,13 @@ def __init__(self, norm_layer=None): super(BottleneckBlock, self).__init__() if norm_layer is None: - norm_layer = nn.BatchNorm2d + norm_layer = nn.BatchNorm2D width = int(planes * (base_width / 64.)) * groups - self.conv1 = nn.Conv2d(inplanes, width, 1, bias_attr=False) + self.conv1 = nn.Conv2D(inplanes, width, 1, bias_attr=False) self.bn1 = 
norm_layer(width) - self.conv2 = nn.Conv2d( + self.conv2 = nn.Conv2D( width, width, 3, @@ -118,7 +118,7 @@ def __init__(self, bias_attr=False) self.bn2 = norm_layer(width) - self.conv3 = nn.Conv2d( + self.conv3 = nn.Conv2D( width, planes * self.expansion, 1, bias_attr=False) self.bn3 = norm_layer(planes * self.expansion) self.relu = nn.ReLU() @@ -183,12 +183,12 @@ def __init__(self, block, depth, num_classes=1000, with_pool=True): layers = layer_cfg[depth] self.num_classes = num_classes self.with_pool = with_pool - self._norm_layer = nn.BatchNorm2d + self._norm_layer = nn.BatchNorm2D self.inplanes = 64 self.dilation = 1 - self.conv1 = nn.Conv2d( + self.conv1 = nn.Conv2D( 3, self.inplanes, kernel_size=7, @@ -197,13 +197,13 @@ def __init__(self, block, depth, num_classes=1000, with_pool=True): bias_attr=False) self.bn1 = self._norm_layer(self.inplanes) self.relu = nn.ReLU() - self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.maxpool = nn.MaxPool2D(kernel_size=3, stride=2, padding=1) self.layer1 = self._make_layer(block, 64, layers[0]) self.layer2 = self._make_layer(block, 128, layers[1], stride=2) self.layer3 = self._make_layer(block, 256, layers[2], stride=2) self.layer4 = self._make_layer(block, 512, layers[3], stride=2) if with_pool: - self.avgpool = nn.AdaptiveAvgPool2d((1, 1)) + self.avgpool = nn.AdaptiveAvgPool2D((1, 1)) if num_classes > 0: self.fc = nn.Linear(512 * block.expansion, num_classes) @@ -217,7 +217,7 @@ def _make_layer(self, block, planes, blocks, stride=1, dilate=False): stride = 1 if stride != 1 or self.inplanes != planes * block.expansion: downsample = nn.Sequential( - nn.Conv2d( + nn.Conv2D( self.inplanes, planes * block.expansion, 1, diff --git a/python/paddle/vision/models/vgg.py b/python/paddle/vision/models/vgg.py index 2d62e1d22d430..bb158569d3bc9 100644 --- a/python/paddle/vision/models/vgg.py +++ b/python/paddle/vision/models/vgg.py @@ -57,7 +57,7 @@ class VGG(nn.Layer): def __init__(self, features, num_classes=1000): super(VGG, self).__init__() self.features = features - self.avgpool = nn.AdaptiveAvgPool2d((7, 7)) + self.avgpool = nn.AdaptiveAvgPool2D((7, 7)) self.classifier = nn.Sequential( nn.Linear(512 * 7 * 7, 4096), nn.ReLU(), @@ -80,11 +80,11 @@ def make_layers(cfg, batch_norm=False): in_channels = 3 for v in cfg: if v == 'M': - layers += [nn.MaxPool2d(kernel_size=2, stride=2)] + layers += [nn.MaxPool2D(kernel_size=2, stride=2)] else: - conv2d = nn.Conv2d(in_channels, v, kernel_size=3, padding=1) + conv2d = nn.Conv2D(in_channels, v, kernel_size=3, padding=1) if batch_norm: - layers += [conv2d, nn.BatchNorm2d(v), nn.ReLU()] + layers += [conv2d, nn.BatchNorm2D(v), nn.ReLU()] else: layers += [conv2d, nn.ReLU()] in_channels = v From e8db4412d00b9fb72f9a0a04d90f15fbf861c1fa Mon Sep 17 00:00:00 2001 From: swtkiwi <1208425345@qq.com> Date: Wed, 21 Oct 2020 19:19:13 +0800 Subject: [PATCH 025/185] delete matplotlib (#28159) --- python/requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/python/requirements.txt b/python/requirements.txt index 6e14636b0a4cd..138220b405748 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -3,9 +3,7 @@ numpy>=1.13, <=1.16.4 ; python_version<"3.5" numpy>=1.13 ; python_version>="3.5" protobuf>=3.1.0 gast==0.3.3 -matplotlib<=2.2.4 ; python_version<"3.6" scipy>=0.19.0, <=1.2.1 ; python_version<"3.5" -matplotlib<=3.2.1 ; python_version>="3.6" scipy<=1.3.1 ; python_version=="3.5" scipy ; python_version>"3.5" rarfile From 4873c20d747372fe9f60491d4b3f3373b5721d44 Mon Sep 17 00:00:00 2001 
From: lilong12 Date: Wed, 21 Oct 2020 19:19:29 +0800 Subject: [PATCH 026/185] modify ut cmakefile (#28140) * modify ut cmakefile, test=develop --- .../fluid/tests/unittests/CMakeLists.txt | 12 +++---- .../unittests/test_collective_api_base.py | 32 +++---------------- 2 files changed, 10 insertions(+), 34 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2fa03f205ba78..39e44f6aaa1ff 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -15,12 +15,6 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer) list(APPEND DIST_TEST_OPS test_listen_and_serv_op) list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) -list(APPEND DIST_TEST_OPS test_collective_reduce_api) -list(APPEND DIST_TEST_OPS test_collective_scatter_api) -list(APPEND DIST_TEST_OPS test_collective_barrier_api) -list(APPEND DIST_TEST_OPS test_collective_allreduce_api) -list(APPEND DIST_TEST_OPS test_collective_broadcast_api) -list(APPEND DIST_TEST_OPS test_collective_allgather_api) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -70,6 +64,12 @@ if(NOT WITH_GPU OR WIN32) LIST(REMOVE_ITEM TEST_OPS test_collective_scatter) LIST(REMOVE_ITEM TEST_OPS test_reducescatter) LIST(REMOVE_ITEM TEST_OPS test_reducescatter_api) + LIST(REMOVE_ITEM TEST_OPS test_collective_reduce_api) + LIST(REMOVE_ITEM TEST_OPS test_collective_scatter_api) + LIST(REMOVE_ITEM TEST_OPS test_collective_barrier_api) + LIST(REMOVE_ITEM TEST_OPS test_collective_allreduce_api) + LIST(REMOVE_ITEM TEST_OPS test_collective_broadcast_api) + LIST(REMOVE_ITEM TEST_OPS test_collective_allgather_api) endif() #TODO(sunxiaolong01): Fix this unitest failed on GCC8. 
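The test-utility change below also switches the trainer logs from fixed /tmp names to per-process names, so concurrent CI invocations cannot clobber one another; a minimal sketch of the pattern (the helper name is hypothetical):

import os

def trainer_log_path(rank):
    # one file per launching process: parallel runs of the same suite
    # get distinct paths instead of racing on a shared /tmp/tr0_err.log
    return "/tmp/tr%d_err_%d.log" % (rank, os.getpid())

print(trainer_log_path(0))  # e.g. /tmp/tr0_err_12345.log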
diff --git a/python/paddle/fluid/tests/unittests/test_collective_api_base.py b/python/paddle/fluid/tests/unittests/test_collective_api_base.py index b1e87a306140a..84b58f15f887b 100644 --- a/python/paddle/fluid/tests/unittests/test_collective_api_base.py +++ b/python/paddle/fluid/tests/unittests/test_collective_api_base.py @@ -37,30 +37,6 @@ def get_model(self, train_prog, startup_prog, rank): raise NotImplementedError( "get model should be implemented by child class.") - def wait_server_ready(self, endpoints): - assert not isinstance(endpoints, string_types) - while True: - all_ok = True - not_ready_endpoints = [] - for ep in endpoints: - ip_port = ep.split(":") - with closing( - socket.socket(socket.AF_INET, - socket.SOCK_STREAM)) as sock: - sock.settimeout(2) - result = sock.connect_ex((ip_port[0], int(ip_port[1]))) - if result != 0: - all_ok = False - not_ready_endpoints.append(ep) - if not all_ok: - sys.stderr.write("server not ready, wait 3 sec to retry...\n") - sys.stderr.write("not ready endpoints:" + str( - not_ready_endpoints) + "\n") - sys.stderr.flush() - time.sleep(3) - else: - break - def run_trainer(self, args): train_prog = fluid.Program() startup_prog = fluid.Program() @@ -157,8 +133,8 @@ def _run_cluster(self, model_file, envs): tr_cmd = "%s %s" tr0_cmd = tr_cmd % (self._python_interp, model_file) tr1_cmd = tr_cmd % (self._python_interp, model_file) - tr0_pipe = open("/tmp/tr0_err.log", "w") - tr1_pipe = open("/tmp/tr1_err.log", "w") + tr0_pipe = open("/tmp/tr0_err_%d.log" % os.getpid(), "w") + tr1_pipe = open("/tmp/tr1_err_%d.log" % os.getpid(), "w") #print(tr0_cmd) tr0_proc = subprocess.Popen( tr0_cmd.strip().split(), @@ -179,9 +155,9 @@ def _run_cluster(self, model_file, envs): # close trainer file tr0_pipe.close() tr1_pipe.close() - with open("/tmp/tr0_err.log", "r") as f: + with open("/tmp/tr0_err_%d.log" % os.getpid(), "r") as f: sys.stderr.write('trainer 0 stderr file: %s\n' % f.read()) - with open("/tmp/tr1_err.log", "r") as f: + with open("/tmp/tr1_err_%d.log" % os.getpid(), "r") as f: sys.stderr.write('trainer 1 stderr file: %s\n' % f.read()) return pickle.loads(tr0_out), pickle.loads( tr1_out), tr0_proc.pid, tr1_proc.pid From 5d73bfdb9860ecdeb21965da8d5585eb216e9ccb Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 21 Oct 2020 21:05:06 +0800 Subject: [PATCH 027/185] fix test_weight_decay_extend error (#28178) --- .../fluid/contrib/tests/test_weight_decay_extend.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py index 5ed7fd01a433b..65d400c63262b 100644 --- a/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py +++ b/python/paddle/fluid/contrib/tests/test_weight_decay_extend.py @@ -149,17 +149,20 @@ def check_weight_decay2(self, place, model): avg_cost = model(data, label, self.word_dict_len) + optimizer = fluid.optimizer.Adam(learning_rate=self.learning_rate) + + params_grads = optimizer.backward(avg_cost) + param_list = [(var, var * self.learning_rate) for var in main_prog.block(0).all_parameters()] - optimizer = fluid.optimizer.Adam(learning_rate=self.learning_rate) - - optimizer.minimize(avg_cost) for params in param_list: updated_p = fluid.layers.elementwise_sub( x=params[0], y=params[1]) fluid.layers.assign(input=updated_p, output=params[0]) + optimizer.apply_optimize(avg_cost, startup_prog, params_grads) + param_sum = self.run_program(place, [data, label]) return param_sum From 
f935ca8a506374fdce3d6f75aab82ad29370ae71 Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 21 Oct 2020 21:09:37 +0800 Subject: [PATCH 028/185] [lite-xpu-subgraph] Fix xpu compile and test xpu ci. (#27932) --- cmake/external/lite.cmake | 2 +- paddle/fluid/inference/api/analysis_config.cc | 2 +- paddle/fluid/inference/lite/CMakeLists.txt | 2 +- paddle/fluid/inference/lite/engine.cc | 4 ++-- paddle/fluid/inference/tests/api/lite_resnet50_test.cc | 8 ++++++-- paddle/fluid/pybind/inference_api.cc | 2 ++ 6 files changed, 13 insertions(+), 7 deletions(-) diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index 1da47bba7b6a5..9781d33966679 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -22,7 +22,7 @@ if(XPU_SDK_ROOT) set(LITE_WITH_XPU ON) include_directories("${XPU_SDK_ROOT}/XTDK/include") include_directories("${XPU_SDK_ROOT}/XTCL/include") - add_definitions(-DPADDLE_WITH_XPU) + add_definitions(-DLITE_SUBGRAPH_WITH_XPU) LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/shlib/") LINK_DIRECTORIES("${XPU_SDK_ROOT}/XTDK/runtime/shlib/") endif() diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 6965a0c904105..009ebd520c2b6 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -381,7 +381,7 @@ void AnalysisConfig::Update() { } if (use_xpu_) { -#ifndef PADDLE_WITH_XPU +#ifndef LITE_SUBGRAPH_WITH_XPU PADDLE_THROW(platform::errors::Unavailable( "You tried to use an XPU device, but Paddle was not compiled " "with XPU-runtime.")); diff --git a/paddle/fluid/inference/lite/CMakeLists.txt b/paddle/fluid/inference/lite/CMakeLists.txt index fd513b59588f8..924d273a9fccd 100644 --- a/paddle/fluid/inference/lite/CMakeLists.txt +++ b/paddle/fluid/inference/lite/CMakeLists.txt @@ -4,6 +4,6 @@ endif() cc_library(lite_op_teller SRCS op_teller.cc DEPS lite_full_static framework_proto device_context boost xxhash) cc_library(lite_engine SRCS engine.cc DEPS lite_full_static framework_proto ${XPU_DEPS}) -cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context) +cc_library(lite_tensor_utils SRCS tensor_utils.cc DEPS memcpy lite_full_static framework_proto boost device_context ${XPU_DEPS}) cc_test(test_lite_engine SRCS test_engine.cc DEPS lite_engine protobuf framework_proto glog gtest analysis) cc_test(test_lite_tensor_utils SRCS test_tensor_utils.cc DEPS lite_engine lite_tensor_utils) diff --git a/paddle/fluid/inference/lite/engine.cc b/paddle/fluid/inference/lite/engine.cc index 5f24ef00bce59..b8f6104780f1e 100644 --- a/paddle/fluid/inference/lite/engine.cc +++ b/paddle/fluid/inference/lite/engine.cc @@ -16,7 +16,7 @@ #define LITE_WITH_CUDA 1 #endif -#ifdef PADDLE_WITH_XPU +#ifdef LITE_SUBGRAPH_WITH_XPU #define LITE_WITH_XPU 1 #endif @@ -59,7 +59,7 @@ paddle::lite_api::PaddlePredictor* EngineManager::Create( cfg.cpu_math_library_num_threads); #endif -#ifdef PADDLE_WITH_XPU +#ifdef LITE_SUBGRAPH_WITH_XPU lite_cxx_config.set_xpu_workspace_l3_size_per_thread( cfg.xpu_l3_workspace_size); #endif diff --git a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc index d68065553a9cf..b88f09ae6a6a8 100644 --- a/paddle/fluid/inference/tests/api/lite_resnet50_test.cc +++ b/paddle/fluid/inference/tests/api/lite_resnet50_test.cc @@ -26,7 +26,11 @@ namespace inference { TEST(AnalysisPredictor, use_gpu) { std::string model_dir = FLAGS_infer_model + "/" + "model"; 
AnalysisConfig config; +#if defined(PADDLE_WITH_CUDA) config.EnableUseGpu(100, 0); +#elif defined(LITE_SUBGRAPH_WITH_XPU) + config.EnableXpu(100); +#endif config.SetModel(model_dir + "/model", model_dir + "/params"); config.EnableLiteEngine(paddle::AnalysisConfig::Precision::kFloat32, true); @@ -40,7 +44,7 @@ TEST(AnalysisPredictor, use_gpu) { std::vector input(input_num, 1); PaddleTensor in; - in.shape = {1, 3, 318, 318}; + in.shape = {batch, channel, height, width}; in.data = PaddleBuf(static_cast(input.data()), input_num * sizeof(float)); in.dtype = PaddleDType::FLOAT32; @@ -92,7 +96,7 @@ TEST(Predictor, use_gpu) { auto input_names = predictor->GetInputNames(); auto input_t = predictor->GetInputHandle(input_names[0]); - input_t->Reshape({1, 3, 318, 318}); + input_t->Reshape({batch, channel, height, width}); input_t->CopyFromCpu(input.data()); predictor->Run(); diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index ac615a2320daa..e503ca31cdd74 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -443,6 +443,8 @@ void BindAnalysisConfig(py::module *m) { .def("params_file", &AnalysisConfig::params_file) .def("enable_use_gpu", &AnalysisConfig::EnableUseGpu, py::arg("memory_pool_init_size_mb"), py::arg("device_id") = 0) + .def("enable_xpu", &AnalysisConfig::EnableXpu, + py::arg("l3_workspace_size")) .def("disable_gpu", &AnalysisConfig::DisableGpu) .def("use_gpu", &AnalysisConfig::use_gpu) .def("gpu_device_id", &AnalysisConfig::gpu_device_id) From c226b2e45a04fa811c9eba162be3bb527aa4faeb Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Thu, 22 Oct 2020 09:39:19 +0800 Subject: [PATCH 029/185] update dockerfile (#27589) * update dockerfile * update dockerfile * update dockerfile * update dockerfile * add opencv in ci * update cidockerfile * test nccl * fix diff * fix dockerfile * update ubuntu nccl2.7.8 * update ubuntu nccl2.7.8 --- Dockerfile | 7 +------ paddle/scripts/paddle_build.sh | 8 ++++---- tools/dockerfile/Dockerfile.centos | 15 +++++++-------- tools/dockerfile/Dockerfile.ubuntu | 5 ++--- tools/dockerfile/build_scripts/install_nccl2.sh | 10 +++++++--- tools/dockerfile/ci_dockerfile.sh | 15 ++++++++++++--- tools/manylinux1/Dockerfile.CI35-GCC8 | 10 +++++----- .../Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 | 7 +------ .../Dockerfile.cuda10_cudnn7_gcc8_py35_centos6 | 10 +++++----- .../Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 | 5 ++++- .../Dockerfile.cuda9_cudnn7_gcc48_py35_centos6 | 10 +++++----- tools/manylinux1/Dockerfile.x64 | 10 +++++----- tools/manylinux1/build_scripts/install_nccl2.sh | 12 ++++++++---- tools/xly_Dockerfile/Dockerfile.GCC8 | 6 +----- 14 files changed, 67 insertions(+), 63 deletions(-) diff --git a/Dockerfile b/Dockerfile index b92ac228a8d50..daab4340e3570 100644 --- a/Dockerfile +++ b/Dockerfile @@ -156,16 +156,12 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 --no-cache-dir install opencv-python && \ pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 --no-cache-dir install opencv-python && \ pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 --no-cache-dir install opencv-python && \ pip 
--no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip --no-cache-dir install opencv-python + pip --no-cache-dir install 'ipykernel==4.6.0' #For docstring checker RUN pip3 --no-cache-dir install pylint pytest astroid isort @@ -207,7 +203,6 @@ RUN wget --no-check-certificate https://pslib.bj.bcebos.com/openmpi-1.4.5.tar.gz export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH && export PATH=/usr/local/bin:$PATH && cd .. && \ rm -rf openmpi-1.4.5.tar.gz && pip --no-cache-dir install mpi4py && ln -fs /bin/bash /bin/sh && \ apt-get install libprotobuf-dev -y -RUN pip --no-cache-dir install -U netifaces==0.10.9 # Older versions of patchelf limited the size of the files being processed and were fixed in this pr. # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index e4ad4f80b3887..016188057ad3c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1407,7 +1407,7 @@ EOF # run paddle version to install python packages first RUN apt-get update && ${NCCL_DEPS} RUN apt-get install -y wget python3 python3-pip libgtk2.0-dev dmidecode python3-tk && \ - pip3 install opencv-python py-cpuinfo==5.0.0 && wget ${ref_web}/${ref_paddle35} && ${ref_paddle35_mv1} pip3 install ${ref_paddle35_whl} ${ref_paddle35_mv2}; apt-get install -f -y && \ + pip3 install py-cpuinfo==5.0.0 && wget ${ref_web}/${ref_paddle35} && ${ref_paddle35_mv1} pip3 install ${ref_paddle35_whl} ${ref_paddle35_mv2}; apt-get install -f -y && \ apt-get clean -y && \ rm -f ${ref_paddle35} && \ ldconfig @@ -1429,7 +1429,7 @@ EOF CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.6.0.tgz RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \ - pip3.6 install opencv-python && wget ${ref_web}/${ref_paddle36} && ${ref_paddle36_mv1} pip3.6 install ${ref_paddle36_whl} ${ref_paddle36_mv2}; apt-get install -f -y && \ + wget ${ref_web}/${ref_paddle36} && ${ref_paddle36_mv1} pip3.6 install ${ref_paddle36_whl} ${ref_paddle36_mv2}; apt-get install -f -y && \ apt-get clean -y && \ rm -f ${ref_paddle36} && \ ldconfig @@ -1445,7 +1445,7 @@ EOF CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.7.0.tgz RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \ - pip3.7 install opencv-python && wget ${ref_web}/${ref_paddle37} && pip3.7 install ${ref_paddle37_whl}; apt-get install -f -y && \ + wget ${ref_web}/${ref_paddle37} && pip3.7 install ${ref_paddle37_whl}; apt-get install -f -y && \ apt-get clean -y && \ rm -f ${ref_paddle37} && \ ldconfig @@ -1461,7 +1461,7 @@ EOF CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null && cd ../ && rm Python-3.8.0.tgz RUN apt-get install -y libgtk2.0-dev dmidecode python3-tk && ldconfig && \ - pip3.8 install opencv-python && wget ${ref_web}/${ref_paddle38} && pip3.8 install ${ref_paddle38_whl}; apt-get install -f -y && \ + wget ${ref_web}/${ref_paddle38} && pip3.8 install ${ref_paddle38_whl}; apt-get install -f -y && \ apt-get clean -y && \ rm -f ${ref_paddle38} && \ ldconfig diff --git a/tools/dockerfile/Dockerfile.centos b/tools/dockerfile/Dockerfile.centos index 
a50d08354b8b4..c88d5927cf0cf 100644 --- a/tools/dockerfile/Dockerfile.centos +++ b/tools/dockerfile/Dockerfile.centos @@ -16,8 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig RUN yum install -y gettext-devel sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz wget curl-devel COPY build_scripts /build_scripts RUN bash build_scripts/build.sh -RUN bash build_scripts/install_nccl2.sh && \ - bash build_scripts/install_trt.sh +RUN bash build_scripts/install_nccl2.sh RUN rm -rf build_scripts ENV SSL_CERT_FILE=/opt/_internal/certs.pem @@ -63,12 +62,12 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /o go get github.com/Masterminds/glide && \ rm -rf /root/requirements.txt -RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python==4.2.0.32 +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.15-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.8.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.8.0/bin/pip3 install pre-commit 'ipython==5.3.0' RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index b6fe78eef3f71..84473279364e5 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -126,7 +126,8 @@ RUN curl -s -q https://glide.sh/get | sh # Downgrade TensorRT COPY tools/dockerfile/build_scripts /build_scripts -RUN bash /build_scripts/install_trt.sh +RUN bash /build_scripts/install_trt.sh && \ + bash /build_scripts/install_nccl2.sh RUN rm -rf /build_scripts # git credential to skip password typing @@ -202,8 +203,6 @@ RUN wget -q 
https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/ cd binutils-2.27 && \ ./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz -RUN pip --no-cache-dir install -U netifaces==0.10.9 - # Older versions of patchelf limited the size of the files being processed and were fixed in this pr. # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa # So install a newer version here. diff --git a/tools/dockerfile/build_scripts/install_nccl2.sh b/tools/dockerfile/build_scripts/install_nccl2.sh index 6307a52edd18b..0c9bf1409d90d 100644 --- a/tools/dockerfile/build_scripts/install_nccl2.sh +++ b/tools/dockerfile/build_scripts/install_nccl2.sh @@ -2,21 +2,24 @@ VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //") if [ "$VERSION" == "10.0" ]; then DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" +elif [ "$VERSION" == "10.2" ]; then + DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" elif [ "$VERSION" == "10.1" ]; then DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" elif [ "$VERSION" == "9.0" ]; then DEB="nccl-repo-ubuntu1604-2.3.7-ga-cuda9.0_1-1_amd64.deb" else - DEB="nccl-repo-ubuntu1604-2.1.15-ga-cuda8.0_1-1_amd64.deb" + echo "nccl not found" + exit 2 fi -URL="http://nccl2-deb.gz.bcebos.com/$DEB" +URL="http://nccl2-deb.cdn.bcebos.com/$DEB" DIR="/nccl2" mkdir -p $DIR # we cached the nccl2 deb package in BOS, so we can download it with wget # install nccl2: http://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#down -wget -O $DIR/$DEB $URL +wget -q -O $DIR/$DEB $URL cd $DIR && ar x $DEB && tar xf data.tar.xz DEBS=$(find ./var/ -name "*.deb") @@ -26,4 +29,5 @@ for sub_deb in $DEBS; do done mv -f usr/include/nccl.h /usr/local/include/ mv -f usr/lib/x86_64-linux-gnu/libnccl* /usr/local/lib/ +rm /usr/include/nccl.h rm -rf $DIR diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index 9c8f8f563abb7..eea7bfda9af73 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -4,7 +4,7 @@ function make_ubuntu_dockerfile(){ sed 's//10.1-cudnn7-devel-ubuntu16.04/g' ./Dockerfile.ubuntu >${dockerfile_name} sed -i 's#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g' ${dockerfile_name} dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'` - sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ + sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} sed -i 's##WORKDIR /usr/bin \ COPY tools/dockerfile/build_scripts /build_scripts \ @@ -15,7 +15,9 @@ function make_ubuntu_dockerfile(){ RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \ RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \ ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g' ${dockerfile_name} - + sed -i 's#bash /build_scripts/install_nccl2.sh#wget --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb && \ + dpkg -i nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb && \ + apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 --allow-change-held-packages #g' ${dockerfile_name} } @@ -27,8 +29,15 @@ function make_centos_dockerfile(){ sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so && \ ln -s 
/usr/local/lib/libnccl.so /usr/local/cuda/lib64/ && \ rm -rf /usr/include/NvInfer*" ${dockerfile_name} - sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ + sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + sed -i 's#RUN bash build_scripts/install_nccl2.sh##g' ${dockerfile_name} + sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-2.7.8-1+cuda10.2.x86_64.rpm && \ + wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-devel-2.7.8-1+cuda10.2.x86_64.rpm && \ + wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-static-2.7.8-1+cuda10.2.x86_64.rpm && \ + rpm -ivh libnccl-2.7.8-1+cuda10.2.x86_64.rpm && \ + rpm -ivh libnccl-devel-2.7.8-1+cuda10.2.x86_64.rpm && \ + rpm -ivh libnccl-static-2.7.8-1+cuda10.2.x86_64.rpm && rm -f /usr/local/include/nccl.h " ${dockerfile_name} sed -i 's##WORKDIR /usr/bin \ COPY tools/dockerfile/build_scripts /build_scripts \ RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \ diff --git a/tools/manylinux1/Dockerfile.CI35-GCC8 b/tools/manylinux1/Dockerfile.CI35-GCC8 index 2e7264a3f5254..e0c5d16bad64a 100644 --- a/tools/manylinux1/Dockerfile.CI35-GCC8 +++ b/tools/manylinux1/Dockerfile.CI35-GCC8 @@ -61,11 +61,11 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \ rm -rf /root/requirements.txt -RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm 
swig-2.0.12.tar.gz diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 index e3a3374b943bc..e996ec0e7651f 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc48_ubuntu16 @@ -161,16 +161,12 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 --no-cache-dir install opencv-python && \ pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 --no-cache-dir install opencv-python && \ pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 --no-cache-dir install opencv-python && \ pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip --no-cache-dir install opencv-python + pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' #For docstring checker RUN pip3 --no-cache-dir install pylint pytest astroid isort @@ -212,7 +208,6 @@ RUN wget --no-check-certificate https://pslib.bj.bcebos.com/openmpi-1.4.5.tar.gz export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH && export PATH=/usr/local/bin:$PATH && cd .. && \ rm -rf openmpi-1.4.5.tar.gz && pip --no-cache-dir install mpi4py && ln -fs /bin/bash /bin/sh && \ apt-get install libprotobuf-dev -y -RUN pip --no-cache-dir install -U netifaces==0.10.9 # ccache 3.7.9 RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_py35_centos6 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_py35_centos6 index fa80ae72c39ed..30f84141745cc 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_py35_centos6 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_py35_centos6 @@ -70,11 +70,11 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \ rm -rf /root/requirements.txt -RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' && \ + 
LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 index 424a6f3886821..7696e0523dedc 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 @@ -138,6 +138,10 @@ RUN curl -s -q https://glide.sh/get | sh # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. +RUN wget -q --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb && \ + dpkg -i nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb && \ + apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 --allow-change-held-packages + RUN wget -q https://paddlepaddledeps.bj.bcebos.com/TensorRT-6.0.1.5.Ubuntu-16.04.x86_64-gnu.cuda-10.1.cudnn7.tar.gz --no-check-certificate && \ tar -zxf TensorRT-6.0.1.5.Ubuntu-16.04.x86_64-gnu.cuda-10.1.cudnn7.tar.gz -C /usr/local && \ cp -rf /usr/local/TensorRT-6.0.1.5/include/* /usr/include/ && cp -rf /usr/local/TensorRT-6.0.1.5/lib/* /usr/lib/ @@ -217,7 +221,6 @@ RUN wget -q https://paddle-ci.gz.bcebos.com/binutils_2.27.orig.tar.gz && \ ./configure && make -j && make install && cd .. 
&& rm -rf binutils-2.27 binutils_2.27.orig.tar.gz RUN apt-get install libprotobuf-dev -y -RUN pip --no-cache-dir install -U netifaces==0.10.9 # ccache 3.7.9 RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ diff --git a/tools/manylinux1/Dockerfile.cuda9_cudnn7_gcc48_py35_centos6 b/tools/manylinux1/Dockerfile.cuda9_cudnn7_gcc48_py35_centos6 index 1f972c583cb83..82918fca37f97 100644 --- a/tools/manylinux1/Dockerfile.cuda9_cudnn7_gcc48_py35_centos6 +++ b/tools/manylinux1/Dockerfile.cuda9_cudnn7_gcc48_py35_centos6 @@ -56,11 +56,11 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install -r /root/requirements.txt && \ rm -rf /root/requirements.txt -RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index a02decb6a5af7..7ad1b3554ab48 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -51,11 +51,11 @@ RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /o go get github.com/Masterminds/glide && \ rm -rf /root/requirements.txt -RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ - 
LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python && \ - LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' opencv-python +RUN LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs4/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27mu/bin/pip install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-2.7.11-ucs2/lib:${LD_LIBRARY_PATH} /opt/python/cp27-cp27m/bin/pip install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.5.1/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.5.1/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.6.0/bin/pip3 install pre-commit 'ipython==5.3.0' && \ + LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} /opt/_internal/cpython-3.7.0/bin/pip3 install pre-commit 'ipython==5.3.0' RUN wget -O /opt/swig-2.0.12.tar.gz https://sourceforge.net/projects/swig/files/swig/swig-2.0.12/swig-2.0.12.tar.gz/download && \ cd /opt && tar xzf swig-2.0.12.tar.gz && cd /opt/swig-2.0.12 && ./configure && make && make install && cd /opt && rm swig-2.0.12.tar.gz diff --git a/tools/manylinux1/build_scripts/install_nccl2.sh b/tools/manylinux1/build_scripts/install_nccl2.sh index 01ee51765f551..0c9bf1409d90d 100644 --- a/tools/manylinux1/build_scripts/install_nccl2.sh +++ b/tools/manylinux1/build_scripts/install_nccl2.sh @@ -2,21 +2,24 @@ VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //") if [ "$VERSION" == "10.0" ]; then DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" +elif [ "$VERSION" == "10.2" ]; then + DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" elif [ "$VERSION" == "10.1" ]; then DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" elif [ "$VERSION" == "9.0" ]; then - DEB="nccl-repo-ubuntu1604-2.1.15-ga-cuda9.0_1-1_amd64.deb" + DEB="nccl-repo-ubuntu1604-2.3.7-ga-cuda9.0_1-1_amd64.deb" else - DEB="nccl-repo-ubuntu1604-2.1.15-ga-cuda8.0_1-1_amd64.deb" + echo "nccl not found" + exit 2 fi -URL="http://nccl2-deb.gz.bcebos.com/$DEB" +URL="http://nccl2-deb.cdn.bcebos.com/$DEB" DIR="/nccl2" mkdir -p $DIR # we cached the nccl2 deb package in BOS, so we can download it with wget # install nccl2: http://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html#down -wget --no-proxy -O $DIR/$DEB $URL +wget -q -O $DIR/$DEB $URL cd $DIR && ar x $DEB && tar xf data.tar.xz DEBS=$(find ./var/ -name "*.deb") @@ -26,4 +29,5 @@ for sub_deb in $DEBS; do done mv -f usr/include/nccl.h /usr/local/include/ mv -f usr/lib/x86_64-linux-gnu/libnccl* /usr/local/lib/ +rm /usr/include/nccl.h rm -rf $DIR diff --git a/tools/xly_Dockerfile/Dockerfile.GCC8 b/tools/xly_Dockerfile/Dockerfile.GCC8 index 81de81365da0c..f453d50263be1 100644 --- a/tools/xly_Dockerfile/Dockerfile.GCC8 +++ b/tools/xly_Dockerfile/Dockerfile.GCC8 @@ -147,16 +147,12 @@ RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \ RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 --no-cache-dir install opencv-python && \ pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.6 --no-cache-dir install opencv-python && \ 
pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.7 --no-cache-dir install opencv-python && \ pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip --no-cache-dir install opencv-python + pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' #For docstring checker RUN pip3 --no-cache-dir install pylint pytest astroid isort From ee4309e6fc5bc5757ff1df22540a006ac0e05a3e Mon Sep 17 00:00:00 2001 From: gongweibao Date: Thu, 22 Oct 2020 09:39:44 +0800 Subject: [PATCH 030/185] upgrade shellcheck version (#28021) test=develop --- tools/dockerfile/Dockerfile.ubuntu | 6 +++++- tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 | 6 +++++- 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index 84473279364e5..4f8b092ceea65 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -29,9 +29,13 @@ RUN apt-get update && \ python-matplotlib \ automake locales clang-format swig \ liblapack-dev liblapacke-dev \ - net-tools libtool module-init-tools shellcheck && \ + net-tools libtool module-init-tools && \ apt-get clean -y +RUN wget https://github.com/koalaman/shellcheck/releases/download/v0.7.1/shellcheck-v0.7.1.linux.x86_64.tar.xz -O shellcheck-v0.7.1.linux.x86_64.tar.xz && \ + tar -xf shellcheck-v0.7.1.linux.x86_64.tar.xz && cp shellcheck-v0.7.1/shellcheck /usr/bin/shellcheck && \ + rm -rf shellcheck-v0.7.1.linux.x86_64.tar.xz shellcheck-v0.7.1 + # Downgrade gcc&&g++ diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 index 7696e0523dedc..8a557a588d55e 100644 --- a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 @@ -26,7 +26,11 @@ RUN rm -rf /temp_gcc82 && rm -rf /gcc-8.2.0.tar.xz && rm -rf /gcc-8.2.0 RUN apt-get update && \ apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ - xz-utils tk-dev libffi-dev liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev shellcheck + xz-utils tk-dev libffi-dev liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev + +RUN wget https://github.com/koalaman/shellcheck/releases/download/v0.7.1/shellcheck-v0.7.1.linux.x86_64.tar.xz -O shellcheck-v0.7.1.linux.x86_64.tar.xz && \ + tar -xf shellcheck-v0.7.1.linux.x86_64.tar.xz && cp shellcheck-v0.7.1/shellcheck /usr/bin/shellcheck && \ + rm -rf shellcheck-v0.7.1.linux.x86_64.tar.xz shellcheck-v0.7.1 # gcc8.2 RUN wget -q https://paddle-docker-tar.bj.bcebos.com/home/users/tianshuo/bce-python-sdk-0.8.27/gcc-8.2.0.tar.xz && \ From e450823b8b4542d16e8bc188c5a5c1d9fbe41538 Mon Sep 17 00:00:00 2001 From: WangXi Date: Thu, 22 Oct 2020 10:01:32 +0800 Subject: [PATCH 031/185] Fix nccl op test failed, test=develop (#28172) --- .../fluid/operators/nccl/nccl_op_test.cu.cc | 21 ++++++++----------- 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc index 216a277938fea..6c7fba8d4ac78 100644 --- a/paddle/fluid/operators/nccl/nccl_op_test.cu.cc +++ b/paddle/fluid/operators/nccl/nccl_op_test.cu.cc @@ -174,10 +174,11 @@ void NCCLTester::testNcclAllReduceOp() { 
result_tensor->Resize(kDims); auto *ct = result_tensor->mutable_data(cpu_place); - paddle::memory::Copy( - cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt, - recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs_[i])->stream()); + auto *dev_ctx = static_cast(dev_ctxs_[i]); + paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[i]), rt, + recv_tensor.numel() * sizeof(float), + dev_ctx->stream()); + dev_ctx->Wait(); for (int64_t j = 0; j < f::product(kDims); ++j) { ASSERT_NEAR(ct[j], expected_result, 1e-5); @@ -272,10 +273,10 @@ void NCCLTester::testNcclBcastOp() { result_tensor->Resize(kDims); auto *ct = result_tensor->mutable_data(cpu_place); - paddle::memory::Copy( - cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt, - recv_tensor.numel() * sizeof(float), - static_cast(dev_ctxs_[idx])->stream()); + auto *dev_ctx = static_cast(dev_ctxs_[idx]); + paddle::memory::Copy(cpu_place, ct, p::CUDAPlace(gpu_list_[idx]), rt, + recv_tensor.numel() * sizeof(float), dev_ctx->stream()); + dev_ctx->Wait(); for (int64_t j = 0; j < f::product(kDims); ++j) { ASSERT_NEAR(ct[j], result, 1e-5); @@ -288,13 +289,9 @@ TEST_F(NCCLTester, ncclInitOp) {} TEST_F(NCCLTester, ncclOp) { // Serial execution is required for the same nccl comm. - // ncclAllReduceOp with desc - // TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9367 testNcclReduceOp(); testNcclAllReduceOp(); - // ncclBcastOp with desc - // TODO(helin): https://github.com/PaddlePaddle/Paddle/issues/9540 testNcclBcastOp(); } From 68449d19a5ad0fc31220087f15ef438f213d58ed Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Thu, 22 Oct 2020 10:27:40 +0800 Subject: [PATCH 032/185] Update hapi predict interface (#28180) * update hapi predict interface * fix code style * fix docs * fix docs * fix docs * update docs * fix codes style * fix unittest * fix unittest * fix coverage --- .../tests/unittests/test_rnn_decode_api.py | 2 +- python/paddle/hapi/model.py | 44 +++++++++++-------- python/paddle/tests/test_model.py | 19 ++++++-- python/paddle/tests/test_pretrained_model.py | 4 +- python/paddle/tests/test_transforms.py | 6 +++ python/paddle/tests/test_vision_models.py | 4 +- python/paddle/vision/transforms/transforms.py | 2 +- 7 files changed, 52 insertions(+), 29 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py index 304e7cd9a5c32..da25bc8d1fbaf 100644 --- a/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py +++ b/python/paddle/fluid/tests/unittests/test_rnn_decode_api.py @@ -628,7 +628,7 @@ def _calc_output(self, place, mode="test", dygraph=True): model.prepare() if self.param_states: model.load(self.param_states, optim_state=None) - return model.test_batch(self.inputs) + return model.predict_batch(self.inputs) def check_output_with_place(self, place, mode="test"): dygraph_output = self._calc_output(place, mode, dygraph=True) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 4f36effe6dd62..ff962fb1c1d5c 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -261,7 +261,7 @@ def eval_batch(self, inputs, labels=None): self.mode = 'eval' return self._run(inputs, labels) - def test_batch(self, inputs): + def predict_batch(self, inputs): self.mode = 'test' return self._run(inputs, None) @@ -723,7 +723,7 @@ def eval_batch(self, inputs, labels=None): else: return metrics - def test_batch(self, inputs): + def predict_batch(self, inputs): 
self.model.network.eval() self.mode = 'test' inputs = [to_variable(x) for x in to_list(inputs)] @@ -894,10 +894,13 @@ def train_batch(self, inputs, labels=None): Run one training step on a batch of data. Args: - inputs (list): A list of numpy.ndarray, each is a batch of - input data. - labels (list): A list of numpy.ndarray, each is a batch of - input label. If has no labels, set None. Default is None. + inputs (numpy.ndarray|Tensor|list): Batch of input data. It could + be a numpy array or paddle.Tensor, or a list of arrays or + tensors (in case the model has multiple inputs). + labels (numpy.ndarray|Tensor|list): Batch of labels. It could be + a numpy array or paddle.Tensor, or a list of arrays or tensors + (in case the model has multiple labels). If has no labels, + set None. Default is None. Returns: A list of scalar training loss if the model has no metrics, @@ -941,10 +944,13 @@ def eval_batch(self, inputs, labels=None): Run one evaluating step on a batch of data. Args: - inputs (list): A list of numpy.ndarray, each is a batch of - input data. - labels (list): A list of numpy.ndarray, each is a batch of - input label. If has no labels, set None. Default is None. + inputs (numpy.ndarray|Tensor|list): Batch of input data. It could + be a numpy array or paddle.Tensor, or a list of arrays or + tensors (in case the model has multiple inputs). + labels (numpy.ndarray|Tensor|list): Batch of labels. It could be + a numpy array or paddle.Tensor, or a list of arrays or tensors + (in case the model has multiple labels). If has no labels, + set None. Default is None. Returns: A list of scalar testing loss if the model has no metrics, @@ -984,13 +990,14 @@ def eval_batch(self, inputs, labels=None): self._update_inputs() return loss - def test_batch(self, inputs): + def predict_batch(self, inputs): """ - Run one testing step on a batch of data. + Run one predicting step on a batch of data. Args: - inputs (list): A list of numpy.ndarray, each is a batch of - input data. + inputs (numpy.ndarray|Tensor|list): Batch of input data. It could + be a numpy array or paddle.Tensor, or a list of arrays or + tensors (in case the model has multiple inputs). 
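A short note on the relaxed signatures documented in this patch: train_batch, eval_batch, and the renamed predict_batch now accept a bare numpy array or Tensor as well as a list. A minimal sketch of the new calling conventions, assuming a toy one-layer network (the layer size is illustrative, not part of the change):

    import numpy as np
    import paddle
    from paddle.static import InputSpec

    net = paddle.nn.Linear(784, 10)
    model = paddle.Model(net, InputSpec([None, 784], 'float32', 'x'))
    model.prepare()
    batch = np.random.random((4, 784)).astype('float32')
    out = model.predict_batch(batch)                    # bare ndarray
    out = model.predict_batch(paddle.to_tensor(batch))  # single Tensor
    out = model.predict_batch([batch])                  # old list form still works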
Returns: A list of numpy.ndarray of predictions, that is the outputs @@ -1019,10 +1026,10 @@ def test_batch(self, inputs): model = paddle.Model(net, input, label) model.prepare() data = np.random.random(size=(4,784)).astype(np.float32) - out = model.test_batch([data]) + out = model.predict_batch([data]) print(out) """ - loss = self._adapter.test_batch(inputs) + loss = self._adapter.predict_batch(inputs) if fluid.in_dygraph_mode() and self._input_shapes is None: self._update_inputs() return loss @@ -1847,10 +1854,9 @@ def _run_one_epoch(self, data_loader, callbacks, mode, logs={}): logs[k] = v else: if self._inputs is not None: - outs = getattr(self, - mode + '_batch')(data[:len(self._inputs)]) + outs = self.predict_batch(data[:len(self._inputs)]) else: - outs = getattr(self, mode + '_batch')(data) + outs = self.predict_batch(data) outputs.append(outs) diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 3513f06234047..1cdb7e4e827a9 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -284,6 +284,17 @@ def predict(self, dynamic): fluid.disable_dygraph() if dynamic else None + def test_predict_without_inputs(self): + fluid.enable_dygraph(self.device) + model = Model(LeNet()) + model.prepare() + model.load(self.weight_path) + model._inputs = None + output = model.predict( + self.test_dataset, batch_size=64, stack_outputs=True) + np.testing.assert_equal(output[0].shape[0], len(self.test_dataset)) + fluid.disable_dygraph() + class MyModel(paddle.nn.Layer): def __init__(self): @@ -370,7 +381,7 @@ def get_expect(): inputs = [InputSpec([None, dim], 'float32', 'x')] model = Model(net, inputs) model.prepare() - out, = model.test_batch([data]) + out, = model.predict_batch([data]) np.testing.assert_allclose(out, ref, rtol=1e-6) fluid.disable_dygraph() if dynamic else None @@ -546,7 +557,7 @@ def test_export_deploy_model(self): np.random.random((1, 1, 28, 28)), dtype=np.float32) model.save(save_dir, training=False) - ori_results = model.test_batch(tensor_img) + ori_results = model.predict_batch(tensor_img) fluid.disable_dygraph() if dynamic else None place = fluid.CPUPlace() if not fluid.is_compiled_with_cuda( @@ -569,7 +580,7 @@ def test_dygraph_export_deploy_model_about_inputs(self): mnist_data = MnistDataset(mode='train') paddle.disable_static() # without inputs - for initial in ["fit", "train_batch", "eval_batch", "test_batch"]: + for initial in ["fit", "train_batch", "eval_batch", "predict_batch"]: save_dir = tempfile.mkdtemp() if not os.path.exists(save_dir): os.makedirs(save_dir) @@ -590,7 +601,7 @@ def test_dygraph_export_deploy_model_about_inputs(self): elif initial == "eval_batch": model.eval_batch([img], [label]) else: - model.test_batch([img]) + model.predict_batch([img]) model.save(save_dir, training=False) shutil.rmtree(save_dir) diff --git a/python/paddle/tests/test_pretrained_model.py b/python/paddle/tests/test_pretrained_model.py index a36dd75549a9e..b24b51555c581 100644 --- a/python/paddle/tests/test_pretrained_model.py +++ b/python/paddle/tests/test_pretrained_model.py @@ -40,10 +40,10 @@ def infer(self, arch): if dygraph: model.save(path) - res['dygraph'] = model.test_batch(x) + res['dygraph'] = model.predict_batch(x) else: model.load(path) - res['static'] = model.test_batch(x) + res['static'] = model.predict_batch(x) if not dygraph: paddle.disable_static() diff --git a/python/paddle/tests/test_transforms.py b/python/paddle/tests/test_transforms.py index ac21f8a6192c4..978200fd531c5 100644 --- 
a/python/paddle/tests/test_transforms.py +++ b/python/paddle/tests/test_transforms.py @@ -205,6 +205,12 @@ def test_to_tensor(self): assert isinstance(tensor, paddle.Tensor) np.testing.assert_equal(tensor.shape, (3, 50, 100)) + def test_keys(self): + fake_img1 = self.create_image((200, 150, 3)) + fake_img2 = self.create_image((200, 150, 3)) + trans_pad = transforms.Pad(10, keys=("image", )) + fake_img_padded = trans_pad((fake_img1, fake_img2)) + def test_exception(self): trans = transforms.Compose([transforms.Resize(-1)]) diff --git a/python/paddle/tests/test_vision_models.py b/python/paddle/tests/test_vision_models.py index 6489b02615bb9..5f35a1e0e5a4b 100644 --- a/python/paddle/tests/test_vision_models.py +++ b/python/paddle/tests/test_vision_models.py @@ -33,7 +33,7 @@ def models_infer(self, arch, pretrained=False, batch_norm=False): model = paddle.Model(net, input) model.prepare() - model.test_batch(x) + model.predict_batch(x) def test_mobilenetv2_pretrained(self): self.models_infer('mobilenet_v2', pretrained=False) @@ -77,7 +77,7 @@ def test_lenet(self): lenet.prepare() x = np.array(np.random.random((2, 1, 28, 28)), dtype=np.float32) - lenet.test_batch(x) + lenet.predict_batch(x) if __name__ == '__main__': diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 9079f91aac9fa..06f3f231ef3d2 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -272,7 +272,7 @@ def __call__(self, inputs): else: outputs.append(apply_func(inputs[i])) if len(inputs) > len(self.keys): - outputs.extend(input[len(self.keys):]) + outputs.extend(inputs[len(self.keys):]) if len(outputs) == 1: outputs = outputs[0] From e73051609075ed273b283668015d45d1b027151e Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 22 Oct 2020 11:30:04 +0800 Subject: [PATCH 033/185] [Dy2stat] Refine return mechanism in @to_static (#28116) * remove some judgement * fix len(outputs) == 1 --- .../dygraph_to_static/program_translator.py | 8 +- .../dygraph_to_static/test_return.py | 100 +++++++++++++----- 2 files changed, 76 insertions(+), 32 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 2ff3fe833d66d..6d9bfc909a1bb 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -606,9 +606,11 @@ def from_func_spec(func_spec, input_spec, class_instance): error.attach_error_data(e) raise - if not isinstance(outputs, - (tuple, list)) and outputs is not None: - outputs = [outputs] + if outputs is not None: + need_wrap_into_list = not isinstance(outputs, ( + tuple, list)) or len(outputs) == 1 + if need_wrap_into_list: + outputs = [outputs] main_program = update_op_callstack_with_origin_info(main_program) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py index 1f4f82146645d..f592b7ed24461 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_return.py @@ -18,8 +18,8 @@ import numpy as np import paddle.fluid as fluid import paddle.fluid.core as core -from paddle.fluid.dygraph import declarative -from paddle.fluid.dygraph import ProgramTranslator +from paddle.jit import to_static +from paddle.jit import ProgramTranslator 
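The program_translator change above wraps a function's outputs into a list when the return value is a single object or a one-element sequence, which keeps the eager and transformed modes consistent. A minimal sketch of the parity that the new tests below assert (the input shape and dtype mirror the test fixture):

    import numpy as np
    import paddle.fluid as fluid
    from paddle.jit import to_static, ProgramTranslator

    @to_static
    def return_one_in_list(x):
        x = fluid.dygraph.to_variable(x)
        return [x + 1]  # one-element list return

    prog_trans = ProgramTranslator()
    with fluid.dygraph.guard():
        for use_static in (False, True):
            prog_trans.enable(use_static)
            res = return_one_in_list(np.ones((1)).astype('int32'))
            print([r.numpy() for r in res])  # same result in both modes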
from ifelse_simple_func import dyfunc_with_if_else @@ -27,13 +27,13 @@ np.random.seed(SEED) -@declarative +@to_static def test_return_base(x): x = fluid.dygraph.to_variable(x) return x -@declarative +@to_static def test_inside_func_base(x): x = fluid.dygraph.to_variable(x) @@ -43,7 +43,7 @@ def inner_func(x): return inner_func(x) -@declarative +@to_static def test_return_if(x): x = fluid.dygraph.to_variable(x) if x < 0: @@ -53,7 +53,7 @@ def test_return_if(x): return x -@declarative +@to_static def test_return_if_else(x): x = fluid.dygraph.to_variable(x) if x > 0: @@ -66,7 +66,7 @@ def test_return_if_else(x): x -= 8888 # useless statement to test our code can handle it. -@declarative +@to_static def test_return_in_while(x): x = fluid.dygraph.to_variable(x) i = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0) @@ -79,7 +79,7 @@ def test_return_in_while(x): return x -@declarative +@to_static def test_return_in_for(x): x = fluid.dygraph.to_variable(x) for i in range(10): @@ -91,13 +91,13 @@ def test_return_in_for(x): return x - 1 -@declarative +@to_static def test_recursive_return(x): x = fluid.dygraph.to_variable(x) return dyfunc_with_if_else(x) -@declarative +@to_static def test_return_different_length_if_body(x): x = fluid.dygraph.to_variable(x) y = x + 1 @@ -108,7 +108,7 @@ def test_return_different_length_if_body(x): return x -@declarative +@to_static def test_return_different_length_else(x): x = fluid.dygraph.to_variable(x) y = x + 1 @@ -119,13 +119,13 @@ def test_return_different_length_else(x): return x -@declarative +@to_static def test_no_return(x): x = fluid.dygraph.to_variable(x) y = x + 1 -@declarative +@to_static def test_return_none(x): x = fluid.dygraph.to_variable(x) y = x + 1 @@ -136,7 +136,7 @@ def test_return_none(x): return x, y -@declarative +@to_static def test_return_no_variable(x): x = fluid.dygraph.to_variable(x) y = x + 1 @@ -147,6 +147,38 @@ def test_return_no_variable(x): return +@to_static +def test_return_list_one_value(x): + x = fluid.dygraph.to_variable(x) + x += 1 + return [x] + + +@to_static +def test_return_list_many_values(x): + x = fluid.dygraph.to_variable(x) + x += 1 + y = x * 2 + z = x * x + return [x, y, z] + + +@to_static +def test_return_tuple_one_value(x): + x = fluid.dygraph.to_variable(x) + x += 1 + return (x, ) + + +@to_static +def test_return_tuple_many_values(x): + x = fluid.dygraph.to_variable(x) + x += 1 + y = x * 2 + z = x * x + return (x, y, z) + + class TestReturnBase(unittest.TestCase): def setUp(self): self.input = np.ones((1)).astype('int32') @@ -158,29 +190,19 @@ def setUp(self): def init_dygraph_func(self): self.dygraph_func = test_return_base - def run_dygraph_mode(self): - self.program_translator.enable(False) + def _run(self, to_static=False): + self.program_translator.enable(to_static) with fluid.dygraph.guard(): res = self.dygraph_func(self.input) - if isinstance(res, (tuple)): - return tuple(r.numpy() for r in res) - elif isinstance(res, core.VarBase): - return res.numpy() - return res - - def run_static_mode(self): - self.program_translator.enable(True) - with fluid.dygraph.guard(): - res = self.dygraph_func(self.input) - if isinstance(res, tuple): + if isinstance(res, (tuple, list)): return tuple(r.numpy() for r in res) elif isinstance(res, core.VarBase): return res.numpy() return res def test_transformed_static_result(self): - dygraph_res = self.run_dygraph_mode() - static_res = self.run_static_mode() + dygraph_res = self._run(to_static=False) + static_res = self._run(to_static=True) if isinstance(dygraph_res, 
tuple): self.assertTrue(isinstance(static_res, tuple)) self.assertEqual(len(dygraph_res), len(static_res)) @@ -255,5 +277,25 @@ def init_dygraph_func(self): self.dygraph_func = test_return_no_variable +class TestReturnListOneValue(TestReturnBase): + def init_dygraph_func(self): + self.dygraph_func = test_return_list_one_value + + +class TestReturnListManyValue(TestReturnBase): + def init_dygraph_func(self): + self.dygraph_func = test_return_list_many_values + + +class TestReturnTupleOneValue(TestReturnBase): + def init_dygraph_func(self): + self.dygraph_func = test_return_tuple_one_value + + +class TestReturnTupleManyValue(TestReturnBase): + def init_dygraph_func(self): + self.dygraph_func = test_return_tuple_many_values + + if __name__ == '__main__': unittest.main() From 1f3be859141649924b2bcd623a7bed738ea810ca Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Thu, 22 Oct 2020 01:31:00 -0500 Subject: [PATCH 034/185] Fix bug of fetch_async_op_handle when fetching the feed variable (#28194) * fix bug of fetch_async_op_handle * revert some changes of test_buffer_shared_memory_reuse_pass * revert some changes of test_buffer_shared_memory_reuse_pass --- .../details/fetch_async_op_handle.cc | 4 ++- .../fluid/framework/details/op_handle_base.cc | 30 +++++++++++++++++-- .../fluid/framework/details/op_handle_base.h | 9 ++++-- paddle/fluid/memory/memcpy.cc | 13 ++++++++ .../test_buffer_shared_memory_reuse_pass.py | 18 ++--------- 5 files changed, 51 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/framework/details/fetch_async_op_handle.cc b/paddle/fluid/framework/details/fetch_async_op_handle.cc index 09aedafc6bb2e..98cae9f9e5bce 100644 --- a/paddle/fluid/framework/details/fetch_async_op_handle.cc +++ b/paddle/fluid/framework/details/fetch_async_op_handle.cc @@ -13,8 +13,10 @@ // limitations under the License. #include "paddle/fluid/framework/details/fetch_async_op_handle.h" + #include #include + #include "paddle/fluid/platform/profiler.h" namespace paddle { @@ -195,7 +197,7 @@ void FetchAsyncOpHandle::FetchMergedLodTensor( void FetchAsyncOpHandle::RunImpl() { platform::RecordEvent record_event(Name()); - WaitInputVarGenerated(); + WaitInputVarGenerated(true); // get src vars auto &scopes = *local_exec_scopes_; diff --git a/paddle/fluid/framework/details/op_handle_base.cc b/paddle/fluid/framework/details/op_handle_base.cc index 105c37192f57c..22b7bd17fe429 100644 --- a/paddle/fluid/framework/details/op_handle_base.cc +++ b/paddle/fluid/framework/details/op_handle_base.cc @@ -143,7 +143,7 @@ void OpHandleBase::AddOutput(VarHandleBase *out) { out->AddInput(this, this->Node()); } -void OpHandleBase::WaitInputVarGenerated() { +void OpHandleBase::WaitInputVarGenerated(bool wait_for_feed) { for (auto in_var : inputs_) { if (NeedWait(in_var)) { // Dummy Variable is used to represent dependencies between operators, so // there doesn't add event for it. @@ -165,6 +165,30 @@ void OpHandleBase::WaitInputVarGenerated() { } // There are nothing to do when the place is CPUPlace. } + } else { + // NOTE(zhiqiu): Special case when using fetch_async_op_handle may lead to + // nondeterminism due to parallel execution of cuda memory operation.
Eg: + // execute stream: CPU->GPU copy (feed) + // fetch stream: GPU->CUDAPinned (fetch) + if (in_var && wait_for_feed) { + auto *in_var_handle = dynamic_cast(in_var); + if (in_var_handle) { + auto &place = in_var_handle->place(); + if (platform::is_gpu_place(place)) { +#ifdef PADDLE_WITH_CUDA + platform::DeviceContextPool &pool = + platform::DeviceContextPool::Instance(); + auto stream = + static_cast(pool.Get(place)) + ->stream(); + PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Not compiled with CUDA.")); +#endif + } + } + } } } } @@ -172,8 +196,8 @@ void OpHandleBase::WaitInputVarGenerated() { void OpHandleBase::WaitInputVarGenerated(const platform::Place &place) { for (auto in_var : inputs_) { if (NeedWait(in_var)) { - // Dummy Variable is used to represent dependencies between operators, so - // there doesn't add event for it. + // Dummy Variable is used to represent dependencies between operators, + // so there doesn't add event for it. auto *in_var_handle = dynamic_cast(in_var); if (in_var_handle) { if (platform::is_gpu_place(in_var_handle->place())) { diff --git a/paddle/fluid/framework/details/op_handle_base.h b/paddle/fluid/framework/details/op_handle_base.h index eb3d9c32ffc1f..37e18adf9da9e 100644 --- a/paddle/fluid/framework/details/op_handle_base.h +++ b/paddle/fluid/framework/details/op_handle_base.h @@ -81,12 +81,15 @@ class OpHandleBase { // This method adds the wait events of all the input on all the device // context. - // NODE: This Wait is asynchronous operation. - virtual void WaitInputVarGenerated(); + // NOTE: This Wait is asynchronous operation. + // NOTE: wait_for_feed is added to wait for feed var, since it has + // generated op, no event and cannot perform event wait. It is only + // used in fetch_async_op_handle currently. + virtual void WaitInputVarGenerated(bool wait_for_feed = false); // This method adds the wait events of all the input on the specified device // context. - // NODE: This Wait is asynchronous operation. + // NOTE: This Wait is asynchronous operation. virtual void WaitInputVarGenerated(const platform::Place &place); virtual bool NeedWait(VarHandleBase *in_var); diff --git a/paddle/fluid/memory/memcpy.cc b/paddle/fluid/memory/memcpy.cc index 225b6858cc1f2..8a04f74c6de82 100644 --- a/paddle/fluid/memory/memcpy.cc +++ b/paddle/fluid/memory/memcpy.cc @@ -15,6 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/memcpy.h" #include // for memcpy + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/profiler.h" @@ -267,6 +268,8 @@ void Copy( const void* src, size_t num, cudaStream_t stream) { if (UNLIKELY(num == 0)) return; + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; if (dst_place == src_place) { platform::SetDeviceId(src_place.device); if (stream) { @@ -293,6 +296,8 @@ template <> void Copy( platform::CPUPlace dst_place, void* dst, platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place; if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -301,6 +306,8 @@ template <> void Copy( platform::CUDAPinnedPlace dst_place, void* dst, platform::CPUPlace src_place, const void* src, size_t num) { + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place; if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -309,6 +316,8 @@ template <> void Copy( platform::CUDAPinnedPlace dst_place, void* dst, platform::CUDAPinnedPlace src_place, const void* src, size_t num) { + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place; if (UNLIKELY(num == 0)) return; std::memcpy(dst, src, num); } @@ -320,6 +329,8 @@ void Copy( cudaStream_t stream) { if (UNLIKELY(num == 0)) return; platform::SetDeviceId(src_place.device); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:GPU->CUDAPinned"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyDeviceToHost, stream); @@ -337,6 +348,8 @@ void Copy( if (UNLIKELY(num == 0)) return; platform::SetDeviceId(dst_place.device); + VLOG(4) << "memory::Copy " << num << " Bytes from " << src_place << " to " + << dst_place << " by stream(" << stream << ")"; if (stream) { platform::RecordEvent record_event("GpuMemcpyAsync:CUDAPinned->GPU"); platform::GpuMemcpyAsync(dst, src, num, cudaMemcpyHostToDevice, stream); diff --git a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py index 4b1a54d3c66a1..546124bbee899 100644 --- a/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py +++ b/python/paddle/fluid/tests/unittests/test_buffer_shared_memory_reuse_pass.py @@ -34,7 +34,6 @@ class InplaceTestBase(unittest.TestCase): def initParameter(self): self.use_cuda = True self.fuse_all_optimizer_ops = False - self.fuse_all_reduce_ops = False def setUp(self): paddle.enable_static() @@ -94,7 +93,6 @@ def check_single_card_fetch_var(self): build_strategy.memory_optimize = memory_optimize build_strategy.enable_inplace = enable_inplace build_strategy.fuse_all_optimizer_ops = self.fuse_all_optimizer_ops - build_strategy.fuse_all_reduce_ops = self.fuse_all_reduce_ops compiled_prog = fluid.CompiledProgram(prog).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy, @@ -117,15 +115,7 @@ def check_single_card_fetch_var(self): fetch_val2, = exe.run(compiled_prog, feed=feed_dict, fetch_list=[fetch_var]) -#NOTE(zhiqiu): Temporally changed from array_equal to allclose. - # The real root is fuse_all_reduce and fuse_all_optimizer_opss may - # result in diff because of the instruction set on the virtual machine.
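The comment block being deleted here (it continues just below) attributed the looser allclose check to the fuse passes; with the feed/fetch race fixed, the test can go back to exact equality. A hedged sketch of the scenario the fix covers, fetching a variable that is fed directly; the program is illustrative only and assumes a CUDA device:

    import numpy as np
    import paddle.fluid as fluid

    x = fluid.data(name='x', shape=[2, 3], dtype='float32')
    y = x * 2.0
    exe = fluid.Executor(fluid.CUDAPlace(0))
    exe.run(fluid.default_startup_program())
    compiled = fluid.CompiledProgram(
        fluid.default_main_program()).with_data_parallel()
    feed_x = np.ones([2, 3], dtype=np.float32)
    # The GPU->CUDAPinned fetch of x must now wait for the CPU->GPU feed
    # copy, so the fetched value matches the fed value on every run.
    out_x, out_y = exe.run(compiled, feed={'x': feed_x}, fetch_list=[x, y])
    assert np.array_equal(out_x, feed_x)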
- # And the related unit tests: test_fuse_all_reduce_pass and test_fuse_optimizer_pass use "almostEqual" in their checks. - # There are also some related issues: - # https://github.com/PaddlePaddle/Paddle/issues/21270 - # https://github.com/PaddlePaddle/Paddle/issues/21046 - # https://github.com/PaddlePaddle/Paddle/issues/21045 - self.assertTrue(np.allclose(fetch_val1, fetch_val2)) + self.assertTrue(np.array_equal(fetch_val1, fetch_val2)) def check_multi_card_fetch_var(self): if self.is_invalid_test(): @@ -148,7 +138,6 @@ def check_multi_card_fetch_var(self): build_strategy.memory_optimize = memory_optimize build_strategy.enable_inplace = enable_inplace build_strategy.fuse_all_optimizer_ops = self.fuse_all_optimizer_ops - build_strategy.fuse_all_reduce_ops = self.fuse_all_reduce_ops compiled_program = fluid.CompiledProgram( prog).with_data_parallel( loss_name=loss.name, @@ -170,15 +159,13 @@ def check_multi_card_fetch_var(self): fetch_vals.append(fetch_val) for item in fetch_vals: - # save above - self.assertTrue(np.allclose(fetch_vals[0], item)) + self.assertTrue(np.array_equal(fetch_vals[0], item)) class CUDAInplaceTest(InplaceTestBase): def initParameter(self): self.use_cuda = True self.fuse_all_optimizer_ops = False - self.fuse_all_reduce_ops = False def test_multi_card_fetch_var(self): self.check_multi_card_fetch_var() @@ -191,7 +178,6 @@ class CPUInplaceTest(InplaceTestBase): def initParameter(self): self.use_cuda = False self.fuse_all_optimizer_ops = False - self.fuse_all_reduce_ops = False def test_multi_card_fetch_var(self): self.check_multi_card_fetch_var() From 271ee58f5cb6cd84aad3fd1887b18b2fa11c9aa4 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Thu, 22 Oct 2020 14:49:46 +0800 Subject: [PATCH 035/185] Enhance build detection (#28123) * fix optimizer init * Enhance the detection of whether to keep the build directory * Enhance the detection of whether to keep the build directory --- paddle/scripts/paddle_build.bat | 59 +++++++++++++++++++++++---------- 1 file changed, 42 insertions(+), 17 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 8b1377415d481..207651b0f23f3 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -21,6 +21,7 @@ SETLOCAL rem -------clean up environment----------- set work_dir=%cd% +set cache_dir=%work_dir:Paddle=cache% taskkill /f /im op_function_generator.exe wmic process where name="op_function_generator.exe" call terminate @@ -36,9 +37,9 @@ if not defined WITH_PYTHON set WITH_PYTHON=ON if not defined ON_INFER set ON_INFER=ON if not defined WITH_INFERENCE_API_TEST set WITH_INFERENCE_API_TEST=ON if not defined WITH_STATIC_LIB set WITH_STATIC_LIB=ON -if not defined WITH_CACHE set WITH_CACHE=ON +if not defined WITH_CACHE set WITH_CACHE=OFF if not defined WITH_TPCACHE set WITH_TPCACHE=ON - +set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo rem -------set cache build work directory----------- rmdir build\python /s/q @@ -47,16 +48,41 @@ if "%WITH_CACHE%"=="OFF" ( goto :mkbuild ) +set error_code=0 +type %cache_dir%\error_code.txt +set /p error_code=< %cache_dir%\error_code.txt +if %error_code% NEQ 0 ( + rmdir build /s/q + goto :mkbuild +) + +git show-ref --verify --quiet refs/heads/last_pr +if %ERRORLEVEL% EQU 0 ( + git diff HEAD last_pr --stat --name-only + git diff HEAD last_pr --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" + if %ERRORLEVEL% EQU 0 ( + rmdir build /s/q + ) + git branch -D last_pr + git 
branch last_pr +) else ( + rmdir build /s/q + git branch last_pr +) + for /F %%# in ('wmic os get localdatetime^|findstr 20') do set datetime=%%# set day_now=%datetime:~6,2% set day_before=-1 -set /p day_before=< %work_dir%\..\day.txt +set /p day_before=< %cache_dir%\day.txt if %day_now% NEQ %day_before% ( - echo %day_now% > %work_dir%\..\day.txt - type %work_dir%\..\day.txt + echo %day_now% > %cache_dir%\day.txt + type %cache_dir%\day.txt rmdir build /s/q + goto :mkbuild ) -git diff origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" + +git diff HEAD origin/develop --stat --name-only +git diff HEAD origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" if %ERRORLEVEL% EQU 0 ( rmdir build /s/q ) @@ -67,6 +93,7 @@ if not exist build ( ) cd /d build dir . +dir %cache_dir% dir paddle\fluid\pybind\Release rem ------initialize the python environment------ @@ -107,10 +134,6 @@ clcache.exe -M 21474836480 rem ------set cache third_party------ -set cache_dir=%work_dir:Paddle=cache% -dir %cache_dir% -set INFERENCE_DEMO_INSTALL_DIR=%cache_dir:\=/%/inference_demo - if not exist %cache_dir%\tools ( git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools ) @@ -194,7 +217,7 @@ cmake .. -G "Visual Studio 14 2015 Win64" -DWITH_AVX=%WITH_AVX% -DWITH_GPU=%WITH goto:eof :cmake_error -call paddle_winci\Scripts\deactivate.bat 2>NUL +echo 7 > %cache_dir%\error_code.txt echo Cmake failed, will exit! exit /b 7 @@ -239,7 +262,7 @@ echo Build Paddle successfully! goto:eof :build_error -call paddle_winci\Scripts\deactivate.bat 2>NUL +echo 7 > %cache_dir%\error_code.txt echo Build Paddle failed, will exit! exit /b 7 @@ -257,6 +280,7 @@ call :timestamp "%start%" "%end%" "Build" tree /F %cd%\paddle_inference_install_dir\paddle %cache_dir%\tools\busybox64.exe du -h -d 0 -k %cd%\paddle_inference_install_dir\paddle\lib > lib_size.txt set /p libsize=< lib_size.txt +@ECHO OFF for /F %%i in ("%libsize%") do ( set /a libsize_m=%%i/1024 echo "Windows Paddle_Inference Size: !libsize_m!M" @@ -267,6 +291,7 @@ for /F %%i in ("%whlsize%") do echo "Windows PR whl Size: %%i" dir /s /b python\dist\*.whl > whl_file.txt set /p PADDLE_WHL_FILE_WIN=< whl_file.txt +@ECHO ON pip uninstall -y paddlepaddle pip uninstall -y paddlepaddle-gpu pip install -U %PADDLE_WHL_FILE_WIN% --user @@ -280,7 +305,7 @@ python %work_dir%\paddle\scripts\installation_validate.py goto:eof :test_whl_pacakage_error -call paddle_winci\Scripts\deactivate.bat 2>NUL +echo 1 > %cache_dir%\error_code.txt echo Test import paddle failed, will exit! exit /b 1 @@ -315,7 +340,7 @@ ctest.exe -E "(%disable_ut_quickly%)" --output-on-failure -C Release -j 8 --repe goto:eof :unit_test_error -call paddle_winci\Scripts\deactivate.bat 2>NUL +echo 8 > %cache_dir%\error_code.txt for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%# set end=%end:~4,10% call :timestamp "%start%" "%end%" "1 card TestCases Total" @@ -339,7 +364,7 @@ cd %work_dir%\paddle\fluid\inference\api\demo_ci goto:eof :test_inference_error -call paddle_winci\Scripts\deactivate.bat 2>NUL +echo 1 > %cache_dir%\error_code.txt echo Testing fluid library for inference failed! 
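Stepping back from the batch hunks for a moment, the build-cache rules this patch adds reduce to three triggers. A plain-Python restatement (all names here are illustrative; the real logic lives in the batch script above):

    def should_wipe_build(last_error_code, day_now, day_before, changed_files):
        # 1. the previous build failed, so its cache may be poisoned
        if last_error_code != 0:
            return True
        # 2. the first build of a new day starts from scratch
        if day_now != day_before:
            return True
        # 3. any build-system change since the last PR invalidates the cache
        return any('cmake' in f.lower() or f.endswith('paddle_build.bat')
                   for f in changed_files)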
exit /b 1

@@ -418,7 +443,7 @@ echo git checkout -f origin_pr >> check_change_of_unittest.sh
goto:eof

:check_change_of_unittest_error
-call paddle_winci\Scripts\deactivate.bat 2>NUL
+echo 1 > %cache_dir%\error_code.txt
exit /b 1


@@ -476,7 +501,7 @@ taskkill /f /im cvtres.exe 2>NUL
taskkill /f /im rc.exe 2>NUL
wmic process where name="op_function_generator.exe" call terminate 2>NUL
taskkill /f /im python.exe 2>NUL
-taskkill /f /im python.exe 2>NUL
+echo 0 > %cache_dir%\error_code.txt
echo Windows CI run successfully!
exit /b 0


From efe6e2840c6a043005e35a28394685011f69ca5b Mon Sep 17 00:00:00 2001
From: Feiyu Chan 
Date: Thu, 22 Oct 2020 01:53:47 -0500
Subject: [PATCH 036/185] fix strided_slice_op's GetExpectedKernelType (#28192)

* fix strided_slice_op's GetExpectedKernelType when input tensor is at CUDAPinnedPlace

* add unittest for tensors in cuda pinned place

* skip test for cuda pinned place on cpu machines
---
 paddle/fluid/operators/strided_slice_op.cc         |  7 ++++++-
 .../fluid/tests/unittests/test_strided_slice_op.py | 11 +++++++++++
 2 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/strided_slice_op.cc b/paddle/fluid/operators/strided_slice_op.cc
index f8e5d9171087c..94a0576b77230 100644
--- a/paddle/fluid/operators/strided_slice_op.cc
+++ b/paddle/fluid/operators/strided_slice_op.cc
@@ -154,9 +154,14 @@ class StridedSliceOp : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
+    // NOTE: cuda pinned tensor need to copy its data to target place
+    auto in_tensor = ctx.Input<Tensor>("Input");
+    if (platform::is_cuda_pinned_place(in_tensor->place())) {
+      return framework::OpKernelType(in_tensor->type(), ctx.device_context());
+    }
     return framework::OpKernelType(
         OperatorWithKernel::IndicateVarDataType(ctx, "Input"),
-        ctx.Input<Tensor>("Input")->place());
+        in_tensor->place());
  }
  framework::OpKernelType GetKernelTypeForVar(
      const std::string &var_name, const Tensor &tensor,
diff --git a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
index 0fe6cd5e7e753..71550c8f24753 100644
--- a/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_strided_slice_op.py
@@ -511,6 +511,17 @@ def test_dygraph_op(self):
             x, axes=axes, starts=starts, ends=ends, strides=strides_1)
         assert sliced_1.shape == (3, 2, 2, 2)

+    @unittest.skipIf(not paddle.is_compiled_with_cuda(),
+                     "Cannot use CUDAPinnedPlace in CPU only version")
+    def test_cuda_pinned_place(self):
+        with paddle.fluid.dygraph.guard():
+            x = paddle.to_tensor(
+                np.random.randn(2, 10), place=paddle.CUDAPinnedPlace())
+            self.assertTrue(x.place.is_cuda_pinned_place())
+            y = x[:, ::2]
+            self.assertFalse(x.place.is_cuda_pinned_place())
+            self.assertFalse(y.place.is_cuda_pinned_place())
+

 if __name__ == "__main__":
     unittest.main()

From 2db77be42397296e032bbd92614b9ad5f571242a Mon Sep 17 00:00:00 2001
From: Double_V 
Date: Thu, 22 Oct 2020 14:57:11 +0800
Subject: [PATCH 037/185] fix wrong data type, test=develop (#28203)

---
 paddle/fluid/operators/roi_align_op_xpu.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/operators/roi_align_op_xpu.cc b/paddle/fluid/operators/roi_align_op_xpu.cc
index 75bd94142e6b7..699cc7b84a4e6 100644
--- a/paddle/fluid/operators/roi_align_op_xpu.cc
+++ b/paddle/fluid/operators/roi_align_op_xpu.cc
@@ -62,7 +62,7 @@ class XPUROIAlignOpKernel : public framework::OpKernel<T> {
   } else {
     auto _rois_lod = rois->lod().back();
     rois_batch_size = _rois_lod.size() - 1;
-    for (int n = 0; n < _rois_lod.size(); ++n) {
+    for (int n = 0; n < static_cast<int>(_rois_lod.size()); ++n) {
       rois_lod[n] = _rois_lod[n];
     }
     PADDLE_ENFORCE_EQ(

From 11acbfae06c20ec5309833bdcc36a1b1bf462f7d Mon Sep 17 00:00:00 2001
From: WangXi 
Date: Thu, 22 Oct 2020 16:38:14 +0800
Subject: [PATCH 038/185] refine auto strategy, test=document_fix (#28211)

---
 python/paddle/distributed/fleet/base/distributed_strategy.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index c7798b15c67fe..847050b404f01 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -1073,8 +1073,12 @@ def auto(self):
             import paddle
             import paddle.distributed.fleet as fleet

+            paddle.enable_static()
+
             strategy = fleet.DistributedStrategy()
             strategy.auto = True
+            # if other strategies are set at the same time, auto will not apply
+            # strategy.amp = True

             optimizer = paddle.optimizer.SGD(learning_rate=0.01)
             optimizer = fleet.distributed_optimizer(optimizer, strategy)

From 23b0190bd26944142f5e750d4a1db72aafbe5722 Mon Sep 17 00:00:00 2001
From: pangyoki 
Date: Thu, 22 Oct 2020 17:37:16 +0800
Subject: [PATCH 039/185] add xpu white_list, test=kunlun (#28210)

---
 tools/static_mode_white_list.py | 20 ++++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py
index 05e931a9a25ef..defa4f13495d2 100644
--- a/tools/static_mode_white_list.py
+++ b/tools/static_mode_white_list.py
@@ -653,4 +653,24 @@
     'test_sync_batch_norm_op',
     'test_multiprocess_dataloader_iterable_dataset_static',
     'test_multiprocess_dataloader_static',
+    'test_load_op_xpu',
+    'test_activation_op_xpu',
+    'test_adam_op_xpu',
+    'test_assign_op_xpu',
+    'test_batch_norm_op_xpu',
+    'test_cast_op_xpu',
+    'test_concat_op_xpu',
+    'test_elementwise_add_op_xpu',
+    'test_fill_constant_op_xpu',
+    'test_gather_op_xpu',
+    'test_matmul_op_xpu',
+    'test_matmul_v2_op_xpu',
+    'test_mean_op_xpu',
+    'test_momentum_op_xpu',
+    'test_reduce_mean_op_xpu',
+    'test_reduce_sum_op_xpu',
+    'test_reshape2_op_xpu',
+    'test_sgd_op_xpu',
+    'test_shape_op_xpu',
+    'test_slice_op_xpu',
 ]

From d835118dbdb78dbafef651a4be432311333f153a Mon Sep 17 00:00:00 2001
From: tianshuo78520a <707759223@qq.com>
Date: Fri, 23 Oct 2020 09:52:46 +0800
Subject: [PATCH 040/185] Hide log message (#28220)

---
 paddle/scripts/paddle_build.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 016188057ad3c..711d6564cfa63 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -522,9 +522,11 @@ function run_mac_test() {
 EOF
    #remove proxy here to fix dist ut 'test_fl_listen_and_serv_op' error on mac.
    #see details: https://github.com/PaddlePaddle/Paddle/issues/24738
+    set +x
    my_proxy=$http_proxy
    export http_proxy=
    export https_proxy=
+    set -x

    set +ex
    if [ "$1" == "cp27-cp27m" ]; then
@@ -601,8 +603,10 @@ EOF
        echo "Mac testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s"
        paddle version
        # Recovery proxy to avoid failure in later steps
+        set +x
        export http_proxy=$my_proxy
        export https_proxy=$my_proxy
+        set -x
        if [ "$mactest_error" != 0 ];then
            if [[ "$failed_test_lists" == "" ]]; then
                echo "========================================"

From a1e7fd4a139bd95abc29960ef97e4369af1a1f4c Mon Sep 17 00:00:00 2001
From: Huihuang Zheng 
Date: Fri, 23 Oct 2020 10:31:02 +0800
Subject: [PATCH 041/185] Fix test_parallel_executor_test_while_train Random Failure by Decreasing GPU Usage (#28213)

Recently, test_parallel_executor_test_while_train randomly failed on CI. All CI logs showed that either NCCL initialization or cusolver initialization failed. I found online that those failures are usually caused by a GPU memory shortage. Those APIs call CUDA directly, so it shouldn't be a problem in the allocator; something in PaddlePaddle may be increasing GPU usage.

However, I ran this test 1000 times on both my machine and the CI machine, and neither could reproduce the random failure. Maybe something related to the environment happens only in the test env.

To verify my assumption that something in PaddlePaddle increases GPU usage, and also to fix this CI, I decreased the batch_size to see whether the random failure disappears in the test env.
---
 .../tests/unittests/test_parallel_executor_test_while_train.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
index fd47dc37e7694..76d93259a647e 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
@@ -36,7 +36,7 @@ def check_network_convergence(self, use_cuda, build_strategy=None):
             opt = fluid.optimizer.SGD(learning_rate=0.001)
             opt.minimize(loss)

-            batch_size = 32
+            batch_size = 16
             image = np.random.normal(size=(batch_size, 784)).astype('float32')
             label = np.random.randint(0, 10, (batch_size, 1), dtype="int64")

From 4ea23307598206c7d1bd0f6a8dcb10997b399d1b Mon Sep 17 00:00:00 2001
From: lidanqing 
Date: Fri, 23 Oct 2020 05:00:25 +0200
Subject: [PATCH 042/185] use FLAGS_use_mkldnn to prevent unnecessary attrs copy (#28146)

---
 paddle/fluid/imperative/prepared_operator.cc | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/paddle/fluid/imperative/prepared_operator.cc b/paddle/fluid/imperative/prepared_operator.cc
index 4e0e95dd01297..c58b1e9596f6c 100644
--- a/paddle/fluid/imperative/prepared_operator.cc
+++ b/paddle/fluid/imperative/prepared_operator.cc
@@ -20,6 +20,8 @@
 #include "paddle/fluid/imperative/infer_shape_context.h"
 #include "paddle/fluid/imperative/infer_var_type_context.h"

+DECLARE_bool(use_mkldnn);
+
 namespace paddle {
 namespace imperative {

@@ -91,8 +93,10 @@ PreparedOp PrepareOpImpl(const NameVarMap<VarType>& ins,
  // MKLDNN variant of code reads attributes in some of GetKernelTypeForVar and
  // GetKernelType functions, so we need to copy the attributes there.
  // Const qualifier of Attrs had to be discarded to overwrite it.
-  auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
-  mutable_op_attrs = attrs;
+  if (FLAGS_use_mkldnn) {
+    auto& mutable_op_attrs = const_cast<framework::AttributeMap&>(op.Attrs());
+    mutable_op_attrs = attrs;
+  }
 #endif
  auto expected_kernel_key = op.GetExpectedKernelType(DygraphExecutionContext<VarType>(

From 2babd6ff675a7c1516aa656a4640d256605adcf7 Mon Sep 17 00:00:00 2001
From: Chen Weihang 
Date: Fri, 23 Oct 2020 11:34:21 +0800
Subject: [PATCH 043/185] Add compile limit for PADDLE_ENFORCE without error message (#28221)

* add compile limit for paddle enforce
* polish elementwise_op_function.cu.h
* fix failed unittest
* fix windows compile failed
* detail polish
* revert no type constructor
---
 .../framework/ir/attention_lstm_fuse_pass.cc  |  4 +++-
 .../memory/allocation/best_fit_allocator.cc   |  4 ++--
 .../memory/allocation/retry_allocator_test.cc |  5 ++--
 .../operators/affine_grid_cudnn_op.cu.cc      | 14 ++++-------
 paddle/fluid/operators/clip_op.h              |  3 ++-
 paddle/fluid/platform/errors.h                | 23 +++++--------------
 6 files changed, 21 insertions(+), 32 deletions(-)

diff --git a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
index a4b43086785b3..3fdc389102c5a 100644
--- a/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/attention_lstm_fuse_pass.cc
@@ -108,7 +108,9 @@ void FindWhileOp(Graph* graph) {
  GraphSafeRemoveNodes(graph, marked_nodes);
 }

-#define CHECK_P1(x) PADDLE_ENFORCE_NOT_NULL(x);
+#define CHECK_P1(x)                 \
+  PADDLE_ENFORCE_NOT_NULL(          \
+      x, platform::errors::NotFound("%s is a null pointer.", #x))
 #define CHECK_P2(x0, x1) \
  CHECK_P1(x0);           \
  CHECK_P1(x1);
diff --git a/paddle/fluid/memory/allocation/best_fit_allocator.cc b/paddle/fluid/memory/allocation/best_fit_allocator.cc
index 2b8d2164f68ad..e725a215ffa47 100644
--- a/paddle/fluid/memory/allocation/best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/best_fit_allocator.cc
@@ -158,8 +158,8 @@ Allocation* BestFitAllocator::AllocateImpl(size_t size) {
    }
  }
  if (UNLIKELY(highest_set_bit == free_chunks_.size())) {
-    PADDLE_THROW_BAD_ALLOC("Cannot allocate %d, All fragments size is %d", size,
-                           FreeSize());
+    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
+        "Cannot allocate %d, All fragments size is %d.", size, FreeSize()));
  }
  auto chunk_it = SplitChunk(size, highest_set_bit, map_it);
  return new BestFitAllocation(this, chunk_it);
diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc
index 5d3e133f97d38..b80e48460bf9f 100644
--- a/paddle/fluid/memory/allocation/retry_allocator_test.cc
+++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc
@@ -105,7 +105,8 @@ class DummyAllocator : public Allocator {

 protected:
  Allocation *AllocateImpl(size_t size) override {
-    PADDLE_THROW_BAD_ALLOC("Always BadAlloc");
+    PADDLE_THROW_BAD_ALLOC(platform::errors::ResourceExhausted(
+        "Here is a test exception, always BadAlloc."));
  }

  void FreeImpl(Allocation *) override {}
@@ -120,7 +121,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) {
      ASSERT_TRUE(false);
      allocation.reset();
    } catch (BadAlloc &ex) {
-      ASSERT_TRUE(std::string(ex.what()).find("Always BadAlloc") !=
+      ASSERT_TRUE(std::string(ex.what()).find("always BadAlloc") !=
                  std::string::npos);
    }
  }
diff --git a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
index 009c397e0074c..c09f71f46c81c 100644
--- a/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
+++ b/paddle/fluid/operators/affine_grid_cudnn_op.cu.cc
@@ -28,10 +28,9 @@ class CUDNNAffineGridOpKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_ENFORCE_EQ(
        platform::is_gpu_place(ctx.GetPlace()), true,
-        platform::errors::InvalidArgument("Only "
-                                          "support for CUDAPlace.Please switch "
-                                          "your context from CPUPlace to "
-                                          "CUDAPlace or update your cudnn."));
+        platform::errors::InvalidArgument(
+            "Only support for CUDAPlace.Please switch your context from "
+            "CPUPlace to CUDAPlace or update your cudnn."));
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
    auto handle = dev_ctx.cudnn_handle();
    auto* theta = ctx.Input<Tensor>("Theta");
@@ -106,12 +105,9 @@ class CUDNNAffineGridGradOpKernel : public framework::OpKernel<T> {
    const T* output_grad_data = output_grad->data<T>();
    T* theta_grad_data = theta_grad->mutable_data<T>(ctx.GetPlace());

-    PADDLE_ENFORCE_EQ(
+    PADDLE_ENFORCE_CUDA_SUCCESS(
        platform::dynload::cudnnSpatialTfGridGeneratorBackward(
-            handle, cudnn_st_desc, output_grad_data, theta_grad_data),
-        0,
-        "Some errors "
-        "has occurred during forward computation in cudnn;");
+            handle, cudnn_st_desc, output_grad_data, theta_grad_data));
  }
 };

diff --git a/paddle/fluid/operators/clip_op.h b/paddle/fluid/operators/clip_op.h
index 68f5d5460efd1..097b5e4863d6f 100644
--- a/paddle/fluid/operators/clip_op.h
+++ b/paddle/fluid/operators/clip_op.h
@@ -133,7 +133,8 @@ class ClipKernel : public framework::OpKernel<T> {
      trans(context.template device_context<DeviceContext>(), out_data,
            out_data + numel, out_data, ClipFunctor<T>(min, max));
    } else {
-      PADDLE_THROW("ClipOp only supports LoDTensor and SelectedRows");
+      PADDLE_THROW(platform::errors::Unavailable(
+          "ClipOp only supports LoDTensor and SelectedRows."));
    }
  }
 };
diff --git a/paddle/fluid/platform/errors.h b/paddle/fluid/platform/errors.h
index 5c145845fa657..a2f2e7c130ca2 100644
--- a/paddle/fluid/platform/errors.h
+++ b/paddle/fluid/platform/errors.h
@@ -30,33 +30,22 @@ typedef ::paddle::platform::error::Code Code;

 class ErrorSummary {
 public:
-  // Note(chenweihang): Final deprecated constructor
-  // This constructor is only used to be compatible with
-  // current existing no error message PADDLE_ENFORCE_*
-  ErrorSummary() {
-    code_ = paddle::platform::error::LEGACY;
-    msg_ =
-        "An error occurred here. There is no accurate error hint for this "
-        "error yet. We are continuously in the process of increasing hint for "
-        "this kind of error check. It would be helpful if you could inform us "
-        "of how this conversion went by opening a github issue. And we will "
-        "resolve it with high priority.\n"
-        "  - New issue link: "
-        "https://github.com/PaddlePaddle/Paddle/issues/new\n"
-        "  - Recommended issue content: all error stack information";
-  }
-
  // Note(chenweihang): Final deprecated constructor
  // This constructor is used to be compatible with
  // current existing untyped PADDLE_ENFORCE_*
  // PADDLE_ENFORCE
+  // Note(chenweihang): Windows openblas need this
+  // constructor for compiling PADDLE_ENFORCE in *.cu,
+  // this is a bug because we can't remove this
+  // constructor now.
  template <typename... Args>
  explicit ErrorSummary(Args... 
args) { code_ = paddle::platform::error::LEGACY; msg_ = paddle::string::Sprintf(args...); } - // Note(chenweihang): Recommended constructor + // Note(chenweihang): Only recommended constructor + // No longer supports PADDLE_ENFORCE without type or without error message explicit ErrorSummary(Code code, std::string msg) : code_(code), msg_(msg) {} Code code() const { return code_; } From 4877bd59448f0c15df7383a14a47e59195d984b7 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Fri, 23 Oct 2020 23:12:13 +0800 Subject: [PATCH 044/185] fix CUDA9 error due to BuildCustomizations (#28222) --- cmake/paddle_win.props | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/paddle_win.props b/cmake/paddle_win.props index 7e434c6d907cc..0115ad4b59fc4 100644 --- a/cmake/paddle_win.props +++ b/cmake/paddle_win.props @@ -15,11 +15,11 @@ InheritFromHost -ccbin "%(VCBinDir)" -x cu [GenerateRelocatableDeviceCode] [Include] [RequiredIncludes] [InterleaveSourceInPTX] [GPUDebugInfo] [GenerateLineInfo] [Keep] [KeepDir] [MaxRegCount] [PtxAsOptionV] [TargetMachinePlatform] [NvccCompilation] [CudaRuntime] [AdditionalOptions] - --use-local-env + --use-local-env --cl-version $(CudaClVersion) [CodeGeneration] -clean - - -Xcompiler "/EHsc [Warning] /nologo [Optimization] [ProgramDataBaseFileName] $(CudaForceSynchronousPdbWrites) [RuntimeChecks] [Runtime] [TypeInfo]" + + -Xcompiler "/EHsc [Warning] /nologo [Optimization] $(CudaForceSynchronousPdbWrites) [RuntimeChecks] [Runtime] [TypeInfo]" %(BaseCommandLineTemplate) [CompileOut] "%(FullPath)" %(BaseCommandLineTemplate) [HostDebugInfo] [Emulation] [FastMath] [Defines] %(HostCommandLineTemplate) [CompileOut] "%(FullPath)" From 81244fbfabe40971284f37faf2e35d80f39d6ffa Mon Sep 17 00:00:00 2001 From: mapingshuo Date: Mon, 26 Oct 2020 10:08:10 +0800 Subject: [PATCH 045/185] add sharding strategy in fleet(#27900) * add sharding --- .../framework/distributed_strategy.proto | 6 + .../fleet/base/distributed_strategy.py | 49 +++ .../fleet/meta_optimizers/__init__.py | 1 + .../fleet/meta_optimizers/common.py | 6 + .../fleet/meta_optimizers/dgc_optimizer.py | 8 + .../meta_optimizers/sharding/__init__.py | 13 + .../meta_optimizers/sharding/fp16_helper.py | 154 +++++++ .../sharding/gradient_clip_helper.py | 90 ++++ .../fleet/meta_optimizers/sharding/prune.py | 131 ++++++ .../fleet/meta_optimizers/sharding/shard.py | 144 ++++++ .../fleet/meta_optimizers/sharding/utils.py | 274 ++++++++++++ .../sharding/weight_decay_helper.py | 37 ++ .../meta_optimizers/sharding_optimizer.py | 411 ++++++++++++++++++ python/paddle/fluid/clip.py | 4 +- python/paddle/fluid/framework.py | 36 +- .../fluid/tests/unittests/CMakeLists.txt | 2 + .../unittests/fleet_meta_optimizer_base.py | 17 +- ...est_fleet_gradient_merge_meta_optimizer.py | 3 - .../test_fleet_sharding_meta_optimizer.py | 275 ++++++++++++ python/setup.py.in | 1 + 20 files changed, 1648 insertions(+), 14 deletions(-) create mode 100644 python/paddle/distributed/fleet/meta_optimizers/sharding/__init__.py create mode 100644 python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py create mode 100644 python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py create mode 100644 python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py create mode 100644 python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py create mode 100644 python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py create mode 100644 
python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py
 create mode 100644 python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
 create mode 100644 python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py

diff --git a/paddle/fluid/framework/distributed_strategy.proto b/paddle/fluid/framework/distributed_strategy.proto
index 881ef30ffe690..50b7d62547bb3 100644
--- a/paddle/fluid/framework/distributed_strategy.proto
+++ b/paddle/fluid/framework/distributed_strategy.proto
@@ -24,6 +24,10 @@ enum Mode {

 message RecomputeConfig { repeated string checkpoints = 1; }

+message ShardingConfig {
+  optional float fuse_broadcast_MB = 1 [ default = 32.0 ];
+}
+
 message AMPConfig {
  optional float init_loss_scaling = 1 [ default = 32768.0 ];
  optional int32 incr_every_n_steps = 2 [ default = 1000 ];
@@ -130,6 +134,7 @@ message DistributedStrategy {
  optional bool cudnn_batchnorm_spatial_persistent = 23 [ default = true ];
  optional bool adaptive_localsgd = 24 [ default = false ];
  optional bool fp16_allreduce = 25 [ default = false ];
+  optional bool sharding = 26 [ default = false ];

  optional RecomputeConfig recompute_configs = 101;
  optional AMPConfig amp_configs = 102;
@@ -141,6 +146,7 @@ message DistributedStrategy {
  optional LarsConfig lars_configs = 108;
  optional LambConfig lamb_configs = 109;
  optional AdaptiveLocalSGDConfig adaptive_localsgd_configs = 110;
+  optional ShardingConfig sharding_configs = 111;
  optional BuildStrategy build_strategy = 201;
  optional ExecutionStrategy execution_strategy = 202;
 }
diff --git a/python/paddle/distributed/fleet/base/distributed_strategy.py b/python/paddle/distributed/fleet/base/distributed_strategy.py
index 847050b404f01..71eca424fe650 100755
--- a/python/paddle/distributed/fleet/base/distributed_strategy.py
+++ b/python/paddle/distributed/fleet/base/distributed_strategy.py
@@ -611,6 +611,55 @@ def recompute_configs(self, configs):
                               "checkpoint_configs")
         assign_configs_value(self.strategy.recompute_configs, configs)

+    @property
+    def sharding(self):
+        """
+        Indicating whether we are using the sharding optimizer for memory
+        optimization.
+
+        Default value: False
+
+        Examples:
+          .. code-block:: python
+            import paddle.distributed.fleet as fleet
+            strategy = fleet.DistributedStrategy()
+            strategy.sharding = True
+        """
+        return self.strategy.sharding
+
+    @sharding.setter
+    @is_strict_auto
+    def sharding(self, flag):
+        if isinstance(flag, bool):
+            self.strategy.sharding = flag
+        else:
+            print("WARNING: sharding should have value of bool type")
+
+    @property
+    def sharding_configs(self):
+        """
+        Set sharding configurations.
+
+        **Note**:
+            fuse_broadcast_MB(float): size of a fused group of broadcasted parameters.
+
+        Examples:
+          .. 
code-block:: python + + import paddle.distributed.fleet as fleet + strategy = fleet.DistributedStrategy() + strategy.sharding = True + strategy.sharding_configs = {"fuse_broadcast_MB": 32} + """ + return get_msg_dict(self.strategy.sharding_configs) + + @sharding_configs.setter + @is_strict_auto + def sharding_configs(self, configs): + check_configs_key(self.strategy.sharding_configs, configs, + "sharding_configs") + assign_configs_value(self.strategy.sharding_configs, configs) + @property def pipeline(self): """ diff --git a/python/paddle/distributed/fleet/meta_optimizers/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/__init__.py index 2e63e82e630cc..cdc8162f6dee5 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/__init__.py +++ b/python/paddle/distributed/fleet/meta_optimizers/__init__.py @@ -24,3 +24,4 @@ from .dgc_optimizer import DGCOptimizer from .lamb_optimizer import LambOptimizer from .fp16_allreduce_optimizer import FP16AllReduceOptimizer +from .sharding_optimizer import ShardingOptimizer diff --git a/python/paddle/distributed/fleet/meta_optimizers/common.py b/python/paddle/distributed/fleet/meta_optimizers/common.py index 8ff4114bf8eda..0f7ca4f4294ae 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/common.py +++ b/python/paddle/distributed/fleet/meta_optimizers/common.py @@ -99,6 +99,12 @@ def _init_communicator(self, program, current_endpoint, endpoints, rank, OP_ROLE_KEY: OpRole.Forward }) + def _wait(self, current_endpoint, endpoints): + assert (self.wait_port) + other_endpoints = endpoints[:] + other_endpoints.remove(current_endpoint) + wait_server_ready(other_endpoints) + def _broadcast_params(self): block = self.startup_program.global_block() ring_id = -1 diff --git a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py index be614a0514738..7bd6832556933 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/dgc_optimizer.py @@ -30,6 +30,10 @@ def _set_basic_info(self, loss, role_maker, user_defined_optimizer, super(DGCOptimizer, self)._set_basic_info( loss, role_maker, user_defined_optimizer, user_defined_strategy) + def _init_dgc_opt(self): + if self.dgc_opt is not None: + return + opt = self.inner_opt if not self.role_maker._is_collective: @@ -86,13 +90,16 @@ def backward(self, parameter_list=None, no_grad_set=None, callbacks=None): + self._init_dgc_opt() return self.dgc_opt.backward(loss, startup_program, parameter_list, no_grad_set, callbacks) def apply_gradients(self, params_grads): + self._init_dgc_opt() return self.dgc_opt.apply_gradients(params_grads=params_grads) def apply_optimize(self, loss, startup_program, params_grads): + self._init_dgc_opt() return self.dgc_opt.apply_optimize( loss, startup_program=startup_program, params_grads=params_grads) @@ -101,6 +108,7 @@ def minimize_impl(self, startup_program=None, parameter_list=None, no_grad_set=None): + self._init_dgc_opt() optimize_ops, params_grads = \ self.dgc_opt.minimize(loss, startup_program, parameter_list, no_grad_set) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/__init__.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/__init__.py new file mode 100644 index 0000000000000..5d358dbd35fa8 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py new file mode 100644 index 0000000000000..cf6ab514b0bfe --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/fp16_helper.py @@ -0,0 +1,154 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.distributed.fleet.meta_optimizers.common import is_optimizer_op, OP_ROLE_KEY, OpRole +from paddle.distributed.fleet.meta_optimizers.sharding.utils import * + +from paddle.fluid import core + + +class FP16Utils(object): + def __init__(self): + pass + + @staticmethod + def is_fp16_cast_op(block, op, params): + if op.type != "cast": + return False + if is_optimizer_op(op): + return False + assert (len(op.desc.input_arg_names()) == 1) + assert (len(op.desc.output_arg_names()) == 1) + input_name, output_name = op.desc.input_arg_names()[ + 0], op.desc.output_arg_names()[0] + if input_name not in params: + return False + input_var = block.var(input_name) + output_var = block.var(output_name) + if input_var.dtype != core.VarDesc.VarType.FP32 or \ + output_var.dtype != core.VarDesc.VarType.FP16: + return False + return True + + @staticmethod + def is_fp32_cast_op(block, op): + if op.type != "cast": + return False + if not is_optimizer_op(op): + return False + assert (len(op.desc.input_arg_names()) == 1) + assert (len(op.desc.output_arg_names()) == 1) + input_name, output_name = op.desc.input_arg_names()[ + 0], op.desc.output_arg_names()[0] + input_var = block.var(input_name) + output_var = block.var(output_name) + if input_var.dtype != core.VarDesc.VarType.FP16 or \ + output_var.dtype != core.VarDesc.VarType.FP32: + return False + return True + + @staticmethod + def remove_cast_op(block, params, segment, offset): + inserted_op_num = 0 + for op_idx in reversed( + range(offset + segment._start_idx, offset + segment._end_idx)): + op = block.ops[op_idx] + if FP16Utils.is_fp16_cast_op(block, op, params): + block._remove_op(op_idx, sync=False) + inserted_op_num -= 1 + block._sync_with_cpp() + return inserted_op_num + + @staticmethod + def prune_fp16(block, shard, reduced_grads_to_param, nrings): + # remove cast + for idx, op in reversed(list(enumerate(block.ops))): + if not FP16Utils.is_fp32_cast_op(block, op): + continue + output_name = op.desc.output_arg_names()[0] + param_name = output_name.strip("@GRAD") + if 
param_name not in shard.global_params:
+                raise ValueError("Input 'X' of check_finite_and_unscale must "
+                                 "be grads, but {} is not a grad".format(
+                                     output_name))
+            if output_name in reduced_grads_to_param:
+                continue
+            if shard.has_param(param_name):
+                continue
+            block._remove_op(idx, sync=False)
+            block._remove_var(output_name, sync=False)
+
+        block._sync_with_cpp()
+        update_loss_scaling_op_idx = -1
+        inf_var_name = ''
+        for idx, op in reversed(list(enumerate(block.ops))):
+            if op.type == "update_loss_scaling":
+                update_loss_scaling_op_idx = idx
+                inf_var_name = op.desc.input('FoundInfinite')[0]
+                op._rename_input(inf_var_name, inf_var_name + "@sharding")
+            if op.type in ["check_finite_and_unscale", "update_loss_scaling"]:
+                reversed_x = []
+                for input_name in op.desc.input('X'):
+                    param_name = input_name.strip("@GRAD")
+                    if param_name not in shard.global_params:
+                        raise ValueError(
+                            "Input 'X' of check_finite_and_unscale must "
+                            "be grads, but {} is not a grad".format(input_name))
+                    if shard.has_param(param_name):
+                        reversed_x.append(input_name)
+                op.desc.set_input('X', reversed_x)
+                op.desc.set_output('Out', reversed_x)
+        if update_loss_scaling_op_idx == -1:
+            return
+        inf_var = block.var(inf_var_name)
+        inf_var_fp32 = block.create_var(
+            name=inf_var_name + "@cast_int32",
+            shape=inf_var.shape,
+            dtype=core.VarDesc.VarType.INT32)
+        inf_var_sharding = block.create_var(
+            name=inf_var_name + "@sharding",
+            shape=inf_var.shape,
+            dtype=inf_var.dtype)
+        block._insert_op_without_sync(
+            update_loss_scaling_op_idx,
+            type='cast',
+            inputs={'X': inf_var},
+            outputs={'Out': inf_var_fp32},
+            attrs={
+                "in_dtype": inf_var.dtype,
+                "out_dtype": inf_var_fp32.dtype,
+                OP_ROLE_KEY: OpRole.Optimize
+            })
+        insert_sync_calc_op(block, update_loss_scaling_op_idx + 1,
+                            [inf_var_fp32])
+        block._insert_op_without_sync(
+            update_loss_scaling_op_idx + 2,
+            type='c_allreduce_max',
+            inputs={'X': inf_var_fp32},
+            outputs={'Out': inf_var_fp32},
+            attrs={'ring_id': 0,
+                   OP_ROLE_KEY: OpRole.Optimize})
+        comm_op_num = insert_sync_comm_ops(
+            block, update_loss_scaling_op_idx + 3, nrings, [inf_var_fp32])
+        block._insert_op_without_sync(
+            update_loss_scaling_op_idx + 3 + comm_op_num,
+            type='cast',
+            inputs={'X': inf_var_fp32},
+            outputs={'Out': inf_var_sharding},
+            attrs={
+                "in_dtype": inf_var_fp32.dtype,
+                "out_dtype": inf_var_sharding.dtype,
+                OP_ROLE_KEY: OpRole.Optimize
+            })
+        block._sync_with_cpp()
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py
new file mode 100644
index 0000000000000..afa46f43fc0fe
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/gradient_clip_helper.py
@@ -0,0 +1,90 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
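+#
+# NOTE: GradientClipHelper below prunes the gradient-clip (global-norm)
+# subgraph so that each worker only computes the squared-norm contribution
+# of the parameters it owns; the c_allreduce_sum inserted after the local
+# "sum" op then restores the global norm before the clip is applied.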
+ +from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_KEY, OpRole + + +class GradientClipHelper(object): + def __init__(self): + pass + + def _is_gradient_clip_op(self, op): + return op.desc.has_attr("op_namescope") \ + and op.desc.attr("op_namescope").startswith("/gradient_clip") + + def prune_gradient_clip(self, block, shard): + deperated_vars = set() + deperate_op_idx = set() + for idx, op in enumerate(block.ops): + if not self._is_gradient_clip_op(op): + continue + if op.type == "sum": + continue + deperate_op = False + for input_name in op.desc.input_arg_names(): + if input_name in deperated_vars: + deperate_op = True + param_name = input_name.strip("@GRAD") + if shard.is_param(param_name) and \ + not shard.has_param(param_name): + deperate_op = True + + if deperate_op: + deperate_op_idx.add(idx) + for output_name in op.desc.output_arg_names(): + deperated_vars.add(output_name) + + if not deperated_vars: + # got no gradient_clip op + return + + for idx, op in reversed(list(enumerate(block.ops))): + if not self._is_gradient_clip_op(op): + continue + if idx in deperate_op_idx: + block._remove_op(idx, sync=False) + continue + reversed_inputs = [] + if op.type == "sum": + for input_name in op.desc.input_arg_names(): + if input_name not in deperated_vars: + reversed_inputs.append(input_name) + op.desc.set_input("X", reversed_inputs) + assert (len(op.desc.output_arg_names()) == 1) + sum_res = op.desc.output_arg_names()[0] + block._insert_op_without_sync( + idx + 1, + type='c_sync_comm_stream', + inputs={'X': sum_res}, + outputs={'Out': sum_res}, + attrs={'ring_id': 0, + OP_ROLE_KEY: OpRole.Optimize}) + block._insert_op_without_sync( + idx + 1, + type='c_allreduce_sum', + inputs={'X': sum_res}, + outputs={'Out': sum_res}, + attrs={'ring_id': 0, + OP_ROLE_KEY: OpRole.Optimize}) + block._insert_op_without_sync( + idx + 1, + type='c_sync_calc_stream', + inputs={'X': sum_res}, + outputs={'Out': sum_res}, + attrs={OP_ROLE_KEY: OpRole.Optimize}) + + for var_name in deperated_vars: + block._remove_var(var_name, sync=False) + block._sync_with_cpp() + return diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py new file mode 100644 index 0000000000000..7348e5f6d1445 --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/prune.py @@ -0,0 +1,131 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
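+#
+# NOTE: ProgramDeps builds, for one block, the maps from each var to the ops
+# that consume it and the ops that generate it (recursing into
+# conditional_block sub-blocks), so that an op can be removed once all of its
+# outputs are unneeded. A minimal usage sketch, mirroring how the sharding
+# optimizer prunes its main program (variable names here are illustrative):
+#
+#     deps = ProgramDeps(block, reduced_grad_names, pruned_opti_var_names)
+#     for idx, op in reversed(list(enumerate(block.ops))):
+#         if deps.should_remove_op(idx):
+#             deps.remove_op(idx)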
+ + +class ProgramDeps(object): + def __init__(self, block, start_vars, end_vars): + self._block = block + # vars where to start to build the deps + self._start_vars = start_vars + # vars where to stop to build the deps + self._end_vars = end_vars + # var name -> op idxs which depends on this var + self._var_to_use_op = {} + # sub block deps which is a subset of this topo + self._sub_block_deps = {} + # var name -> op idxs which generate var + self._var_to_generate_op = {} + self._should_removed_var = set() + self._father_block_deps = None + self._build_deps() + + def get_sub_block_deps(self, idx): + if idx in self._sub_block_deps: + return self._sub_block_deps[idx] + else: + return None + + def get_var_deps(self, var_name): + if var_name in self._var_to_use_op: + return self._var_to_use_op[var_name] + else: + return None + + def _build_deps(self, ): + for var_name in self._start_vars: + self._var_to_use_op[var_name] = [] + self._var_to_generate_op[var_name] = [] + + for idx, op in enumerate(self._block.ops): + if op.type in [ + "c_allreduce_sum", "c_sync_comm_stream", + "c_calc_comm_stream" + ]: + continue + input_vars = op.desc.input_arg_names() + output_vars = op.desc.output_arg_names() + deps_reduce = False + for input_name in input_vars: + if input_name in self._var_to_use_op: + deps_reduce = True + if not deps_reduce: + continue + for input_name in input_vars: + if input_name in self._var_to_use_op: + self._var_to_use_op[input_name].append(idx) + for output_name in output_vars: + if output_name not in self._var_to_use_op: + self._var_to_use_op[output_name] = [] + if output_name not in self._var_to_generate_op: + self._var_to_generate_op[output_name] = [idx] + else: + self._var_to_generate_op[output_name].append(idx) + if op.type == "conditional_block": + # subblock + assert (op.desc.has_attr("sub_block")) + subblock_idx = op.desc.attr("sub_block").id + subblock_deps = ProgramDeps( + self._block.program.block(subblock_idx), + op.desc.input_arg_names(), op.desc.output_arg_names()) + self._sub_block_deps[subblock_idx] = subblock_deps + subblock_deps._father_block_deps = self + + def crop_input_var_from_op(self, op_idx, var_name): + if var_name in self._var_to_use_op: + # update var -> dep_var_op + if self._var_to_use_op[var_name] != []: + if op_idx not in self._var_to_use_op[var_name]: + raise ValueError( + "op_idx: {} is not in self._var_to_use_op[{}], " + "self._var_to_use_op[{}] is {}".format( + op_idx, var_name, var_name, self._var_to_use_op[ + var_name])) + self._var_to_use_op[var_name].remove(op_idx) + # update _should_removed_var + if var_name in self._start_vars: + self._should_removed_var.discard(var_name) + elif self._var_to_use_op[ + var_name] == []: # no more deps of this var + self._should_removed_var.add(var_name) + elif self._var_to_generate_op[var_name][-1] >= self._var_to_use_op[ + var_name][-1]: + # there are circle in the graph + self._should_removed_var.add(var_name) + else: # input_name should not be deleted + self._should_removed_var.discard(var_name) + + def crop_output_var_from_op(self, op_idx, var_name): + if var_name in self._var_to_generate_op: + assert (op_idx in self._var_to_generate_op[var_name]) + self._var_to_generate_op[var_name].remove(op_idx) + if self._block.has_var(var_name): + if var_name not in self._var_to_generate_op or self._var_to_generate_op[ + var_name] == []: + self._block._remove_var(var_name, sync=False) + + def remove_op(self, op_idx): + # update deps + op = self._block.ops[op_idx] + for input_name in op.desc.input_arg_names(): + 
self.crop_input_var_from_op(op_idx, input_name) + for output_name in op.desc.output_arg_names(): + self.crop_output_var_from_op(op_idx, output_name) + self._block._remove_op(op_idx, sync=False) + + def should_remove_op(self, op_idx): + op = self._block.ops[op_idx] + for output_name in op.desc.output_arg_names(): + if output_name not in self._should_removed_var: + return False + return True diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py new file mode 100644 index 0000000000000..27c63fc406fcb --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/shard.py @@ -0,0 +1,144 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle.distributed.fleet.meta_optimizers.common import is_optimizer_op +from paddle.distributed.fleet.meta_optimizers.sharding.utils import * +from paddle.distributed.fleet.meta_optimizers.sharding.fp16_helper import FP16Utils + + +class Shard(object): + def __init__(self, ): + self.global_params = set([]) + self.worker_idx = -1 + self.worker_num = -1 + self.global_param2device = {} + + def setup(self, params_grads, worker_idx, worker_num): + # param names of all devices + self.global_params = set([x[0].name for x in params_grads]) + # _param(str) -> device_id(int) + self.worker_idx = worker_idx + self.worker_num = worker_num + # global_param2device contains fp32 params and fp16 params + self.global_param2device = self._split_params(params_grads, worker_idx, + worker_num) + + def has_param(self, var_name): + return var_name in self.global_param2device and \ + self._var_device_id(var_name) == self.worker_idx + + def has_opt_var(self, var_name): + return self._var_device_id(var_name) == self.worker_idx + + def has_var(self, var_name): + return self._var_device_id(var_name) == -1 or \ + self._var_device_id(var_name) == self.worker_idx + + def _split_params(self, params_grads, worker_idx, worker_num): + param2device = {} + total_param_mem = 0.0 + param2mem = [] + for param in [x[0] for x in params_grads]: + mem = get_var_size(param) + total_param_mem += mem + param2mem.append((param.name, mem)) + device2params = {x: [] for x in range(worker_num)} + device_idx = 0 + mem_accu = 0.0 + for param_name, mem in param2mem: + if mem_accu > total_param_mem * 1.0 * (device_idx + 1) / worker_num: + device_idx += 1 + device2params[device_idx].append(param_name) + param2device[param_name] = device_idx + mem_accu += mem + return param2device + + def _var_device_id(self, var_name): + if var_name in self.global_param2device: + return self.global_param2device[var_name] + for suffix in [ + "_moment1_0", "_moment2_0", "_beta1_pow_acc_0", + "_beta2_pow_acc_0", "_velocity_0" + ]: + base_name = re.sub(suffix, '', var_name) + if base_name in self.global_param2device: + return self.global_param2device[base_name] + return -1 + + def find_broadcast_params(self, block): + broadcast_vars = set([]) + fp16_params = set([]) 
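+        # The two passes below work as follows: first, count how many
+        # non-optimizer ops consume each global param; then treat the fp16
+        # output of every param cast as a broadcast var that lives on the
+        # device of its fp32 master. Any param still consumed directly
+        # (usage count > 0) must be broadcast in fp32 as well.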
+        fp16_to_fp32 = {}
+
+        param_usage = {x: 0 for x in self.global_params}
+        for op in block.ops:
+            if is_optimizer_op(op):
+                continue
+            for input_name in op.desc.input_arg_names():
+                if input_name in self.global_params:
+                    param_usage[input_name] += 1
+
+        for op in block.ops:
+            if not FP16Utils.is_fp16_cast_op(block, op, self.global_params):
+                continue
+            input_name = op.input_arg_names[0]
+            output_name = op.output_arg_names[0]
+            broadcast_vars.add(output_name)
+            fp16_params.add(output_name)
+            fp16_to_fp32[output_name] = input_name
+            param_usage[input_name] -= 1
+            self.global_param2device[output_name] = self.global_param2device[
+                input_name]
+
+        for param, usage in param_usage.items():
+            if usage > 0:
+                broadcast_vars.add(param)
+        return broadcast_vars
+
+    def device(self, var_name):
+        return self._var_device_id(var_name)
+
+    def is_param(self, var_name):
+        return var_name in self.global_params
+
+    def is_opti_var(self, var_name):
+        if var_name in self.global_params:
+            return True
+        for suffix in [
+                "_moment1_0", "_moment2_0", "_beta1_pow_acc_0",
+                "_beta2_pow_acc_0", "_velocity_0"
+        ]:
+            base_name = re.sub(suffix, '', var_name)
+            if base_name in self.global_params:
+                return True
+        return False
+
+
+class ProgramSegment(object):
+    def __init__(self, block):
+        self._block = block
+        self._allreduce_vars = []
+        # sub program start idx
+        self._start_idx = -1
+        # sub program end idx
+        self._end_idx = -1
+        # param name to broadcast name
+        self._param2broadcast = {}
+        self._broadcast_vars = []
+        # cast op pairs, fp16 name (str) -> fp32 name (str)
+        self._cast_ops = {}
+        # fill constant vars
+        self._fill_constant_vars = []
+        # parameter mems
+        self._param_mem = 0.0
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
new file mode 100644
index 0000000000000..51435ebb9e5e9
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/utils.py
@@ -0,0 +1,274 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.fluid import core
+from functools import reduce
+from paddle.distributed.fleet.meta_optimizers.common import is_loss_grad_op
+from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_KEY, OP_ROLE_VAR_KEY
+
+import re
+
+
+def check_broadcast(block):
+    """
+    if a var is broadcast, there should be a sync_comm op before
+    the var is used; if not, raise an error.
+    if the broadcast var is produced by a fill_constant op, that
+    fill_constant op should come before the broadcast op, and before a
+    sync_calc op; otherwise, raise an error.
+    """
+    broadcast_vars = {}
+    for idx, op in enumerate(block.ops):
+        if op.type == "c_broadcast":
+            var_name = op.desc.input_arg_names()[0]
+            if "@BroadCast" in var_name:
+                if var_name in broadcast_vars:
+                    raise ValueError("var_name already exists: {}, "
+                                     "the old pos is {}, the new pos is {}".
+                                     format(var_name, broadcast_vars[var_name][
+                                         "broadcast_pos"], idx))
+                broadcast_vars[var_name] = {
+                    "fill_constant_pos": -1,
+                    "broadcast_pos": idx,
+                }
+
+    for idx, op in enumerate(block.ops):
+        if op.type == "fill_constant":
+            var_name = op.desc.output_arg_names()[0]
+            if var_name in broadcast_vars:
+                broadcast_vars[var_name]["fill_constant_pos"] = idx
+            continue
+
+    last_sync_comm_op_idx = -1
+    last_sync_calc_op_idx = -1
+    for idx, op in enumerate(block.ops):
+        if op.type == "c_sync_comm_stream":
+            last_sync_comm_op_idx = idx
+            continue
+        if op.type == "c_sync_calc_stream":
+            last_sync_calc_op_idx = idx
+            continue
+        if op.type == "c_broadcast":
+            var_name = op.desc.input_arg_names()[0]
+            if "@BroadCast" in var_name:
+                if broadcast_vars[var_name]["fill_constant_pos"] != -1:
+                    assert (last_sync_calc_op_idx != -1)
+                    assert (broadcast_vars[var_name]["fill_constant_pos"] <
+                            last_sync_calc_op_idx)
+                    assert (last_sync_calc_op_idx < idx)
+                continue
+        for input_name in op.desc.input_arg_names():
+            if input_name in broadcast_vars:
+                assert (broadcast_vars[input_name]["broadcast_pos"] != -1)
+                assert (broadcast_vars[input_name]["broadcast_pos"] <
+                        last_sync_comm_op_idx)
+                assert (last_sync_comm_op_idx < idx)
+    return
+
+
+def check_allreduce_sum(block):
+    """
+    if a Var is allreduced, the op order should be:
+        - 0: op that generates the Var
+        - 1: sync_calc
+        - 2: allreduce_sum op
+        - 3: sync_comm
+        - 4: op that uses the Var
+    """
+    var_status = {}
+    for op in block.ops:
+        if op.type == "c_allreduce_sum":
+            var_name = op.desc.input_arg_names()[0]
+            var_status[var_name] = -1
+
+    for op in block.ops:
+        if op.type == "c_sync_calc_stream":
+            for var_name in var_status:
+                if var_name in var_status and var_status[var_name] == 0:
+                    var_status[var_name] = 1
+        elif op.type == "c_allreduce_sum":
+            var_name = op.desc.input_arg_names()[0]
+            if var_status[var_name] == -1:
+                raise ValueError("{} is not generated, but you are "
+                                 "trying to all-reduce it".format(var_name))
+            if var_status[var_name] == 0:
+                raise ValueError("There should be a sync_calc op "
+                                 "after generating Var: {} and before the "
+                                 "c_allreduce_sum op".format(var_name))
+            assert (var_status[var_name] == 1)
+            var_status[var_name] = 2
+        elif op.type == "c_sync_comm_stream":
+            for var_name in op.desc.input_arg_names():
+                if var_name in var_status and var_status[var_name] == 2:
+                    var_status[var_name] = 3
+        else:
+            for input_name in op.desc.input_arg_names():
+                if input_name in var_status:
+                    if var_status[input_name] != 3:
+                        raise ValueError("There should be a sync_comm op "
+                                         "after all-reducing the Var: {}".format(
+                                             input_name))
+            for output_name in op.desc.output_arg_names():
+                if output_name in var_status and \
+                    var_status[output_name] == -1:
+                    var_status[output_name] = 0
+    return
+
+
+def insert_sync_calc_op(block, insert_idx, calc_dep_vars):
+    """
+    _insert_sync_calc_op
+    """
+    op_role = block.ops[insert_idx].attr('op_role')
+    block._insert_op_without_sync(
+        insert_idx,
+        type='c_sync_calc_stream',
+        inputs={'X': calc_dep_vars},
+        outputs={'Out': calc_dep_vars},
+        attrs={OP_ROLE_KEY: op_role})
+    return
+
+
+def insert_sync_comm_ops(block, insert_idx, nrings, comm_dep_vars):
+    """
+    _insert_sync_comm_ops
+    """
+    op_role = block.ops[insert_idx].attr('op_role')
+    for i in range(nrings):
+        block._insert_op_without_sync(
+            insert_idx,
+            type='c_sync_comm_stream',
+            inputs={'X': comm_dep_vars},
+            outputs={'Out': comm_dep_vars},
+            attrs={'ring_id': i,
+                   OP_ROLE_KEY: op_role})
+    return nrings
+
+
+def insert_fill_constant_ops(block, insert_idx, 
fill_constant_vars): + """ + _add_fill_constant_ops + """ + op_role = block.ops[insert_idx].attr('op_role') + for broadcast_name in fill_constant_vars: + broadcast_var = block.var(broadcast_name) + block._insert_op_without_sync( + insert_idx, + type="fill_constant", + outputs={"Out": broadcast_var.name}, + attrs={ + "shape": broadcast_var.shape, + "dtype": broadcast_var.dtype, + "value": 0.0, + OP_ROLE_KEY: op_role + }) + return + + +def insert_cast_ops(block, insert_idx, cast_ops): + """ + _add_cast_ops + """ + op_role = block.ops[insert_idx].attr('op_role') + for fp16_name, fp32_name in cast_ops.items(): + block._insert_op_without_sync( + insert_idx, + type="cast", + inputs={"X": fp32_name}, + outputs={"Out": fp16_name}, + attrs={ + "in_dtype": core.VarDesc.VarType.FP32, + "out_dtype": core.VarDesc.VarType.FP16, + OP_ROLE_KEY: op_role + }) + return + + +def insert_allreduce_ops(block, insert_idx, nrings, allreduce_vars): + """ + _add_allreduce_ops + """ + ring_id = -1 + for var in allreduce_vars: + ring_id = (ring_id + 1) % nrings + block._insert_op_without_sync( + insert_idx, + type='c_allreduce_sum', + inputs={'X': var}, + outputs={'Out': var}, + attrs={'ring_id': ring_id, + OP_ROLE_KEY: OpRole.Backward}) + return + + +def insert_broadcast_ops(block, insert_idx, nrings, broadcast2root): + """ + _add_broadcast_ops + """ + ring_id = -1 + op_role = block.ops[insert_idx].attr('op_role') + for broadcast_name, root_device in broadcast2root: + ring_id = (ring_id + 1) % nrings + block._insert_op_without_sync( + insert_idx, + type='c_broadcast', + inputs={'X': broadcast_name}, + outputs={'Out': broadcast_name}, + attrs={ + 'ring_id': ring_id, + 'root': root_device, + OP_ROLE_KEY: op_role + }) + return + + +DtypeToSize = { + core.VarDesc.VarType.FP16: 2, + core.VarDesc.VarType.FP32: 4, + core.VarDesc.VarType.FP64: 8, + core.VarDesc.VarType.INT16: 2, + core.VarDesc.VarType.INT32: 4, + core.VarDesc.VarType.INT64: 8, + core.VarDesc.VarType.BOOL: 1, + core.VarDesc.VarType.UINT8: 1, +} + + +def get_var_size(param): + """ + input: + - param: var + return: + var size in Bytes + """ + assert -1 not in param.shape + return reduce(lambda x, y: x * y, + param.shape) * DtypeToSize[param.dtype] / 1024.0 / 1024.0 + + +def insert_scale_loss_grad_ops(block, scale=1.0): + ''' + In order to keep the learning rate consistent in different numbers of + training workers, we scale the loss grad by the number of workers + ''' + for idx, op in reversed(list(enumerate(block.ops))): + if is_loss_grad_op(op): + loss_grad_var = block.vars[op.output_arg_names[0]] + block._insert_op_without_sync( + idx + 1, + type='scale', + inputs={'X': loss_grad_var}, + outputs={'Out': loss_grad_var}, + attrs={'scale': scale, + OP_ROLE_KEY: OpRole.Backward}) diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py new file mode 100644 index 0000000000000..2833e8c6dac4b --- /dev/null +++ b/python/paddle/distributed/fleet/meta_optimizers/sharding/weight_decay_helper.py @@ -0,0 +1,37 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle.distributed.fleet.meta_optimizers.common import OP_ROLE_VAR_KEY
+
+
+class WeightDecayHelper(object):
+    def __init__(self):
+        pass
+
+    def _is_weight_decay_op(self, op):
+        return op.desc.has_attr("op_namescope") \
+            and op.desc.attr("op_namescope").startswith("/regularization")
+
+    def prune_weight_decay(self, block, shard):
+        for idx, op in reversed(list(enumerate(block.ops))):
+            if not self._is_weight_decay_op(op):
+                continue
+            if OP_ROLE_VAR_KEY not in op.attr_names:
+                raise ValueError(
+                    "The Weight Decay op should hold op_role_var attribute "
+                    "but the {} op does not hold op_role_var".format(op.type))
+            op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
+            if not shard.has_param(op_role_var[0]):
+                block._remove_op(idx, sync=False)
+        block._sync_with_cpp()
diff --git a/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
new file mode 100644
index 0000000000000..a449821f8c212
--- /dev/null
+++ b/python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
@@ -0,0 +1,411 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
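+#
+# NOTE: ShardingOptimizer shards the parameters and optimizer states across
+# workers: minimize_impl() sets up the NCCL rings, cuts the main block into
+# segments holding at most fuse_broadcast_MB of parameters, inserts the
+# broadcast/allreduce ops (with their sync ops), scales the loss gradient by
+# 1 / worker_num, and finally prunes the ops and vars this worker does not
+# own. A typical way to enable it, mirroring the DistributedStrategy docs
+# earlier in this series (the surrounding model/optimizer code is assumed):
+#
+#     import paddle.distributed.fleet as fleet
+#     strategy = fleet.DistributedStrategy()
+#     strategy.sharding = True
+#     strategy.sharding_configs = {"fuse_broadcast_MB": 32}
+#     optimizer = fleet.distributed_optimizer(optimizer, strategy)
+#     optimizer.minimize(loss)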
+ +from paddle.fluid import unique_name, core +import paddle.fluid as fluid + +from paddle.distributed.fleet.meta_optimizers.common import OpRole, OP_ROLE_VAR_KEY, CollectiveHelper +from paddle.distributed.fleet.meta_optimizers.common import is_backward_op +from paddle.distributed.fleet.meta_optimizers.meta_optimizer_base import MetaOptimizerBase +from paddle.distributed.fleet.meta_optimizers.sharding.shard import Shard, ProgramSegment +from paddle.distributed.fleet.meta_optimizers.sharding.fp16_helper import FP16Utils +from paddle.distributed.fleet.meta_optimizers.sharding.weight_decay_helper import WeightDecayHelper +from paddle.distributed.fleet.meta_optimizers.sharding.gradient_clip_helper import GradientClipHelper +from paddle.distributed.fleet.meta_optimizers.sharding.prune import ProgramDeps +from paddle.distributed.fleet.meta_optimizers.sharding.utils import * + +from functools import reduce + +__all__ = ["ShardingOptimizer"] + + +class ShardingOptimizer(MetaOptimizerBase): + def __init__(self, optimizer): + super(ShardingOptimizer, self).__init__(optimizer) + self.inner_opt = optimizer + self.meta_optimizers_white_list = [ + "RecomputeOptimizer", + "AMPOptimizer", + ] + self.meta_optimizers_black_list = ["GraphExecutionOptimizer", ] + self._main_program = None + self._startup_program = None + self._segments = [] + # params and fp16 params is for broadcast + self._params = set([]) + self._broadcast_vars = set([]) + # reduced grads to param name + self._reduced_grads_to_param = {} + self._shard = Shard() + + def _can_apply(self): + if not self.role_maker._is_collective: + return False + if self.role_maker._worker_num() <= 1: + return False + return self.user_defined_strategy.sharding + + def _disable_strategy(self, dist_strategy): + dist_strategy.sharding = False + dist_strategy.sharding_configs = {} + + def _enable_strategy(self, dist_strategy, context): + dist_strategy.sharding = True + dist_strategy.sharding_configs = {"fuse_broadcast_MB": 32} + + def minimize_impl(self, + loss, + startup_program=None, + parameter_list=None, + no_grad_set=None): + self._nrings = self.user_defined_strategy.nccl_comm_num + self._fuse_broadcast_MB = self.user_defined_strategy.sharding_configs[ + "fuse_broadcast_MB"] + + if self.inner_opt is None: + raise ValueError( + "self.inner_opt of ShardingOptimizer should not be None.") + optimize_ops, params_grads = self.inner_opt.minimize( + loss, startup_program, parameter_list, no_grad_set) + + if startup_program is None: + startup_program = default_startup_program() + main_block = loss.block + startup_block = startup_program.global_block() + self._main_program = main_block.program + self._startup_program = startup_program + + # step1: set_up + self._set_up(params_grads) + + # step2: split_program + self._split_program(main_block) + + # step3: add broadcast and reduce ops + self._add_broadcast_allreduce(main_block) + main_block._sync_with_cpp() + startup_block._sync_with_cpp() + + # step4: insert reduce_sum for grad + insert_scale_loss_grad_ops( + main_block, scale=1.0 / self.role_maker._worker_num()) + main_block._sync_with_cpp() + + # step5: remove unneeded ops and vars from block + self._prune_main_program(main_block) + self._prune_startup_program(startup_block) + + # check op dependecy + check_broadcast(main_block) + check_allreduce_sum(main_block) + self._wait() + return optimize_ops, params_grads + + def _set_up(self, params_grads): + # step 1: initialize nccl + worker_idx = self.role_maker._worker_index() + endpoints = 
self.role_maker._get_trainer_endpoints()
+        current_endpoint = endpoints[worker_idx]
+        self._collective_helper = CollectiveHelper(self.role_maker,
+                                                   self._nrings)
+        for ring_id in range(self._nrings):
+            self._collective_helper._init_communicator(
+                self._startup_program, current_endpoint, endpoints, worker_idx,
+                ring_id, None)
+        startup_block = self._startup_program.global_block()
+        startup_block._sync_with_cpp()
+
+        # step 2: split params
+        self._params = set([x[0].name for x in params_grads])
+        self._shard.setup(params_grads, worker_idx,
+                          self.role_maker._worker_num())
+
+        # step 3: get broadcast vars
+        self._broadcast_vars = self._shard.find_broadcast_params(
+            self._main_program.global_block())
+
+    def _wait(self, ):
+        endpoints = self.role_maker._get_trainer_endpoints()
+        current_endpoint = endpoints[self.role_maker._worker_index()]
+        if self.role_maker._worker_index() == 0:
+            self._collective_helper._wait(current_endpoint, endpoints)
+
+    def _split_program(self, block):
+        for op_idx, op in reversed(list(enumerate(block.ops))):
+            if int(op.attr('op_role')) != int(OpRole.Optimize):
+                last_backward_op_idx = op_idx + 1
+                break
+        segment = ProgramSegment(block)
+        segment._end_idx = last_backward_op_idx
+        for op_idx in reversed(range(last_backward_op_idx)):
+            op = block.ops[op_idx]
+            assert (int(op.attr('op_role')) != int(OpRole.Optimize))
+            if segment._param_mem >= self._fuse_broadcast_MB:
+                segment._start_idx = op_idx + 1
+                self._segments.insert(0, segment)
+                segment = ProgramSegment(block)
+                segment._end_idx = op_idx + 1
+
+            # find broadcast vars
+            for input_name in op.desc.input_arg_names():
+                if input_name not in self._broadcast_vars:
+                    continue
+                if input_name in segment._param2broadcast:
+                    # skip broadcast because it reuses the old broadcast var
+                    broadcast_name = segment._param2broadcast[input_name]
+                    if input_name != broadcast_name:
+                        op._rename_input(input_name, broadcast_name)
+                    continue
+                if self._shard.has_param(input_name):
+                    broadcast_var_name = input_name
+                else:
+                    broadcast_var_name = unique_name.generate(input_name +
+                                                              "@BroadCast")
+                    segment._fill_constant_vars.append(broadcast_var_name)
+                segment._param2broadcast[input_name] = broadcast_var_name
+                segment._broadcast_vars.append((broadcast_var_name,
+                                                self._shard.device(input_name)))
+                segment._param_mem += get_var_size(
+                    self._main_program.global_block().var(input_name))
+
+            # find reduce vars
+            if is_backward_op(op) and \
+                    OP_ROLE_VAR_KEY in op.attr_names:
+                op_role_var = op.all_attrs()[OP_ROLE_VAR_KEY]
+                if len(op_role_var) != 0:
+                    assert len(op_role_var) % 2 == 0
+                    for i in range(0, len(op_role_var), 2):
+                        param, reduced_grad = op_role_var[i], op_role_var[i + 1]
+                        segment._allreduce_vars.append(reduced_grad)
+                        assert (
+                            reduced_grad not in self._reduced_grads_to_param)
+                        self._reduced_grads_to_param[reduced_grad] = param
+
+            # find cast op
+            if FP16Utils.is_fp16_cast_op(block, op, self._params):
+                fp32_param = op.desc.input_arg_names()[0]
+                fp16_param = op.desc.output_arg_names()[0]
+                if self._shard.has_param(fp32_param):
+                    segment._cast_ops[fp16_param] = fp32_param
+
+        if segment._param_mem > 0:
+            segment._start_idx = 0
+            self._segments.insert(0, segment)
+        return
+
+    def _prune_main_program(self, block):
+        """
+        Calculate dependencies from allreduce ops to optimize ops,
+        and remove the ops and vars that are not needed on this worker.
+        """
+        weightdecay_helper = WeightDecayHelper()
+        weightdecay_helper.prune_weight_decay(block, self._shard)
+        FP16Utils.prune_fp16(block, self._shard, self._reduced_grads_to_param,
+                             self._nrings)
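+        # The pruning runs in passes: weight-decay ops above, fp16
+        # cast/unscale ops here, gradient-clip ops next, and finally a
+        # generic dependency walk over whatever remains in the block.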
gradientclip_helper = GradientClipHelper() + gradientclip_helper.prune_gradient_clip(block, self._shard) + + # build prog deps + reduced_grads = [] + for idx, op in enumerate(block.ops): + input_names = op.desc.input_arg_names() + output_names = op.desc.output_arg_names() + if op.type == "c_allreduce_sum": + assert (len(output_names) == 1) + output_name = output_names[0] + reduced_grads.append(output_name) + + pruned_opti_vars = [] + for var_name in list(block.vars.keys()): + if self._shard.is_opti_var(var_name) and \ + not self._shard.has_opt_var(var_name): + pruned_opti_vars.append(var_name) + program_deps = ProgramDeps(block, reduced_grads, pruned_opti_vars) + + # Init + for var_name in program_deps._end_vars: + program_deps._should_removed_var.add(var_name) + + # Prune + for idx, op in reversed(list(enumerate(block.ops))): + if op.type in [ + "c_allreduce_sum", "c_sync_comm_stream", + "c_calc_comm_stream", "c_gen_nccl_id", "c_comm_init" + ]: + pass + elif op.type == "conditional_block": + assert (op.desc.has_attr("sub_block")) + subblock_idx = op.desc.attr("sub_block").id + subblock_deps = program_deps.get_sub_block_deps(subblock_idx) + # only prune amp subblock + if subblock_deps is None or not self._is_amp_subblock(op): + continue + # init + reversed_output_vars = [] + for output_name in op.desc.output("Out"): + if output_name in program_deps._should_removed_var: + subblock_deps._should_removed_var.add(output_name) + program_deps.crop_output_var_from_op(idx, output_name) + else: + reversed_output_vars.append(output_name) + # prune + for sub_op_idx, _ in reversed( + list(enumerate(subblock_deps._block.ops))): + if subblock_deps.should_remove_op(sub_op_idx): + subblock_deps.remove_op(sub_op_idx) + reversed_input_vars = [] + for input_name in op.desc.input('Input'): + if input_name not in subblock_deps._should_removed_var: + reversed_input_vars.append(input_name) + else: + program_deps.crop_input_var_from_op(idx, input_name) + op.desc.set_input('Input', reversed_input_vars) + op.desc.set_output('Out', reversed_output_vars) + else: + if program_deps.should_remove_op(idx): + program_deps.remove_op(idx) + + block._sync_with_cpp() + return + + def _add_broadcast_allreduce(self, block): + """ + _add_broadcast_allreduce + """ + ring_id = -1 + if len(self._segments) < 1: + return + + if self._segments[-1]._allreduce_vars: + insert_sync_comm_ops(block, self._segments[-1]._end_idx, + self._nrings, + self._segments[-1]._allreduce_vars) + insert_allreduce_ops(block, self._segments[-1]._end_idx, + self._nrings, + self._segments[-1]._allreduce_vars) + + for idx, segment in reversed(list(enumerate(self._segments))): + allreduce_vars = self._segments[ + idx - 1]._allreduce_vars if idx > 0 else [] + broadcast_vars = self._segments[idx + + 1]._broadcast_vars if idx < len( + self._segments) - 1 else [] + fill_constant_vars = self._segments[ + idx + 2]._fill_constant_vars if idx < len( + self._segments) - 2 else [] + cast_ops = self._segments[idx + 2]._cast_ops if idx < len( + self._segments) - 2 else {} + + for op_idx in reversed(range(segment._start_idx, segment._end_idx)): + op = block.ops[op_idx] + for input_name in op.desc.input_arg_names(): + if input_name in segment._param2broadcast and \ + input_name != segment._param2broadcast[input_name]: + op._rename_input(input_name, + segment._param2broadcast[input_name]) + + for param_name, broadcast_name in segment._param2broadcast.items(): + if param_name != broadcast_name: + block.create_var( + name=broadcast_name, + 
shape=self._main_program.global_block().var( + param_name).shape, + dtype=self._main_program.global_block().var(param_name) + .dtype, + persistable=False) + + # step1: remove cast ops + block._sync_with_cpp() + segment._end_idx += FP16Utils.remove_cast_op(block, self._params, + segment, 0) + + # step2: add Sync ops + comm_dep_vars = allreduce_vars + [x[0] for x in broadcast_vars] + if len(comm_dep_vars) > 0: + insert_sync_comm_ops( + block, + segment._end_idx, + self._nrings, + comm_dep_vars, ) + calc_dep_vars = fill_constant_vars + [ + k for k, v in cast_ops.items() + ] + self._segments[idx]._allreduce_vars + + if len(calc_dep_vars) > 0: + insert_sync_calc_op(block, segment._end_idx, + [calc_dep_vars[-1]]) + + # step3: insert `fill_constant` ops + insert_fill_constant_ops(block, segment._end_idx, + fill_constant_vars) + + # step4: add `cast` ops + insert_cast_ops(block, segment._end_idx, cast_ops) + + # step5: add broadcast ops + insert_broadcast_ops(block, segment._start_idx, self._nrings, + broadcast_vars) + + # step6: add all_reduce ops + insert_allreduce_ops(block, segment._start_idx, self._nrings, + allreduce_vars) + + block._sync_with_cpp() + + if self._segments[0]._broadcast_vars: + insert_sync_comm_ops( + block, self._segments[0]._start_idx, self._nrings, + [x[0] for x in self._segments[0]._broadcast_vars]) + insert_broadcast_ops(block, self._segments[0]._start_idx, + self._nrings, + self._segments[0]._broadcast_vars) + + fill_constant_vars = [] + for x in self._segments[:2]: + fill_constant_vars += x._fill_constant_vars + + # Join + cast_ops = {} + for x in self._segments[:2]: + for k, v in x._cast_ops.items(): + cast_ops[k] = v + + calc_deps_vars = fill_constant_vars + [k for k, v in cast_ops.items()] + if fill_constant_vars or cast_ops: + insert_sync_calc_op(block, self._segments[0]._start_idx, + [calc_deps_vars[-1]]) + + if fill_constant_vars: + insert_fill_constant_ops(block, self._segments[0]._start_idx, + fill_constant_vars) + + if cast_ops: + insert_cast_ops(block, self._segments[0]._start_idx, cast_ops) + + return + + def _prune_startup_program(self, block): + for idx, op in reversed(list(enumerate(block.ops))): + for output_name in op.desc.output_arg_names(): + if self._shard.has_var(output_name): + continue + #TODO why do we remove op, when only one var is removed + block._remove_op(idx, sync=False) + break + + for var_name in list(block.vars.keys()): + if self._shard.has_var(var_name): + continue + block._remove_var(var_name, sync=False) + block._sync_with_cpp() diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 505d6fef8fb53..f20716c3a1503 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -669,7 +669,7 @@ def append_gradient_clip_ops(param_grads): if g is None: continue with p.block.program._optimized_guard( - [p, g]), framework.name_scope('gradient_clip_@CLIP'): + [p, g]), framework.name_scope('gradient_clip'): clip_attr = getattr(p, 'gradient_clip_attr', None) if clip_attr is None: return param_grads @@ -685,7 +685,7 @@ def append_gradient_clip_ops(param_grads): if g is None: continue with p.block.program._optimized_guard( - [p, g]), framework.name_scope('graident_clip_@CLIP'): + [p, g]), framework.name_scope('gradient_clip'): param, new_grad = clip_attr._create_operators(param=p, grad=g) param_new_grad_name_dict[param.name] = new_grad.name res.append([param, new_grad]) diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index aaceb22b98dff..6be7fe0612e5a 100644 --- 
a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -2100,10 +2100,16 @@ def find_name(var_list, name):
                     % (out_proto.name, len(out_args)))
             out_arg_names = []
             for arg in out_args:
-                out_arg_names.append(cpt.to_text(arg.name))
+                if isinstance(arg, six.string_types):
+                    out_arg_names.append(arg)
+                else:
+                    out_arg_names.append(cpt.to_text(arg.name))
                 # TODO(minqiyang): could we remove variable's op in static mode?
                 if not in_dygraph_mode():
-                    arg.op = self
+                    if isinstance(arg, six.string_types):
+                        block.var(arg).op = self
+                    else:
+                        arg.op = self
             self.desc.set_output(out_proto.name, out_arg_names)
 
         if op_attrs is not None:
@@ -2837,8 +2843,9 @@ def _rename_var(self, name, new_name):
         self._sync_with_cpp()
         return var
 
-    def _remove_var(self, name):
-        self._sync_with_cpp()
+    def _remove_var(self, name, sync=True):
+        if sync == True:
+            self._sync_with_cpp()
         self.desc._remove_var(cpt.to_bytes(name))
         del self.vars[name]
 
@@ -2936,7 +2943,23 @@ def _insert_op(self, index, *args, **kwargs):
         self.ops.insert(index, op)
         return op
 
-    def _remove_op(self, index):
+    def _insert_op_without_sync(self, index, *args, **kwargs):
+        """
+        Insert an Operator according to the given arguments,
+        without sync_with_cpp, to make the compilation faster.
+
+        Args:
+            index(int): the index at which to insert the operator.
+
+        Returns:
+            Operator: the inserted Operator.
+        """
+        op_desc = self.desc._insert_op(index)
+        op = Operator(block=self, desc=op_desc, *args, **kwargs)
+        self.ops.insert(index, op)
+        return op
+
+    def _remove_op(self, index, sync=True):
         """
         Remove the specific position operator.
@@ -2946,7 +2969,8 @@ def _remove_op(self, index):
         Returns:
             None
         """
-        self._sync_with_cpp()
+        if sync == True:
+            self._sync_with_cpp()
         self.desc._remove_op(index, index + 1)
         del self.ops[index]
 
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 39e44f6aaa1ff..101242808b22f 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -41,6 +41,7 @@ list(APPEND MIXED_DIST_TEST_OPS test_fleet_recompute_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_pipeline_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_amp_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_gradient_merge_meta_optimizer)
+list(APPEND MIXED_DIST_TEST_OPS test_fleet_sharding_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_localsgd_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_lars_meta_optimizer)
 list(APPEND MIXED_DIST_TEST_OPS test_fleet_lamb_meta_optimizer)
@@ -461,6 +462,7 @@ if(WITH_DISTRIBUTE)
         py_test_modules(test_fleet_recompute_meta_optimizer MODULES test_fleet_recompute_meta_optimizer ENVS ${dist_ENVS})
         py_test_modules(test_fleet_graph_executor MODULES test_fleet_graph_executor ENVS ${dist_ENVS})
         py_test_modules(test_fleet_gradient_merge_meta_optimizer MODULES test_fleet_gradient_merge_meta_optimizer ENVS ${dist_ENVS})
+        py_test_modules(test_fleet_sharding_meta_optimizer MODULES test_fleet_sharding_meta_optimizer ENVS ${dist_ENVS})
         py_test_modules(test_fleet_amp_meta_optimizer MODULES test_fleet_amp_meta_optimizer ENVS ${dist_ENVS})
         py_test_modules(test_fleet_fp16_allreduce_meta_optimizer MODULES test_fleet_fp16_allreduce_meta_optimizer ENVS ${dist_ENVS})
         py_test_modules(test_fleet_pipeline_meta_optimizer MODULES test_fleet_pipeline_meta_optimizer ENVS ${dist_ENVS})
diff --git a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py
b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py index 48df06cddd934..b6ecc07fd9f89 100755 --- a/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py +++ b/python/paddle/fluid/tests/unittests/fleet_meta_optimizer_base.py @@ -55,14 +55,22 @@ def optimizer(self, strategy, train_prog, startup_prog, - name='momentum'): + name='momentum', + regularization=None, + grad_clip=None): with fluid.program_guard(train_prog, startup_prog): with fluid.unique_name.guard(): if name == 'momentum': optimizer = paddle.fluid.optimizer.Momentum( - learning_rate=0.01, momentum=0.9) + learning_rate=0.01, + momentum=0.9, + regularization=regularization, + grad_clip=grad_clip) elif name == 'adam': - optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) + optimizer = paddle.fluid.optimizer.Adam( + learning_rate=0.01, + regularization=regularization, + grad_clip=grad_clip) optimizer = fleet.distributed_optimizer( optimizer, strategy=strategy) optimizer.minimize(loss) @@ -121,5 +129,8 @@ def set_strategy(self, strategy, name): elif name == "gradient_merge": strategy.gradient_merge = True strategy.gradient_merge_configs = {"k_steps": 2, "avg": True} + elif name == "sharding": + strategy.sharding = True + strategy.sharding_configs = {"fuse_broadcast_MB": 0.2} else: raise NotImplementedError() diff --git a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py index 29eb3d9ab16ac..a40bc9a9fba6e 100644 --- a/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py +++ b/python/paddle/fluid/tests/unittests/test_fleet_gradient_merge_meta_optimizer.py @@ -32,9 +32,6 @@ def test_gradient_merge_optimizer(self): self.optimizer(avg_cost, strategy, train_prog, startup_prog) vars = [x.name for x in train_prog.list_vars()] - with open("main_program", 'w') as f: - f.write(str(train_prog)) - self.assertIn('@GradientMerge', ''.join(vars)) def test_recom_gm_optimizer(self): diff --git a/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py new file mode 100644 index 0000000000000..6a9f3e3ba7bf3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_fleet_sharding_meta_optimizer.py @@ -0,0 +1,275 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
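The tests below drive the new strategy end to end. For orientation, a hedged
sketch of how user code would switch sharding on; it mirrors the set_strategy
branch added above and assumes a collective multi-process launch (e.g. via
paddle.distributed.launch) plus some scalar loss avg_cost:

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init(is_collective=True)

strategy = paddle.distributed.fleet.DistributedStrategy()
strategy.sharding = True
strategy.sharding_configs = {"fuse_broadcast_MB": 32}  # segment size knob

optimizer = paddle.fluid.optimizer.Momentum(learning_rate=0.01, momentum=0.9)
optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy)
# optimizer.minimize(avg_cost)  # avg_cost: the network's mean loss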
+ +import unittest +import paddle +import os +import paddle.distributed.fleet as fleet +import paddle.distributed.fleet.base.role_maker as role_maker + +from fleet_meta_optimizer_base import TestFleetMetaOptimizer + +paddle.enable_static() + + +class TestFleetShardingMetaOptimizer(TestFleetMetaOptimizer): + def test_sharding_optimizer(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'sharding') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + parameters = [ + x.name for x in train_prog.list_vars() if x.persistable == True + ] + ops = [op.type for op in avg_cost.block.ops] + vars = [x.name for x in train_prog.list_vars()] + self.assertIn('@BroadCast', ''.join(vars)) + self.assertEqual( + set(parameters), + set([ + "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", + "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0" + ])) + self.assertEqual(ops, [ + 'fill_constant', 'fill_constant', 'fill_constant', + 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', + 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', + 'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_sync_comm_stream', 'momentum', 'momentum', 'momentum' + ]) + + def test_sharding_amp_optimizer(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'sharding') + self.set_strategy(strategy, 'amp') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + ops = [op.type for op in avg_cost.block.ops] + vars = [x.name for x in train_prog.list_vars()] + parameters = [ + x.name for x in train_prog.list_vars() if x.persistable == True + ] + self.assertIn('@BroadCast', ''.join(vars)) + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + self.assertEqual( + set(parameters), + set([ + "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", + "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0", + "loss_scaling_0", "num_bad_steps_0", "num_good_steps_0" + ])) + self.assertEqual(ops, [ + 'cast', 'cast', 'cast', 'fill_constant', 'fill_constant', + 'fill_constant', 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_sync_comm_stream', 'cast', 'mul', 'elementwise_add', 'cast', + 'tanh', 'cast', 'mul', 'elementwise_add', 'cast', 'tanh', 'cast', + 'mul', 'elementwise_add', 'softmax', 'cast', 'cross_entropy2', + 'mean', 'elementwise_mul', 'fill_constant', 'scale', + 'elementwise_mul_grad', 'mean_grad', 'cross_entropy_grad2', 'cast', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', 'cast', + 'tanh_grad', 'cast', 'elementwise_add_grad', 'mul_grad', + 'c_sync_calc_stream', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_allreduce_sum', 'c_sync_comm_stream', 'cast', 'cast', 
'cast', + 'check_finite_and_unscale', 'cast', 'c_sync_calc_stream', + 'c_allreduce_max', 'c_sync_comm_stream', 'cast', + 'update_loss_scaling', 'momentum', 'momentum', 'momentum' + ]) + + def test_sharding_recompute_optimizer(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'sharding') + self.set_strategy(strategy, 'recompute') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + vars = [x.name for x in train_prog.list_vars()] + parameters = [ + x.name for x in train_prog.list_vars() if x.persistable == True + ] + + self.assertIn('@BroadCast', ''.join(vars)) + self.assertIn('subprog', ''.join(vars)) + self.assertEqual( + set(parameters), + set([ + "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", + "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0" + ])) + self.assertEqual(ops, [ + 'fill_constant', 'fill_constant', 'fill_constant', + 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', + 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', + 'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'mul', + 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', 'mul_grad', + 'mul', 'elementwise_add', 'tanh_grad', 'elementwise_add_grad', + 'mul_grad', 'c_sync_calc_stream', 'c_allreduce_sum', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_sync_comm_stream', + 'momentum', 'momentum', 'momentum' + ]) + + def test_sharding_amp_recompute_optimizer(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'sharding') + self.set_strategy(strategy, 'recompute') + self.set_strategy(strategy, 'amp') + self.optimizer(avg_cost, strategy, train_prog, startup_prog) + + ops = [op.type for op in avg_cost.block.ops] + vars = [x.name for x in train_prog.list_vars()] + parameters = [ + x.name for x in train_prog.list_vars() if x.persistable == True + ] + + self.assertIn('@BroadCast', ''.join(vars)) + self.assertIn('subprog', ''.join(vars)) + self.assertIn('cast', ops) + self.assertIn('check_finite_and_unscale', ops) + + self.assertEqual( + set(parameters), + set([ + "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", + "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0", + "loss_scaling_0", "num_bad_steps_0", "num_good_steps_0" + ])) + + self.assertEqual(ops, [ + 'cast', 'cast', 'cast', 'fill_constant', 'fill_constant', + 'fill_constant', 'fill_constant', 'fill_constant', + 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', 'cast', 'cast', + 'mul', 'cast', 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', + 'elementwise_add', 'cast', 'tanh', 'cast', 'mul', 'elementwise_add', + 'softmax', 'cast', 'cross_entropy2', 'mean', 'elementwise_mul', + 'fill_constant', 'scale', 'elementwise_mul_grad', 'mean_grad', + 'cross_entropy_grad2', 'cast', 'softmax_grad', + 'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', 'cast', + 'elementwise_add', 'cast', 'tanh_grad', 'cast', + 
'elementwise_add_grad', 'mul_grad', 'cast', 'cast', 'mul', 'cast', + 'elementwise_add', 'cast', 'tanh_grad', 'cast', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_sync_comm_stream', 'cast', 'cast', 'cast', + 'check_finite_and_unscale', 'cast', 'c_sync_calc_stream', + 'c_allreduce_max', 'c_sync_comm_stream', 'cast', + 'update_loss_scaling', 'momentum', 'momentum', 'momentum' + ]) + + def test_sharding_weight_decay(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'sharding') + regularization = paddle.fluid.regularizer.L2Decay(0.0001) + self.optimizer( + avg_cost, + strategy, + train_prog, + startup_prog, + regularization=regularization) + parameters = [ + x.name for x in train_prog.list_vars() if x.persistable == True + ] + ops = [op.type for op in avg_cost.block.ops] + vars = [x.name for x in train_prog.list_vars()] + self.assertIn('@BroadCast', ''.join(vars)) + self.assertEqual( + set(parameters), + set([ + "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", + "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0" + ])) + + self.assertEqual(ops, [ + 'fill_constant', 'fill_constant', 'fill_constant', + 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', + 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', + 'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_sync_comm_stream', 'scale', 'sum', 'scale', 'sum', 'scale', + 'sum', 'momentum', 'momentum', 'momentum' + ]) + + def test_sharding_gradient_clip(self): + train_prog, startup_prog = paddle.fluid.Program(), paddle.fluid.Program( + ) + avg_cost, strategy = self.net(train_prog, startup_prog) + self.set_strategy(strategy, 'sharding') + clip = paddle.fluid.clip.GradientClipByGlobalNorm(clip_norm=1.0) + self.optimizer( + avg_cost, strategy, train_prog, startup_prog, grad_clip=clip) + parameters = [ + x.name for x in train_prog.list_vars() if x.persistable == True + ] + ops = [op.type for op in avg_cost.block.ops] + vars = [x.name for x in train_prog.list_vars()] + self.assertIn('@BroadCast', ''.join(vars)) + self.assertEqual( + set(parameters), + set([ + "fc_1.b_0", "fc_2.b_0", "fc_2.w_0", "fc_1.b_0_velocity_0", + "fc_2.b_0_velocity_0", "fc_2.w_0_velocity_0", "learning_rate_0" + ])) + self.assertEqual(ops, [ + 'fill_constant', 'fill_constant', 'fill_constant', + 'c_sync_calc_stream', 'c_broadcast', 'c_broadcast', 'c_broadcast', + 'c_broadcast', 'c_broadcast', 'c_broadcast', 'c_sync_comm_stream', + 'mul', 'elementwise_add', 'tanh', 'mul', 'elementwise_add', 'tanh', + 'mul', 'elementwise_add', 'softmax', 'cross_entropy2', 'mean', + 'fill_constant', 'scale', 'mean_grad', 'cross_entropy_grad2', + 'softmax_grad', 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'tanh_grad', + 'elementwise_add_grad', 'mul_grad', 'c_sync_calc_stream', + 'c_allreduce_sum', 
'c_allreduce_sum', 'c_allreduce_sum', + 'c_allreduce_sum', 'c_allreduce_sum', 'c_allreduce_sum', + 'c_sync_comm_stream', 'square', 'reduce_sum', 'square', + 'reduce_sum', 'square', 'reduce_sum', 'sum', 'c_sync_calc_stream', + 'c_allreduce_sum', 'c_sync_comm_stream', 'sqrt', 'fill_constant', + 'elementwise_max', 'elementwise_div', 'elementwise_mul', + 'elementwise_mul', 'elementwise_mul', 'momentum', 'momentum', + 'momentum' + ]) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/setup.py.in b/python/setup.py.in index f09c189a68e1c..f9395f8dd318b 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -148,6 +148,7 @@ packages=['paddle', 'paddle.distributed.fleet', 'paddle.distributed.fleet.base', 'paddle.distributed.fleet.meta_optimizers', + 'paddle.distributed.fleet.meta_optimizers.sharding', 'paddle.distributed.fleet.runtime', 'paddle.distributed.fleet.dataset', 'paddle.distributed.fleet.data_generator', From 11089cacdb802e8f35bcbcbf01b3b959ef77d1aa Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Mon, 26 Oct 2020 10:22:37 +0800 Subject: [PATCH 046/185] Fix xpu notest (#28204) * Fix xpu notest;test=kunlun * fix * test=kunlun * test=kunlun --- paddle/scripts/paddle_build.sh | 1 + .../fluid/tests/unittests/xpu/CMakeLists.txt | 3 +++ .../unittests/xpu/test_batch_norm_op_xpu.py | 16 +++++++++------- .../tests/unittests/xpu/test_mean_op_xpu.py | 4 ++-- 4 files changed, 15 insertions(+), 9 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 711d6564cfa63..87fb6628f4223 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -1200,6 +1200,7 @@ EOF set +x ut_startTime_s=`date +%s` test_cases=$(ctest -N -V | grep "_xpu" ) # cases list which would be run exclusively + get_quickly_disable_ut||disable_ut_quickly='' # indicate whether the case was in quickly disable list while read -r line; do if [[ "$line" == "" ]]; then continue diff --git a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt index f71e04c09aa38..6ac4b93bf6d66 100644 --- a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt @@ -1,6 +1,9 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") +list(REMOVE_ITEM TEST_OPS test_concat_op_xpu) +list(REMOVE_ITEM TEST_OPS test_mean_op_xpu) + foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py index 0d9387d6b75a7..1cdec863b2ac3 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_batch_norm_op_xpu.py @@ -159,13 +159,15 @@ def set_attrs(self): def test_infer(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_np.shape, self.x_np.dtype) - scale = paddle.data('Scale', self.scale_np.shape, - self.scale_np.dtype) - bias = paddle.data('Bias', self.bias_np.shape, self.bias_np.dtype) - mean = paddle.data('Mean', self.mean_np.shape, self.mean_np.dtype) - variance = paddle.data('Variance', self.variance_np.shape, - self.variance_np.dtype) + x = paddle.fluid.data('X', self.x_np.shape, self.x_np.dtype) + scale = paddle.fluid.data('Scale', self.scale_np.shape, + 
self.scale_np.dtype) + bias = paddle.fluid.data('Bias', self.bias_np.shape, + self.bias_np.dtype) + mean = paddle.fluid.data('Mean', self.mean_np.shape, + self.mean_np.dtype) + variance = paddle.fluid.data('Variance', self.variance_np.shape, + self.variance_np.dtype) y = F.batch_norm(x, mean, variance, scale, bias, False, self.momentum, self.epsilon, self.data_layout) exe = paddle.static.Executor(self.place) diff --git a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py index f43516235c057..3ebdd110d32cc 100644 --- a/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py +++ b/python/paddle/fluid/tests/unittests/xpu/test_mean_op_xpu.py @@ -88,7 +88,7 @@ def setUp(self): def test_api_static(self): paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', self.x_shape) + x = paddle.fluid.data('X', self.x_shape) out1 = paddle.mean(x) out2 = paddle.tensor.mean(x) out3 = paddle.tensor.stat.mean(x) @@ -136,7 +136,7 @@ def test_errors(self): self.assertRaises(Exception, paddle.mean, x, 2) paddle.enable_static() with paddle.static.program_guard(paddle.static.Program()): - x = paddle.data('X', [10, 12], 'int32') + x = paddle.fluid.data('X', [10, 12], 'int32') self.assertRaises(TypeError, paddle.mean, x) From 95ac49c346617242ac32218b3eeb8564942f5118 Mon Sep 17 00:00:00 2001 From: joejiong Date: Mon, 26 Oct 2020 10:33:45 +0800 Subject: [PATCH 047/185] add new dockerfile for paddle and cinn setup (#28225) --- .../Dockerfile.cuda10_ubuntu18_cinn | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100755 tools/manylinux1/Dockerfile.cuda10_ubuntu18_cinn diff --git a/tools/manylinux1/Dockerfile.cuda10_ubuntu18_cinn b/tools/manylinux1/Dockerfile.cuda10_ubuntu18_cinn new file mode 100755 index 0000000000000..964f082b56137 --- /dev/null +++ b/tools/manylinux1/Dockerfile.cuda10_ubuntu18_cinn @@ -0,0 +1,152 @@ +# A image for building paddle binaries +# Use cuda devel base image for both cpu and gpu environment +# When you modify it, please be aware of cudnn-runtime version +FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 +MAINTAINER PaddlePaddle Authors + +# ENV variables +ARG WITH_GPU +ARG WITH_AVX + +ENV WITH_GPU=${WITH_GPU:-ON} +ENV WITH_AVX=${WITH_AVX:-ON} +ENV DEBIAN_FRONTEND=noninteractive + +ENV HOME /root +# Add bash enhancements +COPY paddle/scripts/docker/root/ /root/ + +RUN apt-get update && \ + apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y curl wget vim git unzip unrar tar xz-utils bzip2 gzip \ + coreutils ntp language-pack-zh-hans python-qt4 libsm6 libxext6 libxrender-dev + + +# Downgrade gcc&&g++ +WORKDIR /usr/bin + RUN apt-get update --fix-missing + COPY tools/dockerfile/build_scripts /build_scripts + RUN bash /build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts + RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ + ENV PATH=/usr/local/gcc-8.2/bin:$PATH + +RUN apt-get update && \ + apt-get install -y python2.7 python2.7-dev \ + python3.5 python3.5-dev \ + python3.6 python3.6-dev \ + python3.7 python3.7-dev \ + python3.8 python3.8-dev && \ + curl https://bootstrap.pypa.io/ez_setup.py -o - | python2.7 && easy_install pip && \ + curl 
https://bootstrap.pypa.io/ez_setup.py -o - | python3.5 && easy_install pip && \ + curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.6 && easy_install pip && \ + curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.7 && easy_install pip && \ + curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.8 && easy_install pip && \ + rm /usr/bin/python && ln -s /usr/bin/python2.7 /usr/bin/python && \ + rm /usr/bin/python3 && ln -s /usr/bin/python3.5 /usr/bin/python3 && \ + rm /usr/local/bin/pip && ln -s /usr/local/bin/pip2.7 /usr/local/bin/pip && \ + rm /usr/local/bin/pip3 && ln -s /usr/local/bin/pip3.5 /usr/local/bin/pip3 + + +# install cmake +WORKDIR /home +RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz +ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH + + +# remove them when apt-get support 2.27 and higher version +RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \ + tar -xzf binutils-2.33.1.tar.gz && \ + cd binutils-2.33.1 && \ + ./configure && make -j && make install && cd .. && rm -rf binutils-2.33.1 binutils-2.33.1.tar.gz + + +# Install Go and glide +RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +RUN pip3 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip3.6 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3.6 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip3.7 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3.7 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip3.8 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3.8 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip --no-cache-dir install ipykernel==4.6.0 wheel + +#For docstring checker +RUN pip3 --no-cache-dir install pylint pytest astroid isort && \ + pip3.6 --no-cache-dir install pylint pytest astroid isort && \ + pip3.7 --no-cache-dir install pylint pytest astroid isort && \ + pip3.8 --no-cache-dir install pylint pytest astroid isort && \ + pip --no-cache-dir install pylint pytest astroid isort + +COPY ./python/requirements.txt /root/ +RUN pip3 --no-cache-dir install -r /root/requirements.txt && \ + pip3.6 --no-cache-dir install -r /root/requirements.txt && \ + pip3.7 --no-cache-dir install -r /root/requirements.txt && \ + pip3.8 --no-cache-dir install -r /root/requirements.txt && \ + pip --no-cache-dir install -r /root/requirements.txt + + +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# So install a newer version here. 
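+# (patchelf rewrites rpaths in the shared libraries bundled into the wheel;
+# the 0.10 release lifts the file-size limit mentioned above.)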
+RUN wget -q http://mirrors.kernel.org/ubuntu/pool/universe/p/patchelf/patchelf_0.10-2_amd64.deb && \ + dpkg -i patchelf_0.10-2_amd64.deb + +# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service +#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +#CMD source ~/.bashrc + +# ccache 3.7.9 +RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ + tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \ + ./configure -prefix=/usr/local/ccache-3.7.9 && \ + make -j8 && make install && \ + ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + +# For CINN environment +RUN apt update --fix-missing +RUN apt-get install autoconf autogen +RUN apt-get install libtool +RUN apt-get install zlib1g-dev +RUN apt install libginac-dev -y +RUN apt install clang cmake -y +RUN python3 -m pip install numpy +RUN python3 -m pip install pybind11 + + +# Install LLVM +RUN echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main" >> /etc/apt/source.list +RUN echo "deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic main" >> /etc/apt/source.list +RUN echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main" >> /etc/apt/source.list +RUN echo "deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main" >> /etc/apt/source.list +RUN ln -s /usr/bin/llvm-config-6.0 /usr/bin/llvm-config +RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add - + +RUN apt update +RUN apt install libclang-dev llvm-10 llvm-10-dev libclang-10-dev -y + + +EXPOSE 22 From f4f823c860367fcbcd471c1ec1cdb7e455ec47b7 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Sun, 25 Oct 2020 22:09:34 -0500 Subject: [PATCH 048/185] Refine the format of printing tensor 2 (#28216) * refine format * update doc * handle uninitialized tensor * add ut --- .../fluid/tests/unittests/test_var_base.py | 34 ++++++++ python/paddle/tensor/to_string.py | 79 +++++++++++-------- 2 files changed, 80 insertions(+), 33 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 2df24b00797c1..ecbf2415247b1 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -444,6 +444,40 @@ def test_tensor_str(self): self.assertEqual(a_str, expected) paddle.enable_static() + def test_tensor_str2(self): + paddle.disable_static(paddle.CPUPlace()) + a = paddle.to_tensor([[1.5111111, 1.0], [0, 0]]) + a_str = str(a) + + if six.PY2: + expected = '''Tensor(shape=[2L, 2L], dtype=float32, place=CPUPlace, stop_gradient=True, + [[1.5111, 1. ], + [0. , 0. ]])''' + else: + expected = '''Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=True, + [[1.5111, 1. ], + [0. , 0. ]])''' + + self.assertEqual(a_str, expected) + paddle.enable_static() + + def test_tensor_str3(self): + paddle.disable_static(paddle.CPUPlace()) + a = paddle.to_tensor([[-1.5111111, 1.0], [0, -0.5]]) + a_str = str(a) + + if six.PY2: + expected = '''Tensor(shape=[2L, 2L], dtype=float32, place=CPUPlace, stop_gradient=True, + [[-1.5111, 1. ], + [ 0. , -0.5000]])''' + else: + expected = '''Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=True, + [[-1.5111, 1. ], + [ 0. 
, -0.5000]])''' + + self.assertEqual(a_str, expected) + paddle.enable_static() + class TestVarBaseSetitem(unittest.TestCase): def setUp(self): diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index c56c1baa7a8e9..bd956b923a663 100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -58,18 +58,14 @@ def set_printoptions(precision=None, print(a) ''' - Tensor: dygraph_tmp_0 - - place: CPUPlace - - shape: [10, 20] - - layout: NCHW - - dtype: float32 - - data: [[0.2727, 0.5489, 0.8655, ..., 0.2916, 0.8525, 0.9000], - [0.3806, 0.8996, 0.0928, ..., 0.9535, 0.8378, 0.6409], - [0.1484, 0.4038, 0.8294, ..., 0.0148, 0.6520, 0.4250], + Tensor(shape=[10, 20], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + [[0.0002, 0.8503, 0.0135, ..., 0.9508, 0.2621, 0.6661], + [0.9710, 0.2605, 0.9950, ..., 0.4427, 0.9241, 0.9363], + [0.0948, 0.3226, 0.9955, ..., 0.1198, 0.0889, 0.9231], ..., - [0.3426, 0.1909, 0.7240, ..., 0.4218, 0.2676, 0.5679], - [0.5561, 0.2081, 0.0676, ..., 0.9778, 0.3302, 0.9559], - [0.2665, 0.8483, 0.5389, ..., 0.4956, 0.6862, 0.9178]] + [0.7206, 0.0941, 0.5292, ..., 0.4856, 0.1379, 0.0351], + [0.1745, 0.5621, 0.3602, ..., 0.2998, 0.4011, 0.1764], + [0.0728, 0.7786, 0.0314, ..., 0.2583, 0.1654, 0.0637]]) ''' """ kwargs = {} @@ -101,7 +97,7 @@ def _to_sumary(var): return var elif len(var.shape) == 1: if var.shape[0] > 2 * edgeitems: - return paddle.concat([var[:edgeitems], var[-edgeitems:]]) + return np.concatenate([var[:edgeitems], var[-edgeitems:]]) else: return var else: @@ -109,12 +105,12 @@ def _to_sumary(var): if var.shape[0] > 2 * edgeitems: begin = [x for x in var[:edgeitems]] end = [x for x in var[-edgeitems:]] - return paddle.stack([_to_sumary(x) for x in (begin + end)]) + return np.stack([_to_sumary(x) for x in (begin + end)]) else: - return paddle.stack([_to_sumary(x) for x in var]) + return np.stack([_to_sumary(x) for x in var]) -def _format_item(np_var, max_width=0): +def _format_item(np_var, max_width=0, signed=False): if np_var.dtype == np.float32 or np_var.dtype == np.float64 or np_var.dtype == np.float16: if DEFAULT_PRINT_OPTIONS.sci_mode: item_str = '{{:.{}e}}'.format( @@ -128,54 +124,66 @@ def _format_item(np_var, max_width=0): item_str = '{}'.format(np_var) if max_width > len(item_str): - return '{indent}{data}'.format( - indent=(max_width - len(item_str)) * ' ', data=item_str) - else: + if signed: # handle sign character for tenosr with negative item + if np_var < 0: + return item_str.ljust(max_width) + else: + return ' ' + item_str.ljust(max_width - 1) + else: + return item_str.ljust(max_width) + else: # used for _get_max_width return item_str def _get_max_width(var): max_width = 0 - for item in list(var.numpy().flatten()): + signed = False + for item in list(var.flatten()): + if (not signed) and (item < 0): + signed = True item_str = _format_item(item) max_width = max(max_width, len(item_str)) - return max_width + return max_width, signed -def _format_tensor(var, sumary, indent=0): + +def _format_tensor(var, sumary, indent=0, max_width=0, signed=False): edgeitems = DEFAULT_PRINT_OPTIONS.edgeitems - max_width = _get_max_width(_to_sumary(var)) if len(var.shape) == 0: # currently, shape = [], i.e., scaler tensor is not supported. # If it is supported, it should be formatted like this. 
- return _format_item(var.numpy().item(0), max_width) + return _format_item(var.item(0), max_width, signed) elif len(var.shape) == 1: if sumary and var.shape[0] > 2 * edgeitems: items = [ - _format_item(item, max_width) - for item in list(var.numpy())[:DEFAULT_PRINT_OPTIONS.edgeitems] + _format_item(item, max_width, signed) + for item in list(var)[:DEFAULT_PRINT_OPTIONS.edgeitems] ] + ['...'] + [ - _format_item(item, max_width) - for item in list(var.numpy())[-DEFAULT_PRINT_OPTIONS.edgeitems:] + _format_item(item, max_width, signed) + for item in list(var)[-DEFAULT_PRINT_OPTIONS.edgeitems:] ] else: items = [ - _format_item(item, max_width) for item in list(var.numpy()) + _format_item(item, max_width, signed) for item in list(var) ] - s = ', '.join(items) return '[' + s + ']' else: # recursively handle all dimensions if sumary and var.shape[0] > 2 * edgeitems: vars = [ - _format_tensor(x, sumary, indent + 1) for x in var[:edgeitems] + _format_tensor(x, sumary, indent + 1, max_width, signed) + for x in var[:edgeitems] ] + ['...'] + [ - _format_tensor(x, sumary, indent + 1) for x in var[-edgeitems:] + _format_tensor(x, sumary, indent + 1, max_width, signed) + for x in var[-edgeitems:] ] else: - vars = [_format_tensor(x, sumary, indent + 1) for x in var] + vars = [ + _format_tensor(x, sumary, indent + 1, max_width, signed) + for x in var + ] return '[' + (',' + '\n' * (len(var.shape) - 1) + ' ' * (indent + 1)).join(vars) + ']' @@ -190,6 +198,8 @@ def to_string(var, prefix='Tensor'): if not tensor._is_initialized(): return "Tensor(Not initialized)" + np_var = var.numpy() + if len(var.shape) == 0: size = 0 else: @@ -201,7 +211,10 @@ def to_string(var, prefix='Tensor'): if size > DEFAULT_PRINT_OPTIONS.threshold: sumary = True - data = _format_tensor(var, sumary, indent=indent) + max_width, signed = _get_max_width(_to_sumary(np_var)) + + data = _format_tensor( + np_var, sumary, indent=indent, max_width=max_width, signed=signed) return _template.format( prefix=prefix, From d252219779bf1415f62c717191046b745d29dc13 Mon Sep 17 00:00:00 2001 From: XiaoguangHu <46782768+XiaoguangHu01@users.noreply.github.com> Date: Mon, 26 Oct 2020 11:40:20 +0800 Subject: [PATCH 049/185] add git mirror url to speed up clone (#28241) --- CMakeLists.txt | 7 +++++++ cmake/external/brpc.cmake | 2 +- cmake/external/cryptopp.cmake | 4 ++-- cmake/external/cub.cmake | 2 +- cmake/external/dlpack.cmake | 2 +- cmake/external/eigen.cmake | 4 ++-- cmake/external/gflags.cmake | 2 +- cmake/external/glog.cmake | 2 +- cmake/external/gloo.cmake | 2 +- cmake/external/gtest.cmake | 2 +- cmake/external/leveldb.cmake | 2 +- cmake/external/libxsmm.cmake | 2 +- cmake/external/lite.cmake | 4 ++-- cmake/external/mkldnn.cmake | 2 +- cmake/external/openblas.cmake | 2 +- cmake/external/protobuf.cmake | 2 +- cmake/external/pybind11.cmake | 2 +- cmake/external/rocprim.cmake | 2 +- cmake/external/threadpool.cmake | 2 +- cmake/external/warpctc.cmake | 2 +- cmake/external/xbyak.cmake | 2 +- cmake/external/xxhash.cmake | 2 +- cmake/external/zlib.cmake | 2 +- 23 files changed, 32 insertions(+), 25 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 1a8eef5e66b1c..80820c6487c50 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -98,6 +98,13 @@ else(WIN32) endif(WIN32) find_package(Git REQUIRED) + +# config GIT_URL with github mirrors to speed up dependent repos clone +option(GIT_URL "Git URL to clone dependent repos" ${GIT_URL}) +if(NOT GIT_URL) + set(GIT_URL "https://github.com") +endif() + find_package(Threads REQUIRED) include(simd) diff 
--git a/cmake/external/brpc.cmake b/cmake/external/brpc.cmake index ec42eaa759dbc..064e35112ff6f 100644 --- a/cmake/external/brpc.cmake +++ b/cmake/external/brpc.cmake @@ -40,7 +40,7 @@ ExternalProject_Add( extern_brpc ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} - GIT_REPOSITORY "https://github.com/apache/incubator-brpc.git" + GIT_REPOSITORY "${GIT_URL}/apache/incubator-brpc.git" GIT_TAG "ad00fe940b4f05225b214131959293bbed8744a0" #rdma branch's head now. PREFIX ${BRPC_SOURCES_DIR} UPDATE_COMMAND "" diff --git a/cmake/external/cryptopp.cmake b/cmake/external/cryptopp.cmake index 351ef1c7c7aeb..3176e2a665c63 100644 --- a/cmake/external/cryptopp.cmake +++ b/cmake/external/cryptopp.cmake @@ -17,7 +17,7 @@ INCLUDE(ExternalProject) SET(CRYPTOPP_PREFIX_DIR ${THIRD_PARTY_PATH}/cryptopp) SET(CRYPTOPP_INSTALL_DIR ${THIRD_PARTY_PATH}/install/cryptopp) SET(CRYPTOPP_INCLUDE_DIR "${CRYPTOPP_INSTALL_DIR}/include" CACHE PATH "cryptopp include directory." FORCE) -SET(CRYPTOPP_REPOSITORY https://github.com/weidai11/cryptopp.git) +SET(CRYPTOPP_REPOSITORY ${GIT_URL}/weidai11/cryptopp.git) SET(CRYPTOPP_TAG CRYPTOPP_8_2_0) IF(WIN32) @@ -55,7 +55,7 @@ ExternalProject_Add( SOURCE_DIR ${CRYPTOPP_SOURCE_DIR} PATCH_COMMAND COMMAND ${CMAKE_COMMAND} -E remove_directory "/cmake/" - COMMAND git clone https://github.com/noloader/cryptopp-cmake "/cmake" + COMMAND git clone ${GIT_URL}/noloader/cryptopp-cmake "/cmake" COMMAND cd "/cmake" && git checkout tags/${CRYPTOPP_TAG} -b ${CRYPTOPP_TAG} COMMAND ${CMAKE_COMMAND} -E copy_directory "/cmake/" "/" INSTALL_DIR ${CRYPTOPP_INSTALL_DIR} diff --git a/cmake/external/cub.cmake b/cmake/external/cub.cmake index 6f790f1af8e1a..a26568860f42d 100644 --- a/cmake/external/cub.cmake +++ b/cmake/external/cub.cmake @@ -16,7 +16,7 @@ include(ExternalProject) set(CUB_PREFIX_DIR ${THIRD_PARTY_PATH}/cub) set(CUB_SOURCE_DIR ${THIRD_PARTY_PATH}/cub/src/extern_cub) -set(CUB_REPOSITORY https://github.com/NVlabs/cub.git) +set(CUB_REPOSITORY ${GIT_URL}/NVlabs/cub.git) set(CUB_TAG 1.8.0) cache_third_party(extern_cub diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake index a5144f8e463d2..87db181d953af 100644 --- a/cmake/external/dlpack.cmake +++ b/cmake/external/dlpack.cmake @@ -17,7 +17,7 @@ include(ExternalProject) set(DLPACK_PREFIX_DIR ${THIRD_PARTY_PATH}/dlpack) set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack/src/extern_dlpack) -set(DLPACK_REPOSITORY https://github.com/dmlc/dlpack.git) +set(DLPACK_REPOSITORY ${GIT_URL}/dmlc/dlpack.git) set(DLPACK_TAG v0.2) cache_third_party(extern_dlpack diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 631803da31d5a..b1e3897891027 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -22,14 +22,14 @@ set(EIGEN_TAG 4da2c6b1974827b1999bab652a3d4703e1992d26) # the recent version of eigen will cause compilation error on windows if(WIN32) - set(EIGEN_REPOSITORY https://github.com/eigenteam/eigen-git-mirror.git) + set(EIGEN_REPOSITORY ${GIT_URL}/eigenteam/eigen-git-mirror.git) set(EIGEN_TAG 917060c364181f33a735dc023818d5a54f60e54c) endif() # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen if(WITH_AMD_GPU) - set(EIGEN_REPOSITORY https://github.com/sabreshao/hipeigen.git) + set(EIGEN_REPOSITORY ${GIT_URL}/sabreshao/hipeigen.git) set(EIGEN_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e) endif() diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 2842d73081b7b..a077c8061b1a2 
100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -18,7 +18,7 @@ SET(GFLAGS_PREFIX_DIR ${THIRD_PARTY_PATH}/gflags) SET(GFLAGS_SOURCE_DIR ${THIRD_PARTY_PATH}/gflags/src/extern_gflags) SET(GFLAGS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gflags) SET(GFLAGS_INCLUDE_DIR "${GFLAGS_INSTALL_DIR}/include" CACHE PATH "gflags include directory." FORCE) -set(GFLAGS_REPOSITORY https://github.com/gflags/gflags.git) +set(GFLAGS_REPOSITORY ${GIT_URL}/gflags/gflags.git) set(GFLAGS_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a) IF(WIN32) set(GFLAGS_LIBRARIES "${GFLAGS_INSTALL_DIR}/lib/gflags_static.lib" CACHE FILEPATH "GFLAGS_LIBRARIES" FORCE) diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index f66ecce18a047..649152bd43636 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -18,7 +18,7 @@ SET(GLOG_PREFIX_DIR ${THIRD_PARTY_PATH}/glog) SET(GLOG_SOURCE_DIR ${THIRD_PARTY_PATH}/glog/src/extern_glog) SET(GLOG_INSTALL_DIR ${THIRD_PARTY_PATH}/install/glog) SET(GLOG_INCLUDE_DIR "${GLOG_INSTALL_DIR}/include" CACHE PATH "glog include directory." FORCE) -SET(GLOG_REPOSITORY https://github.com/google/glog.git) +SET(GLOG_REPOSITORY ${GIT_URL}/google/glog.git) SET(GLOG_TAG v0.3.5) IF(WIN32) diff --git a/cmake/external/gloo.cmake b/cmake/external/gloo.cmake index ace71a7f63413..ea7af315e1a69 100644 --- a/cmake/external/gloo.cmake +++ b/cmake/external/gloo.cmake @@ -21,7 +21,7 @@ SET(GLOO_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gloo) SET(GLOO_INCLUDE_DIR "${GLOO_INSTALL_DIR}/include" CACHE PATH "gloo include directory." FORCE) SET(GLOO_LIBRARY_DIR "${GLOO_INSTALL_DIR}/lib" CACHE PATH "gloo library directory." FORCE) # As we add extra features for gloo, we use the non-official repo -SET(GLOO_REPOSITORY https://github.com/sandyhouse/gloo.git) +SET(GLOO_REPOSITORY ${GIT_URL}/sandyhouse/gloo.git) SET(GLOO_TAG v0.0.2) SET(GLOO_LIBRARIES "${GLOO_INSTALL_DIR}/lib/libgloo.a" CACHE FILEPATH "gloo library." FORCE) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index 60061c44b78eb..3db12f084eb5a 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -25,7 +25,7 @@ SET(GTEST_PREFIX_DIR ${THIRD_PARTY_PATH}/gtest) SET(GTEST_SOURCE_DIR ${THIRD_PARTY_PATH}/gtest/src/extern_gtest) SET(GTEST_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gtest) SET(GTEST_INCLUDE_DIR "${GTEST_INSTALL_DIR}/include" CACHE PATH "gtest include directory." 
FORCE) -set(GTEST_REPOSITORY https://github.com/google/googletest.git) +set(GTEST_REPOSITORY ${GIT_URL}/google/googletest.git) set(GTEST_TAG release-1.8.1) INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIR}) diff --git a/cmake/external/leveldb.cmake b/cmake/external/leveldb.cmake index 384268a2d814c..be6d70c82629b 100644 --- a/cmake/external/leveldb.cmake +++ b/cmake/external/leveldb.cmake @@ -25,7 +25,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} ${SHALLOW_CLONE} PREFIX ${LEVELDB_SOURCES_DIR} - GIT_REPOSITORY "https://github.com/google/leveldb.git" + GIT_REPOSITORY "${GIT_URL}/google/leveldb.git" GIT_TAG v1.18 CONFIGURE_COMMAND "" BUILD_COMMAND CXXFLAGS=-fPIC make -j ${NUM_OF_PROCESSOR} libleveldb.a diff --git a/cmake/external/libxsmm.cmake b/cmake/external/libxsmm.cmake index b6ab2d1528447..0d09576286d90 100644 --- a/cmake/external/libxsmm.cmake +++ b/cmake/external/libxsmm.cmake @@ -24,7 +24,7 @@ SET(LIBXSMM_LIBS "${LIBXSMM_LIBRARY_DIR}/libxsmm.a" ExternalProject_Add( extern_libxsmm ${SHALLOW_CLONE} - GIT_REPOSITORY "https://github.com/hfp/libxsmm.git" + GIT_REPOSITORY "${GIT_URL}/hfp/libxsmm.git" GIT_TAG "7cc03b5b342fdbc6b6d990b190671c5dbb8489a2" PREFIX ${LIBXSMM_SOURCES_DIR} UPDATE_COMMAND "" diff --git a/cmake/external/lite.cmake b/cmake/external/lite.cmake index 9781d33966679..9cf305a4421d8 100644 --- a/cmake/external/lite.cmake +++ b/cmake/external/lite.cmake @@ -62,7 +62,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) ExternalProject_Add( ${LITE_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle-Lite.git" + GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/Paddle-Lite.git" GIT_TAG ${LITE_GIT_TAG} PREFIX ${LITE_SOURCES_DIR} PATCH_COMMAND mkdir -p ${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code && touch ${LITE_SOURCES_DIR}/src/extern_lite-build/lite/gen_code/__generated_code__.cc @@ -106,7 +106,7 @@ if (NOT LITE_SOURCE_DIR OR NOT LITE_BINARY_DIR) ExternalProject_Add( ${LITE_PROJECT} ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/PaddlePaddle/Paddle-Lite.git" + GIT_REPOSITORY "${GIT_URL}/PaddlePaddle/Paddle-Lite.git" GIT_TAG ${LITE_GIT_TAG} PREFIX ${LITE_SOURCES_DIR} UPDATE_COMMAND "" diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index e3ac8624a809a..9c9e1d18d90d6 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -19,7 +19,7 @@ SET(MKLDNN_PREFIX_DIR ${THIRD_PARTY_PATH}/mkldnn) SET(MKLDNN_SOURCE_DIR ${THIRD_PARTY_PATH}/mkldnn/src/extern_mkldnn) SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn) SET(MKLDNN_INC_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." 
FORCE) -SET(MKLDNN_REPOSITORY https://github.com/oneapi-src/oneDNN.git) +SET(MKLDNN_REPOSITORY ${GIT_URL}/oneapi-src/oneDNN.git) SET(MKLDNN_TAG 361725600224f41b7347a1c6bee9b04d1e6c14d7) # Introduce variables: diff --git a/cmake/external/openblas.cmake b/cmake/external/openblas.cmake index 5bc7eaaff3abe..5e67a91c3d854 100644 --- a/cmake/external/openblas.cmake +++ b/cmake/external/openblas.cmake @@ -17,7 +17,7 @@ INCLUDE(ExternalProject) SET(CBLAS_PREFIX_DIR ${THIRD_PARTY_PATH}/openblas) SET(CBLAS_SOURCE_DIR ${THIRD_PARTY_PATH}/openblas/src/extern_openblas) SET(CBLAS_INSTALL_DIR ${THIRD_PARTY_PATH}/install/openblas) -SET(CBLAS_REPOSITORY https://github.com/xianyi/OpenBLAS.git) +SET(CBLAS_REPOSITORY ${GIT_URL}/xianyi/OpenBLAS.git) SET(CBLAS_TAG v0.3.7) IF(WITH_ARM) # Under the FT2000 architecture, the calculation result of blas.sgemm in openblas 0.3+ is wrong, diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 82dd4fa2e8eae..905c17b9304ae 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -198,7 +198,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) "-Dprotobuf_MSVC_STATIC_RUNTIME=${MSVC_STATIC_CRT}") ENDIF() - SET(PROTOBUF_REPOSITORY https://github.com/protocolbuffers/protobuf.git) + SET(PROTOBUF_REPOSITORY ${GIT_URL}/protocolbuffers/protobuf.git) SET(PROTOBUF_TAG 9f75c5aa851cd877fb0d93ccc31b8567a6706546) cache_third_party(${TARGET_NAME} diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 1297d613839ca..3a0b3676db36e 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -16,7 +16,7 @@ include(ExternalProject) set(PYBIND_PREFIX_DIR ${THIRD_PARTY_PATH}/pybind) set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind/src/extern_pybind) -SET(PYBIND_REPOSITORY https://github.com/pybind/pybind11.git) +SET(PYBIND_REPOSITORY ${GIT_URL}/pybind/pybind11.git) SET(PYBIND_TAG v2.2.4) cache_third_party(extern_pybind diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake index 7cb765039714e..6bcecb88e9886 100644 --- a/cmake/external/rocprim.cmake +++ b/cmake/external/rocprim.cmake @@ -29,7 +29,7 @@ SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include) ExternalProject_Add( extern_rocprim ${SHALLOW_CLONE} - GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git" + GIT_REPOSITORY "${GIT_URL}/ROCmSoftwarePlatform/rocPRIM.git" GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc PREFIX ${ROCPRIM_SOURCE_DIR} UPDATE_COMMAND "" diff --git a/cmake/external/threadpool.cmake b/cmake/external/threadpool.cmake index 24827b5bdf954..205e8d26d93ca 100644 --- a/cmake/external/threadpool.cmake +++ b/cmake/external/threadpool.cmake @@ -16,7 +16,7 @@ INCLUDE(ExternalProject) SET(THREADPOOL_PREFIX_DIR ${THIRD_PARTY_PATH}/threadpool) SET(THREADPOOL_SOURCE_DIR ${THIRD_PARTY_PATH}/threadpool/src/extern_threadpool) -SET(THREADPOOL_REPOSITORY https://github.com/progschj/ThreadPool.git) +SET(THREADPOOL_REPOSITORY ${GIT_URL}/progschj/ThreadPool.git) SET(THREADPOOL_TAG 9a42ec1329f259a5f4881a291db1dcb8f2ad9040) cache_third_party(extern_threadpool diff --git a/cmake/external/warpctc.cmake b/cmake/external/warpctc.cmake index 7f2ab1fb11d84..0ee3e2116a94b 100644 --- a/cmake/external/warpctc.cmake +++ b/cmake/external/warpctc.cmake @@ -17,7 +17,7 @@ INCLUDE(ExternalProject) SET(WARPCTC_PREFIX_DIR ${THIRD_PARTY_PATH}/warpctc) SET(WARPCTC_SOURCE_DIR ${THIRD_PARTY_PATH}/warpctc/src/extern_warpctc) SET(WARPCTC_INSTALL_DIR ${THIRD_PARTY_PATH}/install/warpctc) -set(WARPCTC_REPOSITORY 
https://github.com/baidu-research/warp-ctc.git) +set(WARPCTC_REPOSITORY ${GIT_URL}/baidu-research/warp-ctc.git) set(WARPCTC_TAG 95a461eddeabd51099ef059dcfada1117eb1bfb8) SET(WARPCTC_INCLUDE_DIR "${WARPCTC_INSTALL_DIR}/include" diff --git a/cmake/external/xbyak.cmake b/cmake/external/xbyak.cmake index f990914ab73a6..6627c4eed112f 100644 --- a/cmake/external/xbyak.cmake +++ b/cmake/external/xbyak.cmake @@ -19,7 +19,7 @@ set(XBYAK_PREFIX_DIR ${THIRD_PARTY_PATH}/xbyak) SET(XBYAK_SOURCE_DIR ${THIRD_PARTY_PATH}/xbyak/src/extern_xbyak) set(XBYAK_INSTALL_ROOT ${THIRD_PARTY_PATH}/install/xbyak) set(XBYAK_INC_DIR ${XBYAK_INSTALL_ROOT}/include) -set(XBYAK_REPOSITORY https://github.com/herumi/xbyak.git) +set(XBYAK_REPOSITORY ${GIT_URL}/herumi/xbyak.git) set(XBYAK_TAG v5.661) # Jul 26th include_directories(${XBYAK_INC_DIR}) diff --git a/cmake/external/xxhash.cmake b/cmake/external/xxhash.cmake index 3be938141f7e8..bdd7df190ff10 100644 --- a/cmake/external/xxhash.cmake +++ b/cmake/external/xxhash.cmake @@ -18,7 +18,7 @@ set(XXHASH_PREFIX_DIR ${THIRD_PARTY_PATH}/xxhash) set(XXHASH_SOURCE_DIR ${THIRD_PARTY_PATH}/xxhash/src/extern_xxhash) set(XXHASH_INSTALL_DIR ${THIRD_PARTY_PATH}/install/xxhash) set(XXHASH_INCLUDE_DIR "${XXHASH_INSTALL_DIR}/include") -set(XXHASH_REPOSITORY https://github.com/Cyan4973/xxHash.git) +set(XXHASH_REPOSITORY ${GIT_URL}/Cyan4973/xxHash.git) set(XXHASH_TAG v0.6.5) cache_third_party(extern_xxhash diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 2f2a6b7470115..4464787a0c2a6 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -19,7 +19,7 @@ SET(ZLIB_SOURCE_DIR ${THIRD_PARTY_PATH}/zlib/src/extern_zlib) SET(ZLIB_INSTALL_DIR ${THIRD_PARTY_PATH}/install/zlib) SET(ZLIB_ROOT ${ZLIB_INSTALL_DIR} CACHE FILEPATH "zlib root directory." FORCE) SET(ZLIB_INCLUDE_DIR "${ZLIB_INSTALL_DIR}/include" CACHE PATH "zlib include directory." FORCE) -set(ZLIB_REPOSITORY https://github.com/madler/zlib.git) +set(ZLIB_REPOSITORY ${GIT_URL}/madler/zlib.git) set(ZLIB_TAG v1.2.8) INCLUDE_DIRECTORIES(${ZLIB_INCLUDE_DIR}) # For zlib code to include its own headers. From 994087188816575d456c2f9c2a6c90aad83b4e71 Mon Sep 17 00:00:00 2001 From: cnn Date: Mon, 26 Oct 2020 11:53:18 +0800 Subject: [PATCH 050/185] [cherry pick ] cherry pick 28108 28198 28199 from release2.0rc (#28215) * Release 2.0rc cherry pick api rename #28108 (#28184) * rename count_include_pad-->exclusive return_indices-->return_mask * remove track_running_stats * fix typo. 
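Taken together, these cherry-picks are almost entirely mechanical renames of public keywords and symbols. A minimal before/after sketch of the affected call sites (assuming the post-patch 2.0rc dygraph API; the tensor shapes are illustrative only):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([1, 3, 32, 32])

    # return_indices --> return_mask (pure rename, same (output, indices) result)
    out, mask = F.max_pool2d(x, kernel_size=2, stride=2, return_mask=True)

    # count_include_pad --> exclusive (pure rename: the underlying op still
    # receives `not exclusive`, so behavior is unchanged)
    out = F.avg_pool2d(x, kernel_size=2, stride=2, padding=1, exclusive=True)

    # PR 28199 promotes the bool reductions into the public namespace
    flag = paddle.all(x > 0)  # was fluid.layers.reduce_all
    flag = paddle.any(x > 0)  # was fluid.layers.reduce_any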
* rename xxxd-->xxxxD * solve conflicts * 2.0rc api add all any (#28199) * reduce trt warning message (#28011) add paddle.enable_static() on sample code alias recude_all-->all, reduce_any-->any add import reduce_all and reduce_any in python/paddle/tensor/math.py import all and any in python/paddle/tensor/__init__.py remove all and any OP in python/paddle/tensor/logic.py, add all and any OP in python/paddle/tensor/math.py fix import error remove TestAllAPI temporary * fix doc of recdue_all and reduce_any, test=document_fix * fix typo * fix unittest for all and any API Co-authored-by: Pei Yang * rename conv_transposeXd-->convXd_transpose (#28198) * fix sample code of reduce_all and reduce_any Co-authored-by: Pei Yang --- python/paddle/__init__.py | 4 +- python/paddle/fluid/layers/nn.py | 79 +++++-- .../parallel_dygraph_sync_batch_norm.py | 5 +- .../unittests/test_adaptive_max_pool2d.py | 2 +- .../unittests/test_conv1d_transpose_layer.py | 2 +- .../unittests/test_conv2d_transpose_layer.py | 2 +- .../unittests/test_conv3d_transpose_layer.py | 2 +- .../test_functional_conv2d_transpose.py | 6 +- .../test_functional_conv3d_transpose.py | 6 +- .../fluid/tests/unittests/test_pool1d_api.py | 11 +- .../fluid/tests/unittests/test_pool2d_api.py | 10 +- .../fluid/tests/unittests/test_pool3d_api.py | 10 +- .../fluid/tests/unittests/test_reduce_op.py | 112 ++++++++++ python/paddle/nn/functional/__init__.py | 6 +- python/paddle/nn/functional/conv.py | 22 +- python/paddle/nn/functional/pooling.py | 120 +++++------ python/paddle/nn/layer/conv.py | 12 +- python/paddle/nn/layer/norm.py | 56 +---- python/paddle/nn/layer/pooling.py | 100 ++++----- python/paddle/tensor/__init__.py | 4 +- python/paddle/tensor/logic.py | 2 + python/paddle/tensor/math.py | 202 +++++++++++++++++- 22 files changed, 542 insertions(+), 233 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 54e51200dc745..ae4dda166c733 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -103,8 +103,6 @@ from .tensor.logic import logical_or #DEFINE_ALIAS from .tensor.logic import logical_xor #DEFINE_ALIAS from .tensor.logic import not_equal #DEFINE_ALIAS -# from .tensor.logic import reduce_all #DEFINE_ALIAS -# from .tensor.logic import reduce_any #DEFINE_ALIAS from .tensor.logic import allclose #DEFINE_ALIAS from .tensor.logic import equal_all #DEFINE_ALIAS # from .tensor.logic import isnan #DEFINE_ALIAS @@ -162,6 +160,8 @@ # from .tensor.math import reduce_min #DEFINE_ALIAS # from .tensor.math import reduce_prod #DEFINE_ALIAS # from .tensor.math import reduce_sum #DEFINE_ALIAS +from .tensor.math import all #DEFINE_ALIAS +from .tensor.math import any #DEFINE_ALIAS from .tensor.math import round #DEFINE_ALIAS from .tensor.math import rsqrt #DEFINE_ALIAS from .tensor.math import scale #DEFINE_ALIAS diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index c2bb96ead2bf9..ac762944b3a68 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -315,6 +315,8 @@ def fc(input, .. 
code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() # when input is single tensor data = fluid.data(name="data", shape=[-1, 32], dtype="float32") fc = fluid.layers.fc(input=data, size=1000, act="tanh") @@ -468,6 +470,9 @@ def embedding(input, import paddle.fluid as fluid import numpy as np + import paddle + paddle.enable_static() + data = fluid.data(name='x', shape=[None, 1], dtype='int64') # example 1 @@ -731,6 +736,8 @@ def linear_chain_crf(input, label, param_attr=None, length=None): import paddle.fluid as fluid import numpy as np + import paddle + paddle.enable_static() #define net structure, using LodTensor train_program = fluid.Program() @@ -855,6 +862,8 @@ def crf_decoding(input, param_attr, label=None, length=None): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() # LoDTensor-based example num_labels = 10 @@ -1458,6 +1467,9 @@ def conv2d(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() + data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32') conv2d = fluid.layers.conv2d(input=data, num_filters=2, filter_size=3, act="relu") """ @@ -1728,6 +1740,8 @@ def conv3d(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() data = fluid.data(name='data', shape=[None, 3, 12, 32, 32], dtype='float32') conv3d = fluid.layers.conv3d(input=data, num_filters=2, filter_size=3, act="relu") """ @@ -2377,6 +2391,7 @@ def adaptive_pool2d(input, # output[:, :, i, j] = avg(input[:, :, hstart: hend, wstart: wend]) # import paddle + paddle.enable_static() data = paddle.rand(shape=[1,3,32,32]) pool_out = paddle.fluid.layers.adaptive_pool2d( input=data, @@ -2531,6 +2546,7 @@ def adaptive_pool3d(input, # import paddle + paddle.enable_static() data = paddle.rand(shape=[1,3,32,32,32]) pool_out = paddle.fluid.layers.adaptive_pool3d( input=data, @@ -2726,6 +2742,8 @@ def batch_norm(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() x = fluid.data(name='x', shape=[3, 7, 3, 7], dtype='float32') hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') hidden2 = fluid.layers.batch_norm(input=hidden1) @@ -2735,6 +2753,8 @@ def batch_norm(input, # batch_norm with momentum as Variable import paddle.fluid as fluid import paddle.fluid.layers.learning_rate_scheduler as lr_scheduler + import paddle + paddle.enable_static() def get_decay_momentum(momentum_init, decay_steps, decay_rate): global_step = lr_scheduler._decay_step_counter() @@ -3134,6 +3154,8 @@ def instance_norm(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() x = fluid.data(name='x', shape=[3, 7, 3, 7], dtype='float32') hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w') hidden2 = fluid.layers.instance_norm(input=hidden1) @@ -3269,6 +3291,7 @@ def data_norm(input, .. code-block:: python import paddle + paddle.enable_static() x = paddle.randn(shape=[32,100]) hidden2 = paddle.static.nn.data_norm(input=x) @@ -3451,6 +3474,8 @@ def layer_norm(input, import paddle.fluid as fluid import numpy as np + import paddle + paddle.enable_static() x = fluid.data(name='x', shape=[-1, 32, 32], dtype='float32') hidden1 = fluid.layers.layer_norm(input=x, begin_norm_axis=1) place = fluid.CPUPlace() @@ -3566,6 +3591,9 @@ def group_norm(input, .. 
code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() + data = fluid.data(name='data', shape=[None, 8, 32, 32], dtype='float32') x = fluid.layers.group_norm(input=data, groups=4) """ @@ -3887,6 +3915,8 @@ def conv2d_transpose(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32') conv2d_transpose = fluid.layers.conv2d_transpose(input=data, num_filters=2, filter_size=3) """ @@ -4177,6 +4207,8 @@ def conv3d_transpose(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() data = fluid.data(name='data', shape=[None, 3, 12, 32, 32], dtype='float32') conv3d_transpose = fluid.layers.conv3d_transpose(input=data, num_filters=2, filter_size=3) """ @@ -4659,7 +4691,7 @@ def reduce_all(input, dim=None, keep_dim=False, name=None): This OP computes the ``logical and`` of tensor elements over the given dimension, and output the result. Args: - input (Variable): The input variable which is a Tensor or LoDTensor, the input data type should be `bool`. + input (Tensor): the input tensor, it's data type should be `bool`. dim (list|int|optional): The dimension along which the logical and is computed. If :attr:`None`, compute the logical and over all elements of :attr:`input` and return a Tensor variable with a single element, @@ -4672,11 +4704,12 @@ def reduce_all(input, dim=None, keep_dim=False, name=None): will be named automatically. The default value is None. Returns: - Variable, the output data type is bool. : The reduced tensor variable with ``logical and`` in given dims. + Tensor, the output data type is bool. : The reduced tensor variable with ``logical and`` in given dims. Examples: .. code-block:: python + import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import numpy as np @@ -4684,15 +4717,15 @@ def reduce_all(input, dim=None, keep_dim=False, name=None): # x is a bool Tensor variable with following elements: # [[True, False] # [True, True]] - x = layers.assign(np.array([[1, 0], [1, 1]], dtype='int32')) - x = layers.cast(x, 'bool') + x = fluid.layers.assign(np.array([[1, 0], [1, 1]], dtype='int32')) + x = fluid.layers.cast(x, 'bool') - out = layers.reduce_all(x) # False - out = layers.reduce_all(x, dim=0) # [True, False] - out = layers.reduce_all(x, dim=-1) # [False, True] + out = fluid.layers.reduce_all(x) # False + out = fluid.layers.reduce_all(x, dim=0) # [True, False] + out = fluid.layers.reduce_all(x, dim=-1) # [False, True] # keep_dim=False, x.shape=(2,2), out.shape=(2,) - out = layers.reduce_all(x, dim=1, keep_dim=True) # [[False], [True]] + out = fluid.layers.reduce_all(x, dim=1, keep_dim=True) # [[False], [True]] # keep_dim=True, x.shape=(2,2), out.shape=(2,1) """ @@ -4719,7 +4752,7 @@ def reduce_any(input, dim=None, keep_dim=False, name=None): This OP computes the ``logical or`` of tensor elements over the given dimension, and output the result. Args: - input (Variable): The input variable which is a Tensor or LoDTensor, the input data type should be `bool`. + input (Tensor): the input tensor, it's data type should be `bool`. dim (list|int|optional): The dimension along which the logical and is computed. 
If :attr:`None`, compute the logical and over all elements of :attr:`input` and return a Tensor variable with a single element, @@ -4728,14 +4761,15 @@ def reduce_any(input, dim=None, keep_dim=False, name=None): keep_dim (bool): Whether to reserve the reduced dimension in the output Tensor. The result tensor will have one fewer dimension than the :attr:`input` unless :attr:`keep_dim` is true. The default value is False. - name(str|None): A name for this layer(optional). If set None, the layer + name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: - Variable, the output data type is bool. : The reduced tensor variable with ``logical or`` in given dims. + Tensor, the output data type is bool. : The reduced tensor variable with ``logical or`` in given dims. Examples: .. code-block:: python + import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import numpy as np @@ -4743,15 +4777,15 @@ def reduce_any(input, dim=None, keep_dim=False, name=None): # x is a bool Tensor variable with following elements: # [[True, False] # [False, False]] - x = layers.assign(np.array([[1, 0], [0, 0]], dtype='int32')) - x = layers.cast(x, 'bool') + x = fluid.layers.assign(np.array([[1, 0], [0, 0]], dtype='int32')) + x = fluid.layers.cast(x, 'bool') - out = layers.reduce_any(x) # True - out = layers.reduce_any(x, dim=0) # [True, False] - out = layers.reduce_any(x, dim=-1) # [True, False] + out = fluid.layers.reduce_any(x) # True + out = fluid.layers.reduce_any(x, dim=0) # [True, False] + out = fluid.layers.reduce_any(x, dim=-1) # [True, False] # keep_dim=False, x.shape=(2,2), out.shape=(2,) - out = layers.reduce_any(x, dim=1, + out = fluid.layers.reduce_any(x, dim=1, keep_dim=True) # [[True], [False]] # keep_dim=True, x.shape=(2,2), out.shape=(2,1) @@ -5613,6 +5647,8 @@ def im2sequence(input, .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() data = fluid.data(name='data', shape=[None, 3, 32, 32], dtype='float32') output = fluid.layers.im2sequence( @@ -5669,6 +5705,8 @@ def row_conv(input, future_context_size, param_attr=None, act=None): Examples: >>> # for LodTensor inputs >>> import paddle.fluid as fluid + >>> import paddle + >>> paddle.enable_static() >>> x = fluid.data(name='x', shape=[9, 16], >>> dtype='float32', lod_level=1) >>> out = fluid.layers.row_conv(input=x, future_context_size=2) @@ -5982,6 +6020,8 @@ def autoincreased_step_counter(counter_name=None, begin=1, step=1): .. code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() global_step = fluid.layers.autoincreased_step_counter( counter_name='@LR_DECAY_COUNTER@', begin=0, step=1) """ @@ -9730,6 +9770,8 @@ def prelu(x, mode, param_attr=None, name=None): .. 
code-block:: python import paddle.fluid as fluid + import paddle + paddle.enable_static() from paddle.fluid.param_attr import ParamAttr x = fluid.data(name="x", shape=[None,5,10,10], dtype="float32") mode = 'channel' @@ -14307,6 +14349,9 @@ def deformable_conv(input, #deformable conv v2: import paddle.fluid as fluid + import paddle + paddle.enable_static() + C_in, H_in, W_in = 3, 32, 32 filter_size, deformable_groups = 3, 1 data = fluid.data(name='data', shape=[None, C_in, H_in, W_in], dtype='float32') diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py index dcf5151578ad5..d525009fbd734 100644 --- a/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sync_batch_norm.py @@ -63,10 +63,7 @@ def __init__(self, bias_attr=False) self._sync_batch_norm2 = SyncBatchNorm( - num_filters, - weight_attr=False, - bias_attr=False, - track_running_stats=False) + num_filters, weight_attr=False, bias_attr=False) def forward(self, inputs): y = self._conv(inputs) diff --git a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py index 944725fab6435..18860db9dae51 100644 --- a/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py +++ b/python/paddle/fluid/tests/unittests/test_adaptive_max_pool2d.py @@ -150,7 +150,7 @@ def test_dynamic_graph(self): x = paddle.to_tensor(self.x_np) out_1 = paddle.nn.functional.adaptive_max_pool2d( - x=x, return_indices=False, output_size=[3, 3]) + x=x, return_mask=False, output_size=[3, 3]) out_2 = paddle.nn.functional.adaptive_max_pool2d(x=x, output_size=5) diff --git a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py index 9c43e2f3e6e9d..40b7074ed3914 100644 --- a/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv1d_transpose_layer.py @@ -92,7 +92,7 @@ def functional(self, place): "weight", self.weight_shape, dtype=self.dtype) b_var = fluid.data( "bias", (self.out_channels, ), dtype=self.dtype) - y_var = F.conv_transpose1d( + y_var = F.conv1d_transpose( x_var, w_var, None if self.no_bias else b_var, diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py index 28c3a466aa6c8..f51baf50ec898 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_layer.py @@ -128,7 +128,7 @@ def functional(self, place): else: output_size = self.output_size - y_var = F.conv_transpose2d( + y_var = F.conv2d_transpose( x_var, w_var, None if self.no_bias else b_var, diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py index dac84a8486ef2..a567ec7273893 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_layer.py @@ -119,7 +119,7 @@ def functional(self, place): "weight", self.weight_shape, dtype=self.dtype) b_var = fluid.data( "bias", (self.num_filters, ), dtype=self.dtype) - y_var = F.conv_transpose3d( + y_var = F.conv3d_transpose( x_var, w_var, None if self.no_bias else b_var, diff --git 
a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py index 1fb07bf434590..e3b821a07bffd 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv2d_transpose.py @@ -111,7 +111,7 @@ def static_graph_case_2(self): "weight", self.weight.shape, dtype=self.dtype) if not self.no_bias: bias = fluid.data("bias", self.bias.shape, dtype=self.dtype) - y = F.conv_transpose2d( + y = F.conv2d_transpose( x, weight, None if self.no_bias else bias, @@ -134,7 +134,7 @@ def dygraph_case(self): x = dg.to_variable(self.input) weight = dg.to_variable(self.weight) bias = None if self.no_bias else dg.to_variable(self.bias) - y = F.conv_transpose2d( + y = F.conv2d_transpose( x, weight, bias, @@ -215,7 +215,7 @@ def static_graph_case(self): "weight", self.weight_shape, dtype=self.dtype) if not self.no_bias: bias = fluid.data("bias", self.bias_shape, dtype=self.dtype) - y = F.conv_transpose2d( + y = F.conv2d_transpose( x, weight, None if self.no_bias else bias, diff --git a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py index 7441f7cb915e8..910d28515b778 100644 --- a/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py +++ b/python/paddle/fluid/tests/unittests/test_functional_conv3d_transpose.py @@ -113,7 +113,7 @@ def static_graph_case_2(self): "weight", self.weight.shape, dtype=self.dtype) if not self.no_bias: bias = fluid.data("bias", self.bias.shape, dtype=self.dtype) - y = F.conv_transpose3d( + y = F.conv3d_transpose( x, weight, None if self.no_bias else bias, @@ -138,7 +138,7 @@ def dygraph_case(self): x = dg.to_variable(self.input) weight = dg.to_variable(self.weight) bias = None if self.no_bias else dg.to_variable(self.bias) - y = F.conv_transpose3d( + y = F.conv3d_transpose( x, weight, bias, @@ -222,7 +222,7 @@ def static_graph_case(self): "weight", self.weight_shape, dtype=self.dtype) if not self.no_bias: bias = fluid.data("bias", self.bias_shape, dtype=self.dtype) - y = F.conv_transpose3d( + y = F.conv3d_transpose( x, weight, None if self.no_bias else bias, diff --git a/python/paddle/fluid/tests/unittests/test_pool1d_api.py b/python/paddle/fluid/tests/unittests/test_pool1d_api.py index cc2490d1f1245..00f75337baafb 100644 --- a/python/paddle/fluid/tests/unittests/test_pool1d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool1d_api.py @@ -148,11 +148,7 @@ def check_avg_dygraph_padding_results(self, place): input_np = np.random.random([2, 3, 32]).astype("float32") input = fluid.dygraph.to_variable(input_np) result = F.avg_pool1d( - input, - kernel_size=2, - stride=2, - padding=[1], - count_include_pad=True) + input, kernel_size=2, stride=2, padding=[1], exclusive=True) result_np = avg_pool1D_forward_naive( input_np, ksize=[2], strides=[2], paddings=[1], exclusive=False) @@ -160,7 +156,8 @@ def check_avg_dygraph_padding_results(self, place): self.assertTrue(np.allclose(result.numpy(), result_np)) avg_pool1d_dg = paddle.nn.AvgPool1D( - kernel_size=2, stride=None, padding=1, count_include_pad=True) + kernel_size=2, stride=None, padding=1, exclusive=True) + result = avg_pool1d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -200,7 +197,7 @@ def check_max_dygraph_return_index_results(self, place): input_np = np.random.random([2, 3, 32]).astype("float32") input = 
fluid.dygraph.to_variable(input_np) result, index = F.max_pool1d( - input, kernel_size=2, stride=2, padding=0, return_indices=True) + input, kernel_size=2, stride=2, padding=0, return_mask=True) result_np = max_pool1D_forward_naive( input_np, ksize=[2], strides=[2], paddings=[0]) diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_api.py b/python/paddle/fluid/tests/unittests/test_pool2d_api.py index 66505327c2df3..f4432bf338647 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_api.py @@ -134,7 +134,7 @@ def check_max_dygraph_results(self, place): input_np = np.random.random([2, 3, 32, 32]).astype("float32") input = fluid.dygraph.to_variable(input_np) result = max_pool2d( - input, kernel_size=2, stride=2, padding=0, return_indices=False) + input, kernel_size=2, stride=2, padding=0, return_mask=False) result_np = pool2D_forward_naive( input_np, @@ -159,7 +159,7 @@ def check_max_dygraph_nhwc_results(self, place): kernel_size=2, stride=2, padding=0, - return_indices=False, + return_mask=False, data_format="NHWC") result_np = pool2D_forward_naive( @@ -222,7 +222,7 @@ def check_max_dygraph_stride_is_none(self, place): kernel_size=2, stride=None, padding="SAME", - return_indices=True) + return_mask=True) result_np = pool2D_forward_naive( input_np, @@ -269,7 +269,7 @@ def check_max_dygraph_padding(self, place): kernel_size=2, stride=2, padding=padding, - return_indices=False) + return_mask=False) result_np = pool2D_forward_naive( input_np, @@ -490,7 +490,7 @@ def run9(): padding=0, ceil_mode=False, data_format='NHWC', - return_indices=True) + return_mask=True) self.assertRaises(ValueError, run9) diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_api.py b/python/paddle/fluid/tests/unittests/test_pool3d_api.py index b2700303ee477..91158fe674b1e 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_api.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_api.py @@ -83,7 +83,7 @@ def check_avg_dygraph_padding_results(self, place): stride=2, padding=1, ceil_mode=False, - count_include_pad=True) + exclusive=True) result_np = avg_pool3D_forward_naive( input_np, @@ -100,7 +100,7 @@ def check_avg_dygraph_padding_results(self, place): stride=None, padding=1, ceil_mode=False, - count_include_pad=True) + exclusive=True) result = avg_pool3d_dg(input) self.assertTrue(np.allclose(result.numpy(), result_np)) @@ -175,7 +175,7 @@ def check_max_dygraph_ndhwc_results(self, place): stride=2, padding=0, data_format="NDHWC", - return_indices=False) + return_mask=False) result_np = pool3D_forward_naive( input_np, @@ -239,7 +239,7 @@ def check_max_dygraph_stride_is_none(self, place): kernel_size=2, stride=None, padding="SAME", - return_indices=True) + return_mask=True) result_np = pool3D_forward_naive( input_np, @@ -467,7 +467,7 @@ def run10(): stride=2, padding=0, data_format='NDHWC', - return_indices=True) + return_mask=True) self.assertRaises(ValueError, run10) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 80b201d084218..e549a2eca2d7d 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -767,5 +767,117 @@ def test_dygraph(self): self.assertTrue((out3 == np.sum(np_x, axis=(0, 1, 2))).all()) +class TestAllAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + paddle.enable_static() + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + 
self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[4, 4], dtype="bool") + result = paddle.all(x=input) + input_np = np.random.randint(0, 2, [4, 4]).astype("bool") + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], np.all(input_np))) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + paddle.disable_static() + for place in self.places: + with fluid.dygraph.guard(place): + np_x = np.random.randint(0, 2, (12, 10)).astype(np.bool) + x = fluid.layers.assign(np_x) + x = fluid.layers.cast(x, 'bool') + + out1 = paddle.all(x) + np_out1 = out1.numpy() + expect_res1 = np.all(np_x) + self.assertTrue((np_out1 == expect_res1).all()) + + out2 = paddle.all(x, axis=0) + np_out2 = out2.numpy() + expect_res2 = np.all(np_x, axis=0) + self.assertTrue((np_out2 == expect_res2).all()) + + out3 = paddle.all(x, axis=-1) + np_out3 = out3.numpy() + expect_res3 = np.all(np_x, axis=-1) + self.assertTrue((np_out3 == expect_res3).all()) + + out4 = paddle.all(x, axis=1, keepdim=True) + np_out4 = out4.numpy() + expect_res4 = np.all(np_x, axis=1, keepdims=True) + self.assertTrue((np_out4 == expect_res4).all()) + + paddle.enable_static() + + +class TestAnyAPI(unittest.TestCase): + def setUp(self): + np.random.seed(123) + paddle.enable_static() + self.places = [fluid.CPUPlace()] + if core.is_compiled_with_cuda(): + self.places.append(fluid.CUDAPlace(0)) + + def check_static_result(self, place): + with fluid.program_guard(fluid.Program(), fluid.Program()): + input = fluid.data(name="input", shape=[4, 4], dtype="bool") + result = paddle.any(x=input) + input_np = np.random.randint(0, 2, [4, 4]).astype("bool") + + exe = fluid.Executor(place) + fetches = exe.run(fluid.default_main_program(), + feed={"input": input_np}, + fetch_list=[result]) + self.assertTrue(np.allclose(fetches[0], np.any(input_np))) + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def test_dygraph(self): + paddle.disable_static() + for place in self.places: + with fluid.dygraph.guard(place): + np_x = np.random.randint(0, 2, (12, 10)).astype(np.bool) + x = fluid.layers.assign(np_x) + x = fluid.layers.cast(x, 'bool') + + out1 = paddle.any(x) + np_out1 = out1.numpy() + expect_res1 = np.any(np_x) + self.assertTrue((np_out1 == expect_res1).all()) + + out2 = paddle.any(x, axis=0) + np_out2 = out2.numpy() + expect_res2 = np.any(np_x, axis=0) + self.assertTrue((np_out2 == expect_res2).all()) + + out3 = paddle.any(x, axis=-1) + np_out3 = out3.numpy() + expect_res3 = np.any(np_x, axis=-1) + self.assertTrue((np_out3 == expect_res3).all()) + + out4 = paddle.any(x, axis=1, keepdim=True) + np_out4 = out4.numpy() + expect_res4 = np.any(np_x, axis=1, keepdims=True) + self.assertTrue((np_out4 == expect_res4).all()) + + paddle.enable_static() + + if __name__ == '__main__': + import paddle + paddle.enable_static() unittest.main() diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index 5f9307845ae9d..07e8b1f4d6d0f 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -73,12 +73,12 @@ from .common import upsample #DEFINE_ALIAS from .common import bilinear #DEFINE_ALIAS from .conv import conv1d #DEFINE_ALIAS -from 
.conv import conv_transpose1d #DEFINE_ALIAS +from .conv import conv1d_transpose #DEFINE_ALIAS from .common import linear #DEFINE_ALIAS from .conv import conv2d #DEFINE_ALIAS -from .conv import conv_transpose2d #DEFINE_ALIAS +from .conv import conv2d_transpose #DEFINE_ALIAS from .conv import conv3d #DEFINE_ALIAS -from .conv import conv_transpose3d #DEFINE_ALIAS +from .conv import conv3d_transpose #DEFINE_ALIAS # from .extension import add_position_encoding #DEFINE_ALIAS # from .extension import autoincreased_step_counter #DEFINE_ALIAS # from .extension import continuous_value_model #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/conv.py b/python/paddle/nn/functional/conv.py index 03dd40fb140cf..6df1ce368c1b0 100644 --- a/python/paddle/nn/functional/conv.py +++ b/python/paddle/nn/functional/conv.py @@ -15,11 +15,11 @@ __all__ = [ 'conv1d', - 'conv_transpose1d', + 'conv1d_transpose', 'conv2d', - 'conv_transpose2d', + 'conv2d_transpose', 'conv3d', - 'conv_transpose3d', + 'conv3d_transpose', ] import numpy as np @@ -541,7 +541,7 @@ def conv2d(x, return out -def conv_transpose1d(x, +def conv1d_transpose(x, weight, bias=None, stride=1, @@ -682,7 +682,7 @@ def conv_transpose1d(x, [[4, 2]]]).astype(np.float32) x_var = paddle.to_tensor(x) w_var = paddle.to_tensor(w) - y_var = F.conv_transpose1d(x_var, w_var) + y_var = F.conv1d_transpose(x_var, w_var) y_np = y_var.numpy() print y_np @@ -802,7 +802,7 @@ def conv_transpose1d(x, return out -def conv_transpose2d(x, +def conv2d_transpose(x, weight, bias=None, stride=1, @@ -920,7 +920,7 @@ def conv_transpose2d(x, None by default. Returns: - A Tensor representing the conv_transpose2d, whose + A Tensor representing the conv2d_transpose, whose data type is the same with input and shape is (num_batches, channels, out_h, out_w) or (num_batches, out_h, out_w, channels). The tensor variable storing transposed convolution result. @@ -946,7 +946,7 @@ def conv_transpose2d(x, x_var = paddle.randn((2, 3, 8, 8), dtype='float32') w_var = paddle.randn((3, 6, 3, 3), dtype='float32') - y_var = F.conv_transpose2d(x_var, w_var) + y_var = F.conv2d_transpose(x_var, w_var) y_np = y_var.numpy() print(y_np.shape) @@ -1242,7 +1242,7 @@ def conv3d(x, return out -def conv_transpose3d(x, +def conv3d_transpose(x, weight, bias=None, stride=1, @@ -1364,7 +1364,7 @@ def conv_transpose3d(x, None by default. Returns: - A Tensor representing the conv_transpose3d, whose data + A Tensor representing the conv3d_transpose, whose data type is the same with input and shape is (num_batches, channels, out_d, out_h, out_w) or (num_batches, out_d, out_h, out_w, channels). If act is None, the tensor variable storing the transposed convolution result, and if act is not None, the tensor @@ -1391,7 +1391,7 @@ def conv_transpose3d(x, x_var = paddle.randn((2, 3, 8, 8, 8), dtype='float32') w_var = paddle.randn((3, 6, 3, 3, 3), dtype='float32') - y_var = F.conv_transpose3d(x_var, w_var) + y_var = F.conv3d_transpose(x_var, w_var) y_np = y_var.numpy() print(y_np.shape) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 73652ff1266f5..73e3cb31221f1 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -157,7 +157,7 @@ def avg_pool1d(x, kernel_size, stride=None, padding=0, - count_include_pad=True, + exclusive=True, ceil_mode=False, name=None): """ @@ -179,7 +179,7 @@ def avg_pool1d(x, 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. 5. A list or tuple of pairs of integers. 
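The functional transpose convolutions follow the same pattern, conv_transposeXd becoming convXd_transpose with unchanged signatures. A minimal dygraph sketch (the shapes are the ones used in the conv2d_transpose docstring example):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.randn((2, 3, 8, 8), dtype='float32')
    w = paddle.randn((3, 6, 3, 3), dtype='float32')  # (C_in, C_out, kH, kW)

    # formerly F.conv_transpose2d; only the name changes
    y = F.conv2d_transpose(x, w)
    print(y.shape)  # [2, 6, 10, 10]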
It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - count_include_pad (bool): Whether to exclude padding points in average pooling + exclusive (bool): Whether to exclude padding points in average pooling mode, default is `True`. ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width. If it is set to False, the floor function will be used. The default value is False. @@ -230,8 +230,8 @@ def avg_pool1d(x, x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', not count_include_pad, - 'data_format', data_format) + 'use_mkldnn', False, 'exclusive', not exclusive, 'data_format', + data_format) return squeeze(output, [2]) op_type = 'pool2d' @@ -253,7 +253,7 @@ def avg_pool1d(x, "use_cudnn": True, "ceil_mode": ceil_mode, "use_mkldnn": False, - "exclusive": not count_include_pad, + "exclusive": not exclusive, "data_format": data_format, }) @@ -265,7 +265,7 @@ def avg_pool2d(x, stride=None, padding=0, ceil_mode=False, - count_include_pad=True, + exclusive=True, divisor_override=None, data_format="NCHW", name=None): @@ -294,7 +294,7 @@ def avg_pool2d(x, 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - count_include_pad (bool): Whether to exclude padding points in average pooling + exclusive (bool): Whether to exclude padding points in average pooling mode, default is `true`. divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. @@ -338,8 +338,8 @@ def avg_pool2d(x, x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', False, 'padding_algorithm', padding_algorithm, 'strides', stride, 'paddings', padding, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', not count_include_pad, - 'data_format', data_format) + 'use_mkldnn', False, 'exclusive', not exclusive, 'data_format', + data_format) if divisor_override is None: return output else: @@ -365,7 +365,7 @@ def avg_pool2d(x, "use_cudnn": True, "ceil_mode": ceil_mode, "use_mkldnn": False, - "exclusive": not count_include_pad, + "exclusive": not exclusive, "data_format": data_format, }) @@ -381,7 +381,7 @@ def avg_pool3d(x, stride=None, padding=0, ceil_mode=False, - count_include_pad=True, + exclusive=True, divisor_override=None, data_format="NCDHW", name=None): @@ -408,7 +408,7 @@ def avg_pool3d(x, 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): ${ceil_mode_comment} - count_include_pad (bool): Whether to exclude padding points in average pooling + exclusive (bool): Whether to exclude padding points in average pooling mode, default is True. divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. 
Default None. data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. @@ -452,8 +452,8 @@ def avg_pool3d(x, x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, 'global_pooling', False, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', not count_include_pad, - 'data_format', data_format) + 'use_mkldnn', False, 'exclusive', not exclusive, 'data_format', + data_format) if divisor_override is None: return output else: @@ -481,7 +481,7 @@ def avg_pool3d(x, "use_cudnn": True, "ceil_mode": ceil_mode, "use_mkldnn": False, - "exclusive": not count_include_pad, + "exclusive": not exclusive, "data_format": data_format, }) @@ -497,7 +497,7 @@ def max_pool1d(x, kernel_size, stride=None, padding=0, - return_indices=False, + return_mask=False, ceil_mode=False, name=None): """ @@ -519,7 +519,7 @@ def max_pool1d(x, 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - return_indices (bool): Whether return the max indices along with the outputs. default is `False`. + return_mask (bool): Whether return the max indices along with the outputs. default is `False`. ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. If it is set to False, the floor function will be used. Default False. name(str, optional): For detailed information, please refer @@ -542,7 +542,7 @@ def max_pool1d(x, data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) pool_out = F.max_pool1d(data, kernel_size=2, stride=2, padding=0) # pool_out shape: [1, 3, 16] - pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_indices=True) + pool_out, indices = F.max_pool1d(data, kernel_size=2, stride=2, padding=0, return_mask=True) # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ """NCL to NCHW""" @@ -563,16 +563,16 @@ def max_pool1d(x, padding = _expand_low_nd_padding(padding) if in_dygraph_mode(): - if return_indices: + if return_mask: pool_out = core.ops.max_pool2d_with_index( x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) - return (squeeze(pool_out[0], [2]), squeeze( - pool_out[1], - [2])) if return_indices else squeeze(pool_out[0], [2]) + return (squeeze(pool_out[0], [2]), + squeeze(pool_out[1], + [2])) if return_mask else squeeze(pool_out[0], [2]) else: pool_out = core.ops.pool2d( x, 'pooling_type', 'max', 'ksize', kernel_size, @@ -582,7 +582,7 @@ def max_pool1d(x, 'data_format', data_format) return squeeze(pool_out, [2]) - op_type = 'max_pool2d_with_index' if return_indices else "pool2d" + op_type = 'max_pool2d_with_index' if return_mask else "pool2d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -608,14 +608,14 @@ def max_pool1d(x, }) return (squeeze(pool_out, [2]), - squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) + squeeze(mask, [2])) if return_mask else squeeze(pool_out, [2]) def max_pool2d(x, kernel_size, 
stride=None, padding=0, - return_indices=False, + return_mask=False, ceil_mode=False, data_format="NCHW", name=None): @@ -643,7 +643,7 @@ def max_pool2d(x, 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - return_indices (bool): Whether to return the max indices along with the outputs. Default False, only support `"NCHW"` data format + return_mask (bool): Whether to return the max indices along with the outputs. Default False, only support `"NCHW"` data format data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. @@ -668,12 +668,12 @@ def max_pool2d(x, kernel_size=2, stride=2, padding=0) # output.shape [1, 3, 16, 16] - # for return_indices=True + # for return_mask=True out, max_indices = F.max_pool2d(x, kernel_size=2, stride=2, padding=0, - return_indices=True) + return_mask=True) # out.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], """ check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool2d') @@ -693,20 +693,20 @@ def max_pool2d(x, padding, padding_algorithm = _update_padding_nd( padding, num_dims=2, channel_last=channel_last, ceil_mode=ceil_mode) - if data_format == "NHWC" and return_indices: + if data_format == "NHWC" and return_mask: raise ValueError( - "When setting return_indices to true, data_format must be set to NCHW in API:max_pool2d" + "When setting return_mask to true, data_format must be set to NCHW in API:max_pool2d" ) if in_dygraph_mode(): - if return_indices: + if return_mask: output = core.ops.max_pool2d_with_index( x, 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) - return output if return_indices else output[0] + return output if return_mask else output[0] else: output = core.ops.pool2d( x, 'pooling_type', 'max', 'ksize', kernel_size, @@ -716,7 +716,7 @@ def max_pool2d(x, 'data_format', data_format) return output - op_type = 'max_pool2d_with_index' if return_indices else "pool2d" + op_type = 'max_pool2d_with_index' if return_mask else "pool2d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -741,14 +741,14 @@ def max_pool2d(x, "data_format": data_format, }) - return (pool_out, mask) if return_indices else pool_out + return (pool_out, mask) if return_mask else pool_out def max_pool3d(x, kernel_size, stride=None, padding=0, - return_indices=False, + return_mask=False, ceil_mode=False, data_format="NCDHW", name=None): @@ -773,7 +773,7 @@ def max_pool3d(x, 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): ${ceil_mode_comment} - return_indices (bool): Whether to return the max indices along with the outputs. Default False. Only support "NDCHW" data_format. + return_mask (bool): Whether to return the max indices along with the outputs. Default False. 
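The renamed flag keeps its old restriction: pooling indices are only produced for channel-first layouts. A short dygraph sketch (input shape illustrative; the NDHWC check is in the function body below):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([1, 3, 32, 32, 32])
    out, indices = F.max_pool3d(x, kernel_size=2, stride=2, return_mask=True)
    # out.shape [1, 3, 16, 16, 16]; combining return_mask=True with
    # data_format="NDHWC" raises ValueError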
Only support "NDCHW" data_format. data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. The default is `"NCDHW"`. When it is `"NCDHW"`, the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. @@ -798,13 +798,13 @@ def max_pool3d(x, kernel_size=2, stride=2, padding=0) output.shape [1, 3, 16, 16, 16] - # for return_indices=True + # for return_mask=True x = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32, 32, 32]).astype(np.float32)) output, max_indices = paddle.nn.functional.max_pool3d(x, kernel_size = 2, stride = 2, padding=0, - return_indices=True) + return_mask=True) # output.shape [None, 3, 16, 16, 16], max_indices.shape [None, 3, 16, 16, 16], """ check_variable_and_dtype(x, 'x', ['float32', 'float64'], 'max_pool3d') @@ -819,20 +819,20 @@ def max_pool3d(x, padding, padding_algorithm = _update_padding_nd( padding, 3, channel_last=channel_last, ceil_mode=ceil_mode) - if data_format == "NDHWC" and return_indices: + if data_format == "NDHWC" and return_mask: raise ValueError( - "When setting return_indices to true, data_format must be set to NCDHW in API:max_pool3d" + "When setting return_mask to true, data_format must be set to NCDHW in API:max_pool3d" ) if in_dygraph_mode(): - if return_indices: + if return_mask: output = core.ops.max_pool3d_with_index( x, 'pooling_type', 'max', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, 'global_pooling', False, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, 'use_mkldnn', False, 'exclusive', True, 'data_format', data_format) - return output if return_indices else output[0] + return output if return_mask else output[0] else: output = core.ops.pool3d( x, 'pooling_type', 'max', 'ksize', kernel_size, @@ -842,7 +842,7 @@ def max_pool3d(x, 'data_format', data_format) return output - op_type = "max_pool3d_with_index" if return_indices else "pool3d" + op_type = "max_pool3d_with_index" if return_mask else "pool3d" helper = LayerHelper(op_type, **locals()) dtype = helper.input_dtype() pool_out = helper.create_variable_for_type_inference(dtype) @@ -867,7 +867,7 @@ def max_pool3d(x, "data_format": data_format, }) - return (pool_out, mask) if return_indices else pool_out + return (pool_out, mask) if return_mask else pool_out def adaptive_avg_pool1d(x, output_size, name=None): @@ -1148,7 +1148,7 @@ def adaptive_avg_pool3d(x, output_size, data_format='NCDHW', name=None): return pool_out -def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): +def adaptive_max_pool1d(x, output_size, return_mask=False, name=None): """ This API implements adaptive max pooling 1d operation. See more details in :ref:`api_nn_pooling_AdaptiveMaxPool1d` . @@ -1159,7 +1159,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): where N is batch size, C is the number of channels, L is the length of the feature. The data type is float32 or float64. output_size (int): The pool kernel size. The value should be an integer. - return_indices (bool): If true, the index of max pooling point will be returned along + return_mask (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. 
Usually name is no need to set and @@ -1190,7 +1190,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): data = paddle.to_tensor(np.random.uniform(-1, 1, [1, 3, 32]).astype(np.float32)) pool_out = F.adaptive_max_pool1d(data, output_size=16) # pool_out shape: [1, 3, 16]) - pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_indices=True) + pool_out, indices = F.adaptive_max_pool1d(data, output_size=16, return_mask=True) # pool_out shape: [1, 3, 16] indices shape: [1, 3, 16] """ pool_type = 'max' @@ -1198,7 +1198,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): 'adaptive_max_pool1d') _check_input(x, 3) check_type(output_size, 'pool_size', int, 'adaptive_max_pool1d') - check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool1d') + check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool1d') pool_size = [1] + utils.convert_to_list(output_size, 1, 'pool_size') @@ -1209,7 +1209,7 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): pool_out = core.ops.max_pool2d_with_index( x, 'pooling_type', pool_type, 'ksize', pool_size, 'adaptive', True) return (squeeze(pool_out[0], [2]), squeeze( - pool_out[1], [2])) if return_indices else squeeze(pool_out[0], [2]) + pool_out[1], [2])) if return_mask else squeeze(pool_out[0], [2]) helper = LayerHelper(l_type, **locals()) dtype = helper.input_dtype() @@ -1229,10 +1229,10 @@ def adaptive_max_pool1d(x, output_size, return_indices=False, name=None): }) return (squeeze(pool_out, [2]), - squeeze(mask, [2])) if return_indices else squeeze(pool_out, [2]) + squeeze(mask, [2])) if return_mask else squeeze(pool_out, [2]) -def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): +def adaptive_max_pool2d(x, output_size, return_mask=False, name=None): """ This operation applies a 2D adaptive max pooling on input tensor. See more details in :ref:`api_nn_pooling_AdaptiveMaxPool2d` . @@ -1240,7 +1240,7 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): Args: x (Tensor): The input tensor of adaptive max pool2d operator, which is a 4-D tensor. The data type can be float16, float32, float64, int32 or int64. output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two elements, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. - return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. + return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. 
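The adaptive pooling functions take the same rename. A small dygraph sketch (output_size and input shape are illustrative):

.. code-block:: python

    import paddle
    import paddle.nn.functional as F

    x = paddle.rand([1, 3, 32, 32])

    # formerly return_indices=True; dygraph mode returns (output, indices)
    out, indices = F.adaptive_max_pool2d(x, output_size=3, return_mask=True)
    # out.shape [1, 3, 3, 3], indices.shape [1, 3, 3, 3]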
Returns: @@ -1280,7 +1280,7 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): 'adaptive_max_pool2d') _check_input(x, 4) #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool2d') - check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool2d') + check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool2d') in_h, in_w = x.shape[2:4] if isinstance(output_size, int): @@ -1295,7 +1295,7 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): if in_dygraph_mode(): pool_out = core.ops.max_pool2d_with_index( x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True) - return pool_out if return_indices else pool_out[0] + return pool_out if return_mask else pool_out[0] l_type = 'max_pool2d_with_index' @@ -1315,11 +1315,11 @@ def adaptive_max_pool2d(x, output_size, return_indices=False, name=None): "ksize": output_size, "adaptive": True, }) - #return (pool_out, mask) if return_indices else pool_out + #return (pool_out, mask) if return_mask else pool_out return pool_out -def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): +def adaptive_max_pool3d(x, output_size, return_mask=False, name=None): """ This operation applies a 3D adaptive max pooling on input tensor. See more details in :ref:`api_nn_pooling_AdaptiveMaxPool3d` . @@ -1327,7 +1327,7 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): Args: x (Tensor): The input tensor of adaptive max pool3d operator, which is a 5-D tensor. The data type can be float32, float64. output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. - return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. + return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. 
Returns: @@ -1371,7 +1371,7 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): 'adaptive_max_pool3d') _check_input(x, 5) #check_type(output_size, 'pool_size', (int), 'adaptive_max_pool3d') - check_type(return_indices, 'return_indices', bool, 'adaptive_max_pool3d') + check_type(return_mask, 'return_mask', bool, 'adaptive_max_pool3d') in_l, in_h, in_w = x.shape[2:5] if isinstance(output_size, int): @@ -1388,7 +1388,7 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): if in_dygraph_mode(): pool_out = core.ops.max_pool3d_with_index( x, 'pooling_type', 'max', 'ksize', output_size, 'adaptive', True) - return pool_out if return_indices else pool_out[0] + return pool_out if return_mask else pool_out[0] l_type = 'max_pool3d_with_index' @@ -1409,4 +1409,4 @@ def adaptive_max_pool3d(x, output_size, return_indices=False, name=None): "adaptive": True, }) - return (pool_out, mask) if return_indices else pool_out + return (pool_out, mask) if return_mask else pool_out diff --git a/python/paddle/nn/layer/conv.py b/python/paddle/nn/layer/conv.py index 51c466d113f02..f97e549464738 100644 --- a/python/paddle/nn/layer/conv.py +++ b/python/paddle/nn/layer/conv.py @@ -427,7 +427,7 @@ def __init__(self, data_format=data_format) def forward(self, x, output_size=None): - out = F.conv_transpose1d( + out = F.conv1d_transpose( x, self.weight, bias=self.bias, @@ -748,7 +748,7 @@ def forward(self, x, output_size=None): else: output_padding = 0 - out = F.conv_transpose2d( + out = F.conv2d_transpose( x, self.weight, bias=self.bias, @@ -954,16 +954,16 @@ class Conv3DTranspose(_ConvNd): **Note**: - The conv_transpose3d can be seen as the backward of the conv3d. For conv3d, + The conv3d_transpose can be seen as the backward of the conv3d. For conv3d, when stride > 1, conv3d maps multiple input shape to the same output shape, - so for conv_transpose3d, when stride > 1, input shape maps multiple output shape. + so for conv3d_transpose, when stride > 1, input shape maps multiple output shape. If output_size is None, :math:`H_{out} = H^\prime_{out}, :math:`H_{out} = \ H^\prime_{out}, W_{out} = W^\prime_{out}`; else, the :math:`D_{out}` of the output size must between :math:`D^\prime_{out}` and :math:`D^\prime_{out} + strides[0]`, the :math:`H_{out}` of the output size must between :math:`H^\prime_{out}` and :math:`H^\prime_{out} + strides[1]`, and the :math:`W_{out}` of the output size must between :math:`W^\prime_{out}` and :math:`W^\prime_{out} + strides[2]`, - conv_transpose3d can compute the kernel size automatically. + conv3d_transpose can compute the kernel size automatically. Parameters: in_channels(int): The number of channels in the input image. @@ -1086,7 +1086,7 @@ def forward(self, x, output_size=None): else: output_padding = 0 - out = F.conv_transpose3d( + out = F.conv3d_transpose( x, self.weight, bias=self.bias, diff --git a/python/paddle/nn/layer/norm.py b/python/paddle/nn/layer/norm.py index a996844c8f5a8..5e2292d40d2bf 100644 --- a/python/paddle/nn/layer/norm.py +++ b/python/paddle/nn/layer/norm.py @@ -73,7 +73,6 @@ def __init__(self, momentum=0.9, weight_attr=None, bias_attr=None, - track_running_stats=False, data_format="NCHW", name=None): super(_InstanceNormBase, self).__init__() @@ -135,9 +134,6 @@ class InstanceNorm1D(_InstanceNormBase): epsilon(float, optional): A value added to the denominator for numerical stability. Default is 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. 
- track_running_stats(bool, optional): Whether to use global mean and - variance. In train mode, when setting track_running_stats True, the global mean - and variance are also used during train period. Default: False. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. @@ -159,9 +155,6 @@ class InstanceNorm1D(_InstanceNormBase): Returns: None. - **Note**: - Momentum and track_running_stats is not effective. The next version will fix the problem . - Examples: @@ -214,9 +207,6 @@ class InstanceNorm2D(_InstanceNormBase): epsilon(float, optional): A value added to the denominator for numerical stability. Default is 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - track_running_stats(bool, optional): Whether to use global mean and - variance. In train mode, when setting track_running_stats True, the global mean - and variance are also used during train period. Default: False. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. @@ -237,8 +227,6 @@ class InstanceNorm2D(_InstanceNormBase): Returns: None. - **Note**: - Momentum and track_running_stats is not effective. The next version will fix the problem . Examples: @@ -290,9 +278,6 @@ class InstanceNorm3D(_InstanceNormBase): epsilon(float, optional): A value added to the denominator for numerical stability. Default is 1e-5. momentum(float, optional): The value used for the moving_mean and moving_var computation. Default: 0.9. - track_running_stats(bool, optional): Whether to use global mean and - variance. In train mode, when setting track_running_stats True, the global mean - and variance are also used during train period. Default: False. weight_attr(ParamAttr|bool, optional): The parameter attribute for Parameter `scale` of instance_norm. If it is set to None or one attribute of ParamAttr, instance_norm will create ParamAttr as weight_attr, the name of scale can be set in ParamAttr. @@ -313,8 +298,6 @@ class InstanceNorm3D(_InstanceNormBase): Returns: None. - **Note**: - Momentum and track_running_stats is not effective. The next version will fix the problem . Examples: @@ -570,7 +553,6 @@ def __init__(self, weight_attr=None, bias_attr=None, data_format='NCHW', - track_running_stats=True, name=None): super(_BatchNormBase, self).__init__() self._num_features = num_features @@ -636,7 +618,6 @@ def __init__(self, self._momentum = momentum self._epsilon = epsilon self._fuse_with_relu = False - self._track_running_stats = track_running_stats self._name = name def _check_input_dim(self, input): @@ -651,11 +632,7 @@ def forward(self, input): self._check_input_dim(input) - if not self.training and not self._track_running_stats: - raise ValueError( - 'When inference, expected track_running_stats is True.') - - if self.training and not self._track_running_stats: + if self.training: warnings.warn( "When training, we now always track global mean and variance.") @@ -720,9 +697,6 @@ class BatchNorm1D(_BatchNormBase): will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. 
data_format(str, optional): Specify the input data format, may be "NC", "NCL" or "NLC". Defalut "NCL". - track_running_stats(bool, optional): Whether to use global mean and variance. In train period, - True will track global mean and variance used for inference. When inference, track_running_stats must be - True. Default: True. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: @@ -732,9 +706,6 @@ class BatchNorm1D(_BatchNormBase): Returns: None. - - **Note**: - Now track_running_stats is actucal always true. The next version will fix the problem . Examples: @@ -817,9 +788,6 @@ class BatchNorm2D(_BatchNormBase): will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, the data format can be "NCHW" or "NHWC". Default: NCHW. - track_running_stats(bool, optional): Whether to use global mean and variance. In train period, - True will track global mean and variance used for inference. When inference, track_running_stats must be - True. Default: True. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: @@ -830,9 +798,6 @@ class BatchNorm2D(_BatchNormBase): Returns: None - **Note**: - Now track_running_stats is actucal always true. The next version will fix the problem . - Examples: .. code-block:: python @@ -912,9 +877,6 @@ class BatchNorm3D(_BatchNormBase): will create ParamAttr as bias_attr. If it is set to Fasle, the weight is not learnable. If the Initializer of the bias_attr is not set, the bias is initialized zero. Default: None. data_format(str, optional): Specify the input data format, the data format can be "NCDHW" or "NDHWC. Default: NCDHW. - track_running_stats(bool, optional): Whether to use global mean and variance. In train period, - True will track global mean and variance used for inference. When inference, track_running_stats must be - True. Default: True. name(str, optional): Name for the BatchNorm, default is None. For more information, please refer to :ref:`api_guide_Name`.. Shape: @@ -925,9 +887,6 @@ class BatchNorm3D(_BatchNormBase): Returns: None - **Note**: - Now track_running_stats is actucal always true. The next version will fix the problem . - Examples: .. code-block:: python @@ -1024,8 +983,6 @@ class SyncBatchNorm(_BatchNormBase): will create ParamAttr as bias_attr. If the Initializer of the bias_attr is not set, the bias is initialized zero. If it is set to False, this layer will not have trainable bias parameter. Default: None. - track_running_stats(bool, optional): Whether to compute global stats, which including running mean and - running variance. Default: True. Shapes: input: Tensor that the dimension from 2 to 5. 
@@ -1055,11 +1012,10 @@ def __init__(self, weight_attr=None, bias_attr=None, data_format='NCHW', - track_running_stats=True, name=None): super(SyncBatchNorm, self).__init__(num_features, momentum, epsilon, weight_attr, - bias_attr, data_format, track_running_stats, name) + bias_attr, data_format, name) def forward(self, x): # create output @@ -1147,10 +1103,10 @@ def convert_sync_batchnorm(cls, layer): """ layer_output = layer if isinstance(layer, _BatchNormBase): - layer_output = SyncBatchNorm( - layer._num_features, layer._momentum, layer._epsilon, - layer._weight_attr, layer._bias_attr, layer._data_format, - layer._track_running_stats, layer._name) + layer_output = SyncBatchNorm(layer._num_features, layer._momentum, + layer._epsilon, layer._weight_attr, + layer._bias_attr, layer._data_format, + layer._name) if layer._weight_attr != False and layer._bias_attr != False: with no_grad(): diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py index 9e544cb02e70e..0b0a4909f8550 100755 --- a/python/paddle/nn/layer/pooling.py +++ b/python/paddle/nn/layer/pooling.py @@ -35,7 +35,7 @@ class AvgPool1D(layers.Layer): """ This operation applies a 1D average pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. + of several input planes, based on the input, output_size, return_mask parameters. Input(X) and output(Out) are in NCL format, where N is batch size, C is the number of channels, L is the length of the feature. The output tensor shape will be [N, C, output_size]. @@ -61,7 +61,7 @@ class AvgPool1D(layers.Layer): 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - count_include_pad (bool): Whether to exclude padding points in average pooling + exclusive (bool): Whether to exclude padding points in average pooling mode, default is `True`. ceil_mode (bool): ${ceil_mode_comment}Whether to use the ceil function to calculate output height and width. If it is set to False, the floor function will be used. The default value is False. @@ -103,7 +103,7 @@ def __init__(self, kernel_size, stride=None, padding=0, - count_include_pad=True, + exclusive=True, ceil_mode=False, name=None): super(AvgPool1D, self).__init__() @@ -111,12 +111,12 @@ def __init__(self, self.stride = stride self.padding = padding self.ceil_mode = ceil_mode - self.count_include_pad = count_include_pad + self.exclusive = exclusive self.name = name def forward(self, x): out = F.avg_pool1d(x, self.kernel_size, self.stride, self.padding, - self.count_include_pad, self.ceil_mode, self.name) + self.exclusive, self.ceil_mode, self.name) return out @@ -156,7 +156,7 @@ class AvgPool2D(layers.Layer): 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - count_include_pad (bool): Whether to exclude padding points in average pooling + exclusive (bool): Whether to exclude padding points in average pooling mode, default is `true`. divisor_override (float): if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. 
data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. @@ -197,7 +197,7 @@ def __init__(self, stride=None, padding=0, ceil_mode=False, - count_include_pad=True, + exclusive=True, divisor_override=None, data_format="NCHW", name=None): @@ -206,7 +206,7 @@ def __init__(self, self.stride = stride self.padding = padding self.ceil_mode = ceil_mode - self.count_include_pad = count_include_pad + self.exclusive = exclusive self.divisor = divisor_override self.data_format = data_format self.name = name @@ -218,7 +218,7 @@ def forward(self, x): stride=self.stride, padding=self.padding, ceil_mode=self.ceil_mode, - count_include_pad=self.count_include_pad, + exclusive=self.exclusive, divisor_override=self.divisor, data_format=self.data_format, name=self.name) @@ -247,7 +247,7 @@ class AvgPool3D(layers.Layer): 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): ${ceil_mode_comment} - count_include_pad (bool): Whether to exclude padding points in average pooling + exclusive (bool): Whether to exclude padding points in average pooling mode, default is True. divisor_override (int|float) if specified, it will be used as divisor, otherwise kernel_size will be used. Default None. data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. @@ -289,7 +289,7 @@ def __init__(self, stride, padding=0, ceil_mode=False, - count_include_pad=True, + exclusive=True, divisor_override=None, data_format="NCDHW", name=None): @@ -298,7 +298,7 @@ def __init__(self, self.stride = stride self.padding = padding self.ceil_mode = ceil_mode - self.count_include_pad = count_include_pad + self.exclusive = exclusive self.divisor = divisor_override self.data_format = data_format self.name = name @@ -310,7 +310,7 @@ def forward(self, x): stride=self.stride, padding=self.padding, ceil_mode=self.ceil_mode, - count_include_pad=self.count_include_pad, + exclusive=self.exclusive, divisor_override=self.divisor, data_format=self.data_format, name=self.name) @@ -319,7 +319,7 @@ def forward(self, x): class MaxPool1D(layers.Layer): """ Applies a 1D max pooling over an input signal composed of several input planes based - on the input, output_size, return_indices parameters. + on the input, output_size, return_mask parameters. Input(X) and output(Out) are in NCL format, where N is batch size, C is the number of channels, L is the length of the feature. @@ -343,7 +343,7 @@ class MaxPool1D(layers.Layer): 4. A list[int] or tuple(int) whose length is 2. It has the form [pad_before, pad_after]. 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. - return_indices (bool): Whether return the max indices along with the outputs. default is `False`. + return_mask (bool): Whether return the max indices along with the outputs. default is `False`. ceil_mode (bool): Whether to use the ceil function to calculate output height and width. False is the default. If it is set to False, the floor function will be used. Default False. 
name(str, optional): For detailed information, please refer @@ -377,7 +377,7 @@ class MaxPool1D(layers.Layer): pool_out = MaxPool1D(data) # pool_out shape: [1, 3, 16] - MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0, return_indices=True) + MaxPool1D = nn.MaxPool1D(kernel_size=2, stride=2, padding=0, return_mask=True) pool_out, indices = MaxPool1D(data) # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] @@ -387,7 +387,7 @@ def __init__(self, kernel_size, stride=None, padding=0, - return_indices=False, + return_mask=False, ceil_mode=False, name=None): super(MaxPool1D, self).__init__() @@ -395,12 +395,12 @@ def __init__(self, self.stride = stride self.padding = padding self.ceil_mode = ceil_mode - self.return_indices = return_indices + self.return_mask = return_mask self.name = name def forward(self, input): out = F.max_pool1d(input, self.kernel_size, self.stride, self.padding, - self.return_indices, self.ceil_mode, self.name) + self.return_mask, self.ceil_mode, self.name) return out @@ -440,7 +440,7 @@ class MaxPool2D(layers.Layer): 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): when True, will use `ceil` instead of `floor` to compute the output shape - return_indices (bool): Whether to return the max indices along with the outputs. + return_mask (bool): Whether to return the max indices along with the outputs. data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. @@ -473,8 +473,8 @@ class MaxPool2D(layers.Layer): output = MaxPool2D(input) # output.shape [1, 3, 16, 16] - # for return_indices=True - MaxPool2D = nn.MaxPool2D(kernel_size=2,stride=2, padding=0, return_indices=True) + # for return_mask=True + MaxPool2D = nn.MaxPool2D(kernel_size=2, stride=2, padding=0, return_mask=True) output, max_indices = MaxPool2D(input) # output.shape [1, 3, 16, 16], max_indices.shape [1, 3, 16, 16], """ @@ -483,7 +483,7 @@ def __init__(self, kernel_size, stride=None, padding=0, - return_indices=False, + return_mask=False, ceil_mode=False, data_format="NCHW", name=None): @@ -491,7 +491,7 @@ def __init__(self, self.ksize = kernel_size self.stride = stride self.padding = padding - self.return_indices = return_indices + self.return_mask = return_mask self.ceil_mode = ceil_mode self.data_format = data_format self.name = name @@ -502,7 +502,7 @@ def forward(self, x): kernel_size=self.ksize, stride=self.stride, padding=self.padding, - return_indices=self.return_indices, + return_mask=self.return_mask, data_format=self.data_format, name=self.name) @@ -530,7 +530,7 @@ class MaxPool3D(layers.Layer): 5. A list or tuple of pairs of integers. It has the form [[pad_before, pad_after], [pad_before, pad_after], ...]. Note that, the batch dimension and channel dimension should be [0,0] or (0,0). The default value is 0. ceil_mode (bool): ${ceil_mode_comment} - return_indices (bool): Whether to return the max indices along with the outputs. + return_mask (bool): Whether to return the max indices along with the outputs. data_format (string): The data format of the input and output data. An optional string from: `"NCDHW"`, `"NDHWC"`. The default is `"NCDHW"`. 
When it is `"NCDHW"`, the data is stored in the order of: `[batch_size, input_channels, input_depth, input_height, input_width]`. @@ -564,8 +564,8 @@ class MaxPool3D(layers.Layer): output = MaxPool3D(input) # output.shape [1, 2, 3, 16, 16] - # for return_indices=True - MaxPool3D = nn.MaxPool3D(kernel_size=2,stride=2, padding=0, return_indices=True) + # for return_mask=True + MaxPool3D = nn.MaxPool3D(kernel_size=2, stride=2, padding=0, return_mask=True) output, max_indices = MaxPool3D(input) # output.shape [1, 2, 3, 16, 16], max_indices.shape [1, 2, 3, 16, 16], """ @@ -574,7 +574,7 @@ def __init__(self, kernel_size, stride, padding, - return_indices=False, + return_mask=False, ceil_mode=False, data_format="NCDHW", name=None): @@ -582,7 +582,7 @@ def __init__(self, self.ksize = kernel_size self.stride = stride self.padding = padding - self.return_indices = return_indices + self.return_mask = return_mask self.ceil_mode = ceil_mode self.data_format = data_format self.name = name @@ -593,7 +593,7 @@ def forward(self, x): kernel_size=self.ksize, stride=self.stride, padding=self.padding, - return_indices=self.return_indices, + return_mask=self.return_mask, data_format=self.data_format, name=self.name) @@ -602,7 +602,7 @@ class AdaptiveAvgPool1D(layers.Layer): """ This operation applies a 1D adaptive average pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. + of several input planes, based on the input, output_size, return_mask parameters. Input(X) and output(Out) are in NCL format, where N is batch size, C is the number of channels, L is the length of the feature. The output tensor shape will be [N, C, output_size]. @@ -841,7 +841,7 @@ class AdaptiveMaxPool1D(layers.Layer): """ This operation applies a 1D adaptive max pooling over an input signal composed - of several input planes, based on the input, output_size, return_indices parameters. + of several input planes, based on the input, output_size, return_mask parameters. Input(X) and output(Out) are in NCL format, where N is batch size, C is the number of channels, L is the length of the feature. The output tensor shape will be [N, C, output_size]. @@ -859,7 +859,7 @@ class AdaptiveMaxPool1D(layers.Layer): Args: output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain one int. - return_indices (bool): If true, the index of max pooling point will be returned along + return_mask (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. 
Usually name is no need to set and @@ -898,22 +898,22 @@ class AdaptiveMaxPool1D(layers.Layer): pool_out = AdaptiveMaxPool1D(data) # pool_out shape: [1, 3, 16] - # for return_indices = true - AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16, return_indices=True) + # for return_mask = true + AdaptiveMaxPool1D = nn.AdaptiveMaxPool1D(output_size=16, return_mask=True) pool_out, indices = AdaptiveMaxPool1D(data) # pool_out shape: [1, 3, 16], indices shape: [1, 3, 16] """ - def __init__(self, output_size, return_indices=False, name=None): + def __init__(self, output_size, return_mask=False, name=None): super(AdaptiveMaxPool1D, self).__init__() self.output_size = output_size - self.return_indices = return_indices + self.return_mask = return_mask self.name = name def forward(self, input): - return F.adaptive_max_pool1d(input, self.output_size, - self.return_indices, self.name) + return F.adaptive_max_pool1d(input, self.output_size, self.return_mask, + self.name) class AdaptiveMaxPool2D(layers.Layer): @@ -932,7 +932,7 @@ class AdaptiveMaxPool2D(layers.Layer): Output(i ,j) &= max(Input[hstart:hend, wstart:wend]) Parameters: output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two element, (H, W). H and W can be either a int, or None which means the size will be the same as that of the input. - return_indices (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. + return_mask (bool): If true, the index of max pooling point will be returned along with outputs. It cannot be set in average pooling type. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. @@ -965,21 +965,21 @@ class AdaptiveMaxPool2D(layers.Layer): paddle.disable_static() input_data = np.random.rand(2, 3, 32, 32) x = paddle.to_tensor(input_data) - adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=3, return_indices=True) + adaptive_max_pool = paddle.nn.AdaptiveMaxPool2D(output_size=3, return_mask=True) pool_out, indices = adaptive_max_pool(x = x) """ - def __init__(self, output_size, return_indices=False, name=None): + def __init__(self, output_size, return_mask=False, name=None): super(AdaptiveMaxPool2D, self).__init__() self._output_size = output_size - self._return_indices = return_indices + self._return_mask = return_mask self._name = name def forward(self, x): return F.adaptive_max_pool2d( x, output_size=self._output_size, - return_indices=self._return_indices, + return_mask=self._return_mask, name=self._name) @@ -1002,7 +1002,7 @@ class AdaptiveMaxPool3D(layers.Layer): Parameters: output_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain three elements, (D, H, W). D, H and W can be either a int, or None which means the size will be the same as that of the input. - return_indices (bool): If true, the index of max pooling point will be returned along with outputs. Default False. + return_mask (bool): If true, the index of max pooling point will be returned along with outputs. Default False. name(str, optional): For detailed information, please refer to :ref:`api_guide_Name`. Usually name is no need to set and None by default. 
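For reference outside the docstrings, a minimal sketch of the renamed flag in use (this assumes the renames in this patch are applied; shapes are illustrative):

    import numpy as np
    import paddle

    paddle.disable_static()
    x = paddle.to_tensor(np.random.rand(2, 3, 32, 32).astype('float32'))
    pool = paddle.nn.AdaptiveMaxPool2D(output_size=3, return_mask=True)
    out, mask = pool(x)
    # out and mask both have shape [2, 3, 3, 3]; with the default
    # return_mask=False only `out` is returned.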
@@ -1040,21 +1040,21 @@ class AdaptiveMaxPool3D(layers.Layer): pool = paddle.nn.AdaptiveMaxPool3D(output_size=4) out = pool(x) # out shape: [2, 3, 4, 4, 4] - pool = paddle.nn.AdaptiveMaxPool3D(output_size=3, return_indices=True) + pool = paddle.nn.AdaptiveMaxPool3D(output_size=3, return_mask=True) out, indices = pool(x) # out shape: [2, 3, 4, 4, 4], indices shape: [2, 3, 4, 4, 4] """ - def __init__(self, output_size, return_indices=False, name=None): + def __init__(self, output_size, return_mask=False, name=None): super(AdaptiveMaxPool3D, self).__init__() self._output_size = output_size - self._return_indices = return_indices + self._return_mask = return_mask self._name = name def forward(self, x): return F.adaptive_max_pool3d( x, output_size=self._output_size, - return_indices=self._return_indices, + return_mask=self._return_mask, name=self._name) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 958bfb304fb14..eaade222388fa 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -66,8 +66,6 @@ from .logic import logical_or #DEFINE_ALIAS from .logic import logical_xor #DEFINE_ALIAS from .logic import not_equal #DEFINE_ALIAS -# from .logic import reduce_all #DEFINE_ALIAS -# from .logic import reduce_any #DEFINE_ALIAS from .logic import allclose #DEFINE_ALIAS from .logic import equal_all #DEFINE_ALIAS # from .logic import isnan #DEFINE_ALIAS @@ -164,6 +162,8 @@ from .math import isinf #DEFINE_ALIAS from .math import isnan #DEFINE_ALIAS from .math import prod #DEFINE_ALIAS +from .math import all #DEFINE_ALIAS +from .math import any #DEFINE_ALIAS from .random import multinomial #DEFINE_ALIAS from .random import standard_normal from .random import normal diff --git a/python/paddle/tensor/logic.py b/python/paddle/tensor/logic.py index 27671a4f15747..da08270d742e5 100644 --- a/python/paddle/tensor/logic.py +++ b/python/paddle/tensor/logic.py @@ -29,6 +29,8 @@ from ..fluid.layers import logical_not #DEFINE_ALIAS from ..fluid.layers import logical_or #DEFINE_ALIAS from ..fluid.layers import logical_xor #DEFINE_ALIAS +from ..fluid.layers import reduce_all #DEFINE_ALIAS +from ..fluid.layers import reduce_any #DEFINE_ALIAS __all__ = [ 'equal', diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 895d0c175905c..36793e0769672 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -21,7 +21,7 @@ from paddle.tensor import cast import paddle from ..fluid import layers -from ..fluid.framework import core, _varbase_creator, in_dygraph_mode, Variable +from ..fluid.framework import core, _varbase_creator, in_dygraph_mode, Variable, convert_np_dtype_to_dtype_ from ..fluid.layer_helper import LayerHelper from ..fluid.data_feeder import check_variable_and_dtype, check_type, check_dtype, convert_dtype from ..fluid.layers.layer_function_generator import _generate_doc_string_, generate_activation_fn, generate_layer_fn @@ -46,6 +46,8 @@ from ..fluid.layers import floor #DEFINE_ALIAS from ..fluid.layers import log #DEFINE_ALIAS from ..fluid.layers import reciprocal #DEFINE_ALIAS +from ..fluid.layers import reduce_all #DEFINE_ALIAS +from ..fluid.layers import reduce_any #DEFINE_ALIAS # from ..fluid.layers import reduce_max #DEFINE_ALIAS # from ..fluid.layers import reduce_min #DEFINE_ALIAS # from ..fluid.layers import reduce_prod #DEFINE_ALIAS @@ -1933,3 +1935,201 @@ def increment(x, value=1.0, name=None): outputs={'Out': [x]}, attrs={'step': float(value)}) return x + + +def all(x, axis=None, 
keepdim=False, name=None):
+    """
+    Computes the ``logical and`` of tensor elements over the given dimension.
+
+    Args:
+        x (Tensor): An N-D Tensor, the input data type should be `bool`.
+        axis (int|list|tuple, optional): The dimensions along which the ``logical and`` is computed. If
+            :attr:`None`, all elements of :attr:`x` are reduced and a
+            Tensor with a single element is returned, otherwise it must be in the
+            range :math:`[-rank(x), rank(x))`. If :math:`axis[i] < 0`,
+            the dimension to reduce is :math:`rank + axis[i]`.
+        keepdim (bool, optional): Whether to reserve the reduced dimension in the
+            output Tensor. The result Tensor will have one fewer dimension
+            than the :attr:`x` unless :attr:`keepdim` is true, default
+            value is False.
+        name (str, optional): The default value is None. Normally there is no need for
+            user to set this property. For more information, please refer to :ref:`api_guide_Name`
+
+    Returns:
+        Tensor: The result of the ``logical and`` on the specified axis of input Tensor `x`; its data type is bool.
+
+    Raises:
+        ValueError: If the data type of `x` is not bool.
+        TypeError: The type of :attr:`axis` must be int, list or tuple.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            import paddle.fluid.layers as layers
+            import numpy as np
+
+            # set as dynamic mode
+            paddle.disable_static()
+
+            # x is a bool Tensor variable with following elements:
+            #    [[True, False]
+            #     [True, True]]
+            x = layers.assign(np.array([[1, 0], [1, 1]], dtype='int32'))
+            print(x)
+            x = layers.cast(x, 'bool')
+
+            # out1 should be [False]
+            out1 = paddle.all(x)  # [False]
+            print(out1)
+
+            # out2 should be [True, False]
+            out2 = paddle.all(x, axis=0)  # [True, False]
+            print(out2)
+
+            # keepdim=False, out3 should be [False, True], out.shape should be (2,)
+            out3 = paddle.all(x, axis=-1)  # [False, True]
+            print(out3)
+
+            # keepdim=True, out4 should be [[False], [True]], out.shape should be (2, 1)
+            out4 = paddle.all(x, axis=1, keepdim=True)
+            out4 = layers.cast(out4, 'int32')  # [[False], [True]]
+            print(out4)
+
+    """
+    if axis is not None and not isinstance(axis, (list, tuple)):
+        axis = [axis]
+
+    if not axis:
+        reduce_all_flag = True
+    else:
+        if len(axis) == len(x.shape):
+            reduce_all_flag = True
+        else:
+            reduce_all_flag = False
+
+    attrs = {
+        'dim': axis if axis != None and axis != [] and axis != () else [0],
+        'keep_dim': keepdim,
+        'reduce_all': reduce_all_flag
+    }
+
+    if in_dygraph_mode():
+        axis = axis if axis != None and axis != [] else [0]
+        return core.ops.reduce_all(x, 'dim', axis, 'keep_dim', keepdim,
+                                   'reduce_all', reduce_all_flag)
+
+    check_variable_and_dtype(x, 'x', ['bool'], 'all')
+    check_type(axis, 'axis', (int, list, tuple, type(None)), 'all')
+
+    helper = LayerHelper('all', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='reduce_all',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs=attrs)
+    return out
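Before ``any`` below, a standalone sketch of the axis-normalization rule that both reductions share (``_normalize_reduce_axis`` is a hypothetical helper, not part of the patch):

    def _normalize_reduce_axis(axis, ndim):
        # Mirrors the attrs computed above: a bare int becomes a one-element
        # list; a None/empty axis, or one covering every dimension, means
        # reduce over all elements.
        if axis is not None and not isinstance(axis, (list, tuple)):
            axis = [axis]
        reduce_all = not axis or len(axis) == ndim
        return (list(axis) if axis else [0]), reduce_all

    assert _normalize_reduce_axis(None, 2) == ([0], True)
    assert _normalize_reduce_axis(1, 2) == ([1], False)
    assert _normalize_reduce_axis((0, 1), 2) == ([0, 1], True)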
+
+
+def any(x, axis=None, keepdim=False, name=None):
+    """
+    Computes the ``logical or`` of tensor elements over the given dimension.
+
+    Args:
+        x (Tensor): An N-D Tensor, the input data type should be `bool`.
+        axis (int|list|tuple, optional): The dimensions along which the ``logical or`` is computed. If
+            :attr:`None`, all elements of :attr:`x` are reduced and a
+            Tensor with a single element is returned, otherwise it must be in the
+            range :math:`[-rank(x), rank(x))`. If :math:`axis[i] < 0`,
+            the dimension to reduce is :math:`rank + axis[i]`.
+        keepdim (bool, optional): Whether to reserve the reduced dimension in the
+            output Tensor. The result Tensor will have one fewer dimension
+            than the :attr:`x` unless :attr:`keepdim` is true, default
+            value is False.
+        name (str, optional): The default value is None. Normally there is no need for
+            user to set this property. For more information, please refer to :ref:`api_guide_Name`
+
+    Returns:
+        Tensor: The result of the ``logical or`` on the specified axis of input Tensor `x`; its data type is bool.
+
+    Raises:
+        ValueError: If the data type of `x` is not bool.
+        TypeError: The type of :attr:`axis` must be int, list or tuple.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import paddle.fluid as fluid
+            import paddle.fluid.layers as layers
+            import numpy as np
+
+            # set as dynamic mode
+            paddle.disable_static()
+
+            # x is a bool Tensor variable with following elements:
+            #    [[True, False]
+            #     [False, False]]
+            x = layers.assign(np.array([[1, 0], [0, 0]], dtype='int32'))
+            print(x)
+            x = layers.cast(x, 'bool')
+
+            # out1 should be [True]
+            out1 = paddle.any(x)  # [True]
+            print(out1)
+
+            # out2 should be [True, False]
+            out2 = paddle.any(x, axis=0)  # [True, False]
+            print(out2)
+
+            # keepdim=False, out3 should be [True, False], out.shape should be (2,)
+            out3 = paddle.any(x, axis=-1)  # [True, False]
+            print(out3)
+
+            # keepdim=True, result should be [[True], [False]], out.shape should be (2, 1)
+            out4 = paddle.any(x, axis=1, keepdim=True)
+            out4 = layers.cast(out4, 'int32')  # [[True], [False]]
+            print(out4)
+
+    """
+    if axis is not None and not isinstance(axis, (list, tuple)):
+        axis = [axis]
+
+    if not axis:
+        reduce_all_flag = True
+    else:
+        if len(axis) == len(x.shape):
+            reduce_all_flag = True
+        else:
+            reduce_all_flag = False
+
+    attrs = {
+        'dim': axis if axis != None and axis != [] and axis != () else [0],
+        'keep_dim': keepdim,
+        'reduce_all': reduce_all_flag
+    }
+
+    if in_dygraph_mode():
+        axis = axis if axis != None and axis != [] else [0]
+        return core.ops.reduce_any(x, 'dim', axis, 'keep_dim', keepdim,
+                                   'reduce_all', reduce_all_flag)
+
+    check_variable_and_dtype(x, 'x', ['bool'], 'any')
+    check_type(axis, 'axis', (int, list, tuple, type(None)), 'any')
+
+    helper = LayerHelper('any', **locals())
+    out = helper.create_variable_for_type_inference(dtype=x.dtype)
+    helper.append_op(
+        type='reduce_any',
+        inputs={'X': x},
+        outputs={'Out': out},
+        attrs=attrs)
+    return out
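A quick parity check of the two new reductions against numpy (a sketch one might run after applying this patch; it is not part of the patch itself):

    import numpy as np
    import paddle

    paddle.disable_static()
    data = np.array([[True, False], [True, True]])
    x = paddle.to_tensor(data)

    assert paddle.all(x).numpy()[0] == np.all(data)
    assert paddle.any(x).numpy()[0] == np.any(data)
    assert np.array_equal(paddle.all(x, axis=0).numpy(), np.all(data, axis=0))
    assert np.array_equal(paddle.any(x, axis=-1, keepdim=True).numpy(),
                          np.any(data, axis=-1, keepdims=True))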
From fb7f85291ba79d4e89d28f32aae42ee700e425e1 Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Mon, 26 Oct 2020 17:58:55 +0800
Subject: [PATCH 051/185] fix print tensor place,add cpu/cuda/pin_memory API for Tensor (#28200)

---
 paddle/fluid/pybind/imperative.cc             | 127 +++++++++++++++++
 paddle/fluid/pybind/pybind.cc                 |  15 ++-
 python/paddle/fluid/framework.py              |  38 +++---
 .../fluid/tests/unittests/test_var_base.py    |   9 ++
 python/paddle/tensor/creation.py              |  53 +++-----
 tools/wlist.json                              |   1 +
 6 files changed, 178 insertions(+), 65 deletions(-)

diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 4c46af3199e29..4d68afeede4e5 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -836,6 +836,127 @@ void BindImperative(py::module *m_ptr) {
             }
           },
           py::call_guard<py::gil_scoped_release>())
+      .def("cpu",
+           [](const std::shared_ptr<imperative::VarBase> &self) {
+             if (platform::is_cpu_place(self->Place())) {
+               return self;
+             } else {
+               auto new_var = self->NewVarBase(platform::CPUPlace(), true);
+               new_var->SetOverridedStopGradient(self->OverridedStopGradient());
+               return new_var;
+             }
+           },
+           R"DOC(
+      Returns a copy of this Tensor in CPU memory.
+
+      If this Tensor is already in CPU memory, then no copy is performed and the original Tensor is returned.
+
+      Examples:
+        .. code-block:: python
+
+          import paddle
+          x = paddle.to_tensor(1.0, place=paddle.CUDAPlace(0))
+          print(x.place)    # CUDAPlace(0)
+
+          y = x.cpu()
+          print(y.place)    # CPUPlace
+
+      )DOC")
+      .def("pin_memory",
+           [](const std::shared_ptr<imperative::VarBase> &self) {
+#ifndef PADDLE_WITH_CUDA
+             PADDLE_THROW(platform::errors::PermissionDenied(
+                 "Cannot copy this Tensor to pinned memory in CPU version "
+                 "Paddle, "
+                 "Please recompile or reinstall Paddle with CUDA support."));
+#endif
+             if (platform::is_cuda_pinned_place(self->Place())) {
+               return self;
+             } else {
+               auto new_var =
+                   self->NewVarBase(platform::CUDAPinnedPlace(), true);
+               new_var->SetOverridedStopGradient(self->OverridedStopGradient());
+               return new_var;
+             }
+           },
+           R"DOC(
+      Returns a copy of this Tensor in pin memory.
+
+      If this Tensor is already in pin memory, then no copy is performed and the original Tensor is returned.
+
+      Examples:
+        .. code-block:: python
+
+          import paddle
+          x = paddle.to_tensor(1.0, place=paddle.CUDAPlace(0))
+          print(x.place)      # CUDAPlace(0)
+
+          y = x.pin_memory()
+          print(y.place)      # CUDAPinnedPlace
+
+      )DOC")
+      .def("cuda",
+           [](const std::shared_ptr<imperative::VarBase> &self, int device_id,
+              bool blocking) {
+#ifndef PADDLE_WITH_CUDA
+             PADDLE_THROW(platform::errors::PermissionDenied(
+                 "Cannot copy this Tensor to GPU in CPU version Paddle, "
+                 "Please recompile or reinstall Paddle with CUDA support."));
+#else
+             int device_count = platform::GetCUDADeviceCount();
+             if (device_id == -1) {
+               if (platform::is_gpu_place(self->Place())) {
+                 return self;
+               } else {
+                 device_id = 0;
+               }
+             }
+             PADDLE_ENFORCE_GE(
+                 device_id, 0,
+                 platform::errors::InvalidArgument(
+                     "Can not copy Tensor to Invalid CUDAPlace(%d), device id "
+                     "must inside [0, %d)",
+                     device_id, device_count));
+             PADDLE_ENFORCE_LT(
+                 device_id, device_count,
+                 platform::errors::InvalidArgument(
+                     "Can not copy Tensor to Invalid CUDAPlace(%d), device id "
+                     "must inside [0, %d)",
+                     device_id, device_count));
+             platform::CUDAPlace place = platform::CUDAPlace(device_id);
+             if (platform::is_same_place(self->Place(), place)) {
+               return self;
+             } else {
+               auto new_var = self->NewVarBase(place, blocking);
+               new_var->SetOverridedStopGradient(self->OverridedStopGradient());
+               return new_var;
+             }
+#endif
+           },
+           py::arg("device_id") = -1, py::arg("blocking") = true, R"DOC(
+      Returns a copy of this Tensor in GPU memory.
+
+      If this Tensor is already in GPU memory and device_id is default,
+      then no copy is performed and the original Tensor is returned.
+
+      Args:
+        device_id(int, optional): The destination GPU device id. Defaults to the current device.
+        blocking(bool, optional): If False and the source is in pinned memory, the copy will be
+          asynchronous with respect to the host. Otherwise, the argument has no effect. Default: True.
+
+      Examples:
+        .. code-block:: python
+
+          import paddle
+          x = paddle.to_tensor(1.0, place=paddle.CPUPlace())
+          print(x.place)        # CPUPlace
+
+          y = x.cuda()
+          print(y.place)        # CUDAPlace(0)
+
+          y = x.cuda(1)
+          print(y.place)        # CUDAPlace(1)
+       )DOC")
      .def("_copy_to",
           [](const imperative::VarBase &self, const platform::CPUPlace &place,
              bool blocking) { return self.NewVarBase(place, blocking); },
@@ -950,12 +1071,14 @@ void BindImperative(py::module *m_ptr) {
           [](imperative::Tracer &self,
              std::unordered_set<std::string> &allow_ops,
              std::unordered_set<std::string> &block_ops) {
-            // NOTE(zhiqiu): The automatic conversion in pybind11 between c++
+            // NOTE(zhiqiu): The automatic conversion in pybind11 between
+            // c++
             // STL and python set/list/dict involve a copy operation that
             // prevents pass-by-reference semantics, so it is ok to swap.
             // The reason why not directly pass
             // std::shared_ptr<std::unordered_set<std::string>>
-            // is that pybind11 forbid shared_ptr<T> where T is not custom type.
+            // is that pybind11 forbid shared_ptr<T> where T is not custom
+            // type.
             imperative::AmpOperators::Instance().GetAllowOps()->swap(allow_ops);
             imperative::AmpOperators::Instance().GetBlockOps()->swap(block_ops);
           })
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 3d9d204991f79..8ff7e90065330 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1421,6 +1421,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("_get_device_id",
            [](platform::CUDAPlace &self) -> int { return self.GetDeviceId(); })
 #endif
+      .def("__repr__", string::to_string<const platform::CUDAPlace &>)
       .def("__str__", string::to_string<const platform::CUDAPlace &>);
 
   py::class_<platform::XPUPlace>(m, "XPUPlace", R"DOC(
@@ -1479,6 +1480,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("get_device_id",
            [](const platform::XPUPlace &self) { return self.GetDeviceId(); })
 #endif
+      .def("__repr__", string::to_string<const platform::XPUPlace &>)
       .def("__str__", string::to_string<const platform::XPUPlace &>);
 
   py::class_<platform::CPUPlace>(m, "CPUPlace", R"DOC(
@@ -1500,6 +1502,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def("_equals", &IsSamePlace)
       .def("_equals",
            &IsSamePlace)
+      .def("__repr__", string::to_string<const platform::CPUPlace &>)
       .def("__str__", string::to_string<const platform::CPUPlace &>);
 
   py::class_<platform::CUDAPinnedPlace>(m, "CUDAPinnedPlace", R"DOC(
@@ -1536,6 +1539,7 @@ All parameter, weight, gradient are variables in Paddle.
            &IsSamePlace)
       .def("_equals",
            &IsSamePlace)
+      .def("__repr__", string::to_string<const platform::CUDAPinnedPlace &>)
       .def("__str__", string::to_string<const platform::CUDAPinnedPlace &>);
 
   py::class_<platform::Place>(m, "Place")
@@ -1578,10 +1582,13 @@ All parameter, weight, gradient are variables in Paddle.
            [](platform::Place &self, const platform::CUDAPlace &gpu_place) {
              self = gpu_place;
            })
-      .def("set_place", [](platform::Place &self,
-                           const platform::CUDAPinnedPlace &cuda_pinned_place) {
-        self = cuda_pinned_place;
-      });
+      .def("set_place",
+           [](platform::Place &self,
+              const platform::CUDAPinnedPlace &cuda_pinned_place) {
+             self = cuda_pinned_place;
+           })
+      .def("__repr__", string::to_string<const platform::Place &>)
+      .def("__str__", string::to_string<const platform::Place &>);
 
   py::class_<OperatorBase>(m, "Operator")
       .def_static(
diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py
index 6be7fe0612e5a..904622caf45fc 100644
--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -1785,8 +1785,6 @@ class ComplexVariable(object):
     **Notes**:
         **The constructor of ComplexTensor should not be invoked directly.**
 
-        **Only support dygraph mode at present. Please use** :ref:`api_fluid_dygraph_to_variable` **to create a dygraph ComplexTensor with complex number data.**
-
     Args:
         real (Tensor): The Tensor holding real-part data.
         imag (Tensor): The Tensor holding imaginary-part data.
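A short sketch of the ComplexVariable accessors documented above (assuming a complex-valued input; exact float formatting may differ):

    import paddle

    paddle.disable_static()
    x = paddle.to_tensor([1.0 + 2.0j, 0.2])
    print(x.real.numpy())   # [1.  0.2]
    print(x.imag.numpy())   # [2. 0.]
    print(x.numpy())        # [1. +2.j 0.2+0.j]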
@@ -1795,14 +1793,14 @@ class ComplexVariable(object): .. code-block:: python import paddle - import numpy as np - - paddle.enable_imperative() x = paddle.to_tensor([1.0+2.0j, 0.2]) print(x.name, x.dtype, x.shape) - # ({'real': 'generated_tensor_0.real', 'imag': 'generated_tensor_0.imag'}, 'complex128', [2L]) - print(x.numpy()) - # [1. +2.j 0.2+0.j] + # ({'real': 'generated_tensor_0.real', 'imag': 'generated_tensor_0.imag'}, complex64, [2]) + print(x) + # ComplexTensor[real](shape=[2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [ 1., 0.20000000]) + # ComplexTensor[imag](shape=[2], dtype=float32, place=CUDAPlace(0), stop_gradient=True, + # [2., 0.]) print(type(x)) # """ @@ -1858,9 +1856,10 @@ def numpy(self): return self.real.numpy() + 1j * self.imag.numpy() def __str__(self): - return "ComplexTensor[real]: %s\n%s\nComplexTensor[imag]: %s\n%s" % ( - self.real.name, str(self.real.value().get_tensor()), self.imag.name, - str(self.imag.value().get_tensor())) + from paddle.tensor.to_string import to_string + return "ComplexTensor containing:\n{real}\n{imag}".format( + real=to_string(self.real, "[real part]Tensor"), + imag=to_string(self.imag, "[imag part]Tensor")) __repr__ = __str__ @@ -5335,16 +5334,13 @@ def __str__(self): .. code-block:: python import paddle - paddle.disable_static() - conv = paddle.nn.Conv2D(3, 3, 5) - print(conv.weight) - # Parameter: conv2d_0.w_0 - # - place: CUDAPlace(0) - # - shape: [3, 3, 5, 5] - # - layout: NCHW - # - dtype: float - # - data: [...] - paddle.enable_static() + linear = paddle.nn.Linear(3, 3) + print(linear.weight) + # Parameter containing: + # Tensor(shape=[3, 3], dtype=float32, place=CUDAPlace(0), stop_gradient=False, + # [[ 0.48948765, 0.05829060, -0.25524026], + # [-0.70368278, 0.52986908, -0.68742192], + # [-0.54217887, 0.48439729, 0.34082305]]) """ return "Parameter containing:\n{tensor}".format( tensor=super(ParamBase, self).__str__()) diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index ecbf2415247b1..42fd2de864d08 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -64,6 +64,15 @@ def _test_place(place): y.backward() self.assertTrue( np.array_equal(x.grad, np.array([2.4]).astype('float32'))) + y = x.cpu() + self.assertEqual(y.place.__repr__(), "CPUPlace") + if core.is_compiled_with_cuda(): + y = x.pin_memory() + self.assertEqual(y.place.__repr__(), "CUDAPinnedPlace") + y = x.cuda(blocking=False) + self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") + y = x.cuda(blocking=True) + self.assertEqual(y.place.__repr__(), "CUDAPlace(0)") # set_default_dtype take effect on complex x = paddle.to_tensor(1 + 2j, place=place, stop_gradient=False) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 65a33ade27a22..8aa94ae420342 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -90,61 +90,38 @@ def to_tensor(data, dtype=None, place=None, stop_gradient=True): .. 
code-block:: python

            import paddle
-            import numpy as np
-
-            paddle.disable_static()
 
            type(paddle.to_tensor(1))
            # <class 'paddle.Tensor'>

            paddle.to_tensor(1)
-            # Tensor: generated_tensor_0
-            #   - place: CUDAPlace(0)   # allocate on global default place CPU:0
-            #   - shape: [1]
-            #   - layout: NCHW
-            #   - dtype: int64_t
-            #   - data: [1]
+            # Tensor(shape=[1], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
+            #        [1])

            x = paddle.to_tensor(1)
            paddle.to_tensor(x, dtype='int32', place=paddle.CPUPlace()) # A new tensor will be constructed due to different dtype or place
-            # Tensor: generated_tensor_01
-            #   - place: CPUPlace
-            #   - shape: [1]
-            #   - layout: NCHW
-            #   - dtype: int
-            #   - data: [1]
+            # Tensor(shape=[1], dtype=int32, place=CPUPlace, stop_gradient=True,
+            #        [1])

            paddle.to_tensor((1.1, 2.2), place=paddle.CUDAPinnedPlace())
-            # Tensor: generated_tensor_1
-            #   - place: CUDAPinnedPlace
-            #   - shape: [2]
-            #   - layout: NCHW
-            #   - dtype: double
-            #   - data: [1.1 2.2]
+            # Tensor(shape=[2], dtype=float32, place=CUDAPinnedPlace, stop_gradient=True,
+            #        [1.10000002, 2.20000005])

            paddle.to_tensor([[0.1, 0.2], [0.3, 0.4]], place=paddle.CUDAPlace(0), stop_gradient=False)
-            # Tensor: generated_tensor_2
-            #   - place: CUDAPlace(0)
-            #   - shape: [2, 2]
-            #   - layout: NCHW
-            #   - dtype: double
-            #   - data: [0.1 0.2 0.3 0.4]
+            # Tensor(shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=False,
+            #        [[0.10000000, 0.20000000],
+            #         [0.30000001, 0.40000001]])

            type(paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64'))
            # <class 'paddle.ComplexTensor'>

            paddle.to_tensor([[1+1j, 2], [3+2j, 4]], dtype='complex64')
-            # ComplexTensor[real]: generated_tensor_0.real
-            #   - place: CUDAPlace(0)
-            #   - shape: [2, 2]
-            #   - layout: NCHW
-            #   - dtype: float
-            #   - data: [1 2 3 4]
-            # ComplexTensor[imag]: generated_tensor_0.imag
-            #   - place: CUDAPlace(0)
-            #   - shape: [2, 2]
-            #   - layout: NCHW
-            #   - dtype: float
-            #   - data: [1 0 2 0]
+            # ComplexTensor[real](shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
+            #                     [[1., 2.],
+            #                      [3., 4.]])
+            # ComplexTensor[imag](shape=[2, 2], dtype=float32, place=CUDAPlace(0), stop_gradient=True,
+            #                     [[1., 0.],
+            #                      [2., 0.]])
    """
 
    if place is None:
diff --git a/tools/wlist.json b/tools/wlist.json
index 9844fa486cc04..648cbf6c3b77b 100644
--- a/tools/wlist.json
+++ b/tools/wlist.json
@@ -24,6 +24,7 @@
         }
     ],
     "wlist_temp_api":[
+        "to_tensor",
         "LRScheduler",
         "ReduceOnPlateau",
         "append_LARS",
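Taken together, the device-transfer APIs added by this patch compose as below (a sketch, guarded so it also runs on CPU-only builds):

    import paddle

    x = paddle.to_tensor(1.0, place=paddle.CPUPlace())
    print(x.place)         # CPUPlace, now printable via the new __repr__

    y = x.cpu()            # returns the same Tensor when already on CPU
    assert str(y.place) == 'CPUPlace'

    if paddle.is_compiled_with_cuda():
        g = x.cuda()       # device_id defaults to 0 for a CPU tensor
        p = x.pin_memory()
        print(g.place, p.place)   # CUDAPlace(0) CUDAPinnedPlace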
From 7db747d9e88a989fb48be970b687b5479c22f52f Mon Sep 17 00:00:00 2001
From: Adam Osewski
Date: Mon, 26 Oct 2020 13:12:48 +0100
Subject: [PATCH 052/185] oneDNN BatchNorm + Act fusion pass. (#27912)

---
 paddle/fluid/framework/ir/CMakeLists.txt      |   2 +
 .../framework/ir/graph_pattern_detector.cc    |  20 +
 .../framework/ir/graph_pattern_detector.h     |  21 +
 .../ir/mkldnn/batch_norm_act_fuse_pass.cc     | 108 +++++
 .../ir/mkldnn/batch_norm_act_fuse_pass.h      |  44 ++
 .../mkldnn/batch_norm_act_fuse_pass_tester.cc | 382 ++++++++++++++++++
 .../inference/api/paddle_pass_builder.cc      |   1 +
 .../test_mkldnn_batch_norm_act_fuse_pass.py   |  79 ++++
 8 files changed, 657 insertions(+)
 create mode 100644 paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc
 create mode 100644 paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h
 create mode 100644 paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc
 create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 5bb833f613529..9415fe6e61e08 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -110,6 +110,7 @@ if(WITH_MKLDNN)
     pass_library(cpu_quantize_squash_pass inference DIR mkldnn)
     pass_library(reshape_transpose_matmul_mkldnn_fuse_pass inference DIR mkldnn)
     pass_library(matmul_transpose_reshape_fuse_pass inference DIR mkldnn)
+    pass_library(batch_norm_act_fuse_pass inference DIR mkldnn)
 endif()
 
 cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector )
@@ -151,6 +152,7 @@ if (WITH_MKLDNN)
   cc_test(test_conv_activation_mkldnn_fuse_pass SRCS mkldnn/conv_activation_mkldnn_fuse_pass_tester.cc DEPS conv_activation_mkldnn_fuse_pass)
   cc_test(test_conv_concat_relu_mkldnn_fuse_pass SRCS mkldnn/conv_concat_relu_mkldnn_fuse_pass_tester.cc DEPS conv_concat_relu_mkldnn_fuse_pass)
   cc_test(test_conv_elementwise_add_mkldnn_fuse_pass SRCS mkldnn/conv_elementwise_add_mkldnn_fuse_pass_tester.cc DEPS conv_elementwise_add_mkldnn_fuse_pass)
+  cc_test(test_batch_norm_act_fuse_pass SRCS mkldnn/batch_norm_act_fuse_pass_tester.cc DEPS batch_norm_act_fuse_pass)
   set(TEST_CONV_BN_PASS_DEPS conv_bn_fuse_pass graph_to_program_pass conv_op conv_transpose_op math_function im2col vol2col batch_norm_op gelu_op activation_op elementwise_add_op concat_and_split naive_executor device_context)
   if (WITH_GPU)
     set(TEST_CONV_BN_PASS_DEPS ${TEST_CONV_BN_PASS_DEPS} depthwise_conv)
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index ed2863e8bf798..3127a3fd8a7fe 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1188,6 +1188,26 @@ PDNode *patterns::BatchNormActGrad::operator()(
   return bn_grad;
 }
 
+PDNode *patterns::BatchNormActOneDNN::operator()(const std::string &act_type) {
+  auto *bn_x = pattern->NewNode(bn_in_repr())
+                   ->AsInput()
+                   ->assert_is_op_input("batch_norm", "X");
+  auto *bn = pattern->NewNode(batch_norm_repr())->assert_is_op("batch_norm");
+  auto *bn_out = pattern->NewNode(bn_out_repr())
+                     ->assert_is_op_output("batch_norm", "Y")
+                     ->assert_is_op_input(act_type);
+  auto *act =
+      pattern->NewNode(act_repr())->assert_is_op(act_type)->AsIntermediate();
+  auto *act_out = pattern->NewNode(act_out_repr())
+                      ->assert_is_op_output(act_type, "Out")
+                      ->AsOutput();
+
+  bn->LinksFrom({bn_x}).LinksTo({bn_out});
+  act->LinksFrom({bn_out}).LinksTo({act_out});
+
+  return act_out;
+}
+
 PDNode *patterns::ElewiseAddAct::operator()(
     paddle::framework::ir::PDNode *ele_x_var,
     std::unordered_set<std::string> act_types) {
diff --git
a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index 15f6ea1541d58..c44c7b4059eb0 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -664,6 +664,27 @@ struct BatchNormActGrad : public PatternBase { PATTERN_DECL_NODE(d_bn_bias); }; +// +// \brief Pattern looking for batch_norm and a directly following activation +// operator. +// +// \note Currently only ReLU is supported as an activation function. +// Formula: act(bn(x)) +// Op: batch_norm + act +struct BatchNormActOneDNN : public PatternBase { + BatchNormActOneDNN(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "bn_act_onednn") {} + + PDNode* operator()(const std::string& act_type); + + // declare operator node's name + PATTERN_DECL_NODE(bn_in); + PATTERN_DECL_NODE(batch_norm); + PATTERN_DECL_NODE(act); + PATTERN_DECL_NODE(bn_out); + PATTERN_DECL_NODE(act_out); +}; + // The following patterns are used to fuse elewise_add and act // formula: act(ele_add(x, y)) // op: elementwise_add + act diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc new file mode 100644 index 0000000000000..7e28ccd24a80d --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.cc @@ -0,0 +1,108 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/pretty_log.h" + +namespace paddle { +namespace framework { +namespace ir { + +using string::PrettyLogDetail; + +void FuseBatchNormActOneDNNPass::ApplyImpl(Graph *graph) const { + std::string act_type("relu"); + FuseBatchNormAct(graph, act_type); +} + +void FuseBatchNormActOneDNNPass::FuseBatchNormAct( + Graph *graph, const std::string &act_type) const { + PADDLE_ENFORCE_NOT_NULL( + graph, platform::errors::InvalidArgument( + "The input graph of " + "FuseBatchNormActOneDNNPass should not be nullptr.")); + FusePassBase::Init("bn_act", graph); + + GraphPatternDetector gpd; + patterns::BatchNormActOneDNN bn_act_pattern(gpd.mutable_pattern(), "bn_act"); + bn_act_pattern(act_type); + + int found_bn_act_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "Fuse BatchNorm with ReLU activation op."; + // BN output + GET_IR_NODE_FROM_SUBGRAPH(bn_out, bn_out, bn_act_pattern); + // ACT output + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, bn_act_pattern); + // ops + GET_IR_NODE_FROM_SUBGRAPH(batch_norm, batch_norm, bn_act_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act, act, bn_act_pattern); + + auto *bn_op = batch_norm->Op(); + + if (bn_op->HasAttr("use_mkldnn")) { + PADDLE_ENFORCE( + BOOST_GET_CONST(bool, bn_op->GetAttr("use_mkldnn")), + platform::errors::PreconditionNotMet( + "The BatchNorm+Act fusion may happen only when oneDNN library " + "is used.")); + } + + if (bn_op->HasAttr("trainable_statistics")) { + PADDLE_ENFORCE( + !BOOST_GET_CONST(bool, bn_op->GetAttr("trainable_statistics")), + platform::errors::PreconditionNotMet( + "The BatchNorm+Act fusion may happen only when mean and variance " + "are not calculated by current batch statistics.")); + } + + if (bn_op->HasAttr("is_test")) { + PADDLE_ENFORCE( + BOOST_GET_CONST(bool, bn_op->GetAttr("is_test")), + platform::errors::PreconditionNotMet( + "The BatchNorm+Act fusion may happen only during inference.")); + } + + bn_op->SetAttr("use_mkldnn", true); + bn_op->SetAttr("is_test", true); + bn_op->SetAttr("fuse_with_relu", true); + bn_op->SetAttr("trainable_statistics", false); + bn_op->SetOutput("Y", {act_out->Name()}); + + IR_OP_VAR_LINK(batch_norm, act_out); + GraphSafeRemoveNodes(g, {act, bn_out}); + found_bn_act_count++; + }; + + gpd(graph, handler); + AddStatis(found_bn_act_count); + PrettyLogDetail("--- fused %d batch norm with relu activation", + found_bn_act_count); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(batch_norm_act_fuse_pass, + paddle::framework::ir::FuseBatchNormActOneDNNPass); +REGISTER_PASS_CAPABILITY(batch_norm_act_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("batch_norm", 0) + .EQ("relu", 0)); diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h new file mode 100644 index 0000000000000..843e7e420b7be --- /dev/null +++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h @@ -0,0 +1,44 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <string>
+
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+/*
+ * \brief Fuse the BatchNorm and activation operators into single OneDNN's
+ *        BatchNorm with post-op.
+ *
+ * \note Currently only ReLU is supported as an activation function.
+ */
+class FuseBatchNormActOneDNNPass : public FusePassBase {
+ public:
+  virtual ~FuseBatchNormActOneDNNPass() {}
+
+ protected:
+  void ApplyImpl(ir::Graph *graph) const override;
+
+  void FuseBatchNormAct(ir::Graph *graph, const std::string &act_types) const;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc
new file mode 100644
index 0000000000000..5543d19b91c8e
--- /dev/null
+++ b/paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass_tester.cc
@@ -0,0 +1,382 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <list>
+#include <map>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/framework/ir/mkldnn/batch_norm_act_fuse_pass.h"
+#include "paddle/fluid/framework/op_desc.h"
+#include "paddle/fluid/framework/program_desc.h"
+#include "paddle/fluid/platform/errors.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// -------------------------- helper functions --------------------------------
+namespace {
+
+using InOutVarNamePair = std::pair<std::string, std::string>;
+using OpTypeCountPair = std::pair<std::string, int>;
+
+///
+/// @brief      Creates the specified operator and sets up its inputs/outputs.
+///
+/// @param      prog          The program descriptor to which we add new op.
+/// @param[in]  op_type_name  The operator type name.
+/// @param[in]  inputs        The vector of input pairs: {input_name, variable
+///                           name}
+/// @param[in]  outputs       The vector of output pairs {output_name, variable}
+/// @param[in]  use_mkldnn    The flag deciding whether or not to set
+///                           'use_mkldnn' attribute.
+///
+/// @return     Returns pointer to the created operator descriptor.
+/// +OpDesc* CreateOp(ProgramDesc* prog, const std::string& op_type_name, + const std::vector& inputs, + const std::vector& outputs, + bool use_mkldnn = true) { + auto op = prog->MutableBlock(0)->AppendOp(); + op->SetType(op_type_name); + op->SetAttr("use_mkldnn", use_mkldnn); + + for (const auto& input : inputs) { + op->SetInput(input.first, {input.second}); + } + for (const auto& output : outputs) { + op->SetOutput(output.first, {output.second}); + } + + return op; +} + +/// +/// @brief Check whether node 'to' is reachable from node 'from' in graph. +/// +/// @param[in] graph The graph we're checking for reachability. +/// @param[in] from The 'from' node name. +/// @param[in] to The 'to' node name. +/// +/// @return True if there is connection between nodes 'from' and 'to'. +/// +bool TestIsReachable(const Graph& graph, std::string from, std::string to) { + auto hash = [](const Node* node) -> std::string { + return node->Name() + std::to_string(node->id()); + }; + + auto find_node = [&](const Graph& graph, const std::string& name) -> Node* { + for (auto& node : GraphTraits::DFS(graph)) { + if (name == hash(&node)) { + return &node; + } + } + + return nullptr; + }; + + if (from == to) return true; + + std::map visited; + // update the from and to strings to hashed equivs in loop from graph traits + for (auto& node : GraphTraits::DFS(graph)) { + auto hashed = hash(&node); + if (node.Name() == from) { + from = hashed; + } + if (node.Name() == to) { + to = hashed; + } + visited[hashed] = false; + } + + visited[from] = true; + + std::list queue; + queue.push_back(from); + + while (!queue.empty()) { + auto cur = find_node(graph, queue.front()); + queue.pop_front(); + if (cur == nullptr) { + return false; + } + + for (auto n : cur->outputs) { + auto hashed_name = hash(n); + if (hashed_name == to) { + return true; + } + + if (!visited[hashed_name]) { + visited[hashed_name] = true; + queue.push_back(hashed_name); + } + } + } + return false; +} + +/// +/// @brief Search through graph and counts provided operator occurences. +/// +/// @param[in] graph The graph we search through. +/// @param[in] op_type_count The vector of pairs {op_type_name, op count} +/// +/// @note After going through all graph nodes this function asserts +/// whether counted number for each requested op is as expected. +/// +void AssertOpsCount(const Graph& graph, + std::vector op_type_count) { + for (auto* node : graph.Nodes()) { + if (!node->IsOp()) { + continue; + } + + const std::string op_type_name = node->Op()->Type(); + auto op_it = + std::find_if(std::begin(op_type_count), std::end(op_type_count), + [op_type_name](const OpTypeCountPair& p) { + return op_type_name == p.first; + }); + if (op_it != std::end(op_type_count)) { + op_it->second--; + } + } + + for (const OpTypeCountPair& p : op_type_count) { + EXPECT_EQ(p.second, 0); + } +} + +/// +/// @brief Builds a program descriptor. +/// +/// @param[in] transient_vars The vector of transient variables names. +/// @param[in] persistent_vars The vector of persistent variables names. Those +/// will have persistable attribute set to true. +/// +/// @return The program descriptor object. 
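+///
+/// Example (an illustrative sketch, condensed from the test cases below):
+///   auto prog = BuildProgramDesc({"x", "bn_y", "act_y"}, {"scale", "bias"});
+/// declares `x`, `bn_y` and `act_y` as LOD tensor variables and additionally
+/// marks `scale` and `bias` as persistable.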
+/// +ProgramDesc BuildProgramDesc(const std::vector& transient_vars, + const std::vector& persistent_vars) { + ProgramDesc prog; + + auto add_var_to_prog = [&prog](const std::string& var_name) -> VarDesc* { + auto var = prog.MutableBlock(0)->Var(var_name); + var->SetType(proto::VarType::LOD_TENSOR); + return var; + }; + + for (const auto& v : transient_vars) { + add_var_to_prog(v); + } + + for (const auto& v : persistent_vars) { + auto* var = add_var_to_prog(v); + var->SetPersistable(true); + } + + return prog; +} + +/// +/// @brief Execute pass on provided graph and perform checks. +/// +/// @param graph The graph we run pass on. +/// @param[in] from The name of a 'starting' node sequence in a +/// graph. This would be used to test for +/// correct node connections. +/// @param[in] to The name of a 'ending' node sequence in a +/// graph. This would be used to test for +/// correct node connections. +/// @param[in] removed_nodes_count The number of nodes we expect will be +/// removed/fused after pass execution. +/// @param[in] added_nodes_count The number of nodes we expect will be +/// added after pass execution. +/// +void RunPassAndAssert(Graph* graph, const std::string& from, + const std::string& to, int removed_nodes_count, + int added_nodes_count = 0) { + EXPECT_TRUE(TestIsReachable(*graph, from, to)); + int original_nodes_num = graph->Nodes().size(); + auto pass = PassRegistry::Instance().Get("batch_norm_act_fuse_pass"); + pass->Apply(graph); + int current_nodes_num = graph->Nodes().size(); + + EXPECT_TRUE(TestIsReachable(*graph, from, to)); + EXPECT_EQ(original_nodes_num - removed_nodes_count + added_nodes_count, + current_nodes_num); +} + +void SetBatchNormAttrs(OpDesc* bn_op, bool is_test = true, + bool trainable_stats = true) { + bn_op->SetAttr("is_test", is_test); + bn_op->SetAttr("trainable_statistics", trainable_stats); + bn_op->SetAttr("fuse_with_relu", false); +} + +} // namespace + +// ------------------------------ Test cases ----------------------------------- + +// The below test cases are distinguished by whether following attributes have +// true or false value: +// - is_test +// - trainable_statistics +// The test case name would have only attributes with true value in its name. 
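+//
+// Sketch of the rewrite that the positive case exercises (assuming the
+// pattern detector matches a single batch_norm -> relu chain):
+//
+//   x -> batch_norm -> bn_y -> relu -> act_y
+//
+// becomes, after the pass removes `relu` and the intermediate `bn_y`:
+//
+//   x -> batch_norm{use_mkldnn=true, fuse_with_relu=true} -> act_y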
+ +TEST(FuseBatchNormActOneDNNPass, ThrowIsTestTrainableStats) { + auto prog = BuildProgramDesc( + {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, + {"scale", "bias"}); + auto* bn_op = CreateOp(&prog, "batch_norm", {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, + {{"Y", "bn_y"}, + {"MeanOut", "m_out"}, + {"VarianceOut", "var_out"}, + {"SavedMean", "sm"}, + {"SavedVariance", "sv"}}); + SetBatchNormAttrs(bn_op, true, true); + CreateOp(&prog, "relu", {{"X", "bn_y"}}, {{"Out", "act_y"}}, false); + + Graph graph(prog); + // No fusion in this attribute configuration + constexpr int removed_nodes_count = 0; + + EXPECT_THROW(RunPassAndAssert(&graph, "x", "act_y", removed_nodes_count), + paddle::platform::EnforceNotMet); +} + +TEST(FuseBatchNormActOneDNNPass, FuseIsTest) { + auto prog = + BuildProgramDesc({"x", "m", "v", "bn_y", "act_y"}, {"scale", "bias"}); + auto* bn_op = CreateOp(&prog, "batch_norm", {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, + {{"Y", "bn_y"}}); + SetBatchNormAttrs(bn_op, true, false); + CreateOp(&prog, "relu", {{"X", "bn_y"}}, {{"Out", "act_y"}}, false); + + Graph graph(prog); + constexpr int removed_nodes_count = 2; + + RunPassAndAssert(&graph, "x", "act_y", removed_nodes_count); + AssertOpsCount(graph, {{"batch_norm", 1}, {"relu", 0}}); + + for (const auto* node : graph.Nodes()) { + if (node->IsOp() && node->Op()->Type() == "batch_norm") { + const auto* op = node->Op(); + ASSERT_TRUE(op->HasAttr("use_mkldnn")); + EXPECT_TRUE(BOOST_GET_CONST(bool, op->GetAttr("use_mkldnn"))); + ASSERT_TRUE(op->HasAttr("fuse_with_relu")); + EXPECT_TRUE(BOOST_GET_CONST(bool, op->GetAttr("fuse_with_relu"))); + ASSERT_TRUE(op->HasAttr("trainable_statistics")); + EXPECT_FALSE(BOOST_GET_CONST(bool, op->GetAttr("trainable_statistics"))); + } + } +} + +TEST(FuseBatchNormActOneDNNPass, ThrowTrainableStats) { + auto prog = BuildProgramDesc( + {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, + {"scale", "bias"}); + auto* bn_op = CreateOp(&prog, "batch_norm", {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, + {{"Y", "bn_y"}, + {"MeanOut", "m_out"}, + {"VarianceOut", "var_out"}, + {"SavedMean", "sm"}, + {"SavedVariance", "sv"}}); + SetBatchNormAttrs(bn_op, false, true); + CreateOp(&prog, "relu", {{"X", "bn_y"}}, {{"Out", "act_y"}}, false); + + Graph graph(prog); + // No fusion in this attribute configuration + constexpr int removed_nodes_count = 0; + + EXPECT_THROW(RunPassAndAssert(&graph, "x", "act_y", removed_nodes_count), + paddle::platform::EnforceNotMet); +} + +TEST(FuseBatchNormActOneDNNPass, AllAttrsFalse) { + auto prog = BuildProgramDesc( + {"x", "m", "v", "bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, + {"scale", "bias"}); + auto* bn_op = CreateOp(&prog, "batch_norm", {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, + {{"Y", "bn_y"}, + {"MeanOut", "m_out"}, + {"VarianceOut", "var_out"}, + {"SavedMean", "sm"}, + {"SavedVariance", "sv"}}); + SetBatchNormAttrs(bn_op, false, false); + CreateOp(&prog, "relu", {{"X", "bn_y"}}, {{"Out", "act_y"}}, false); + + Graph graph(prog); + // No fusion in this attribute configuration + constexpr int removed_nodes_count = 0; + + EXPECT_THROW(RunPassAndAssert(&graph, "x", "act_y", removed_nodes_count), + paddle::platform::EnforceNotMet); +} + +TEST(FuseBatchNormActOneDNNPass, ThrowUseMkldnn) { + auto prog = BuildProgramDesc( + {"x", "m", "v", 
"bn_y", "act_y", "m_out", "var_out", "sm", "sv"}, + {"scale", "bias"}); + auto* bn_op = CreateOp(&prog, "batch_norm", {{"X", "x"}, + {"Scale", "scale"}, + {"Bias", "bias"}, + {"Mean", "m"}, + {"Variance", "v"}}, + {{"Y", "bn_y"}, + {"MeanOut", "m_out"}, + {"VarianceOut", "var_out"}, + {"SavedMean", "sm"}, + {"SavedVariance", "sv"}}, + false); + SetBatchNormAttrs(bn_op, false, false); + CreateOp(&prog, "relu", {{"X", "bn_y"}}, {{"Out", "act_y"}}, false); + + Graph graph(prog); + // No fusion in this attribute configuration + constexpr int removed_nodes_count = 0; + + EXPECT_THROW(RunPassAndAssert(&graph, "x", "act_y", removed_nodes_count), + paddle::platform::EnforceNotMet); +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +USE_PASS(batch_norm_act_fuse_pass); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 19f52422b441f..1448d56566165 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -207,6 +207,7 @@ void CpuPassStrategy::EnableMKLDNN() { "matmul_transpose_reshape_fuse_pass", // // Disabled due to topology-dependent speed-up // "fc_mkldnn_pass", + "batch_norm_act_fuse_pass", "mkldnn_inplace_pass", // This pass should be activated after // fuses })) { diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py new file mode 100644 index 0000000000000..c119cbec884e2 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_batch_norm_act_fuse_pass.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+"""Test for fusion of batch norm and activation.""" +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid as fluid +from inference_pass_test import InferencePassTest +from paddle import enable_static +from paddle.fluid.core import PassVersionChecker + +enable_static() + + +class BnReluOneDnnFusePassTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + bn_out = fluid.layers.batch_norm( + input=data, is_test=True, use_global_stats=self.global_stats) + relu_out = fluid.layers.relu(bn_out) + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [relu_out] + self.enable_mkldnn = True + + def set_params(self): + self.global_stats = False + self.pass_name = "batch_norm_act_fuse_pass" + + def test_check_output(self): + self.check_output() + self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) + + +class BnReluGlobalStatsOneDnnFusePassTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 3, 100, 100], dtype="float32") + bn_out = fluid.layers.batch_norm( + input=data, is_test=True, use_global_stats=self.global_stats) + relu_out = fluid.layers.relu(bn_out) + + self.feeds = { + "data": np.random.random((1, 3, 100, 100)).astype("float32") + } + self.fetch_list = [relu_out] + self.enable_mkldnn = True + + def set_params(self): + self.global_stats = True + self.pass_name = "batch_norm_act_fuse_pass" + + def test_check_output(self): + self.check_output() + self.assertTrue(PassVersionChecker.IsCompatible(self.pass_name)) + + +if __name__ == "__main__": + unittest.main() From 4671d85a03a6ff501f32775ef88b8e0468ffb2a6 Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Mon, 26 Oct 2020 20:15:23 +0800 Subject: [PATCH 053/185] fix DataLoader return same format between static & dynamic in single mode (#28176) * fix DataLoader return same format between static & dynamic in single mode. 
test=develop --- .../fluid/dataloader/dataloader_iter.py | 8 +++- .../test_multiprocess_dataloader_static.py | 45 +++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/dataloader/dataloader_iter.py b/python/paddle/fluid/dataloader/dataloader_iter.py index 7d203b349a130..d32a543eb495f 100644 --- a/python/paddle/fluid/dataloader/dataloader_iter.py +++ b/python/paddle/fluid/dataloader/dataloader_iter.py @@ -341,7 +341,13 @@ def __next__(self): return self._reader.read_next_var_list() else: if self._return_list: - return self._reader.read_next_list() + # static graph organized data on multi-device with list, if + # place number is 1, there is only 1 device, extra the data + # from list for devices to be compatible with dygraph mode + if len(self._places) == 1: + return self._reader.read_next_list()[0] + else: + return self._reader.read_next_list() else: return self._reader.read_next() except StopIteration: diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py index 38497f91fc188..c01e2e75b8195 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_static.py @@ -170,5 +170,50 @@ def test_main(self): self.assertLess(diff, 1e-2) +class TestStaticDataLoaderReturnList(unittest.TestCase): + def test_single_place(self): + scope = fluid.Scope() + image = fluid.data( + name='image', shape=[None, IMAGE_SIZE], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + with fluid.scope_guard(scope): + dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) + dataloader = DataLoader( + dataset, + feed_list=[image, label], + num_workers=0, + batch_size=BATCH_SIZE, + drop_last=True, + return_list=True) + + for d in dataloader: + assert isinstance(d, list) + assert len(d) == 2 + assert not isinstance(d[0], list) + assert not isinstance(d[1], list) + + def test_multi_place(self): + scope = fluid.Scope() + image = fluid.data( + name='image', shape=[None, IMAGE_SIZE], dtype='float32') + label = fluid.data(name='label', shape=[None, 1], dtype='int64') + with fluid.scope_guard(scope): + dataset = RandomDataset(SAMPLE_NUM, CLASS_NUM) + dataloader = DataLoader( + dataset, + feed_list=[image, label], + num_workers=0, + batch_size=BATCH_SIZE, + places=[fluid.CPUPlace()] * 2, + drop_last=True, + return_list=True) + + for d in dataloader: + assert isinstance(d, list) + assert len(d) == 2 + assert isinstance(d[0], list) + assert isinstance(d[1], list) + + if __name__ == '__main__': unittest.main() From 7a3a05cccbd03077a5b18ff0afa933e4033f73bd Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 26 Oct 2020 20:38:03 +0800 Subject: [PATCH 054/185] [Dy2Stat]Support to save model with nested output (#28224) --- python/paddle/fluid/dygraph/jit.py | 3 +- .../tests/unittests/test_jit_save_load.py | 40 +++++++++++++++++++ 2 files changed, 42 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 9eea6d659f7b1..4e026dab662c0 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -25,6 +25,7 @@ from paddle.fluid import core from paddle.fluid.compiler import BuildStrategy, CompiledProgram, ExecutionStrategy from paddle.fluid.data_feeder import check_type +from paddle.fluid.layers.utils import flatten from paddle.fluid.dygraph.base import 
program_desc_tracing_guard, switch_to_static_graph from paddle.fluid.dygraph.dygraph_to_static import logging_utils from paddle.fluid.dygraph.dygraph_to_static.logging_utils import set_code_level, set_verbosity @@ -397,7 +398,7 @@ def _get_output_vars(outputs, output_spec): "Layer.forward method." result_list = [] output_vars_dict = OrderedDict() - for var in outputs: + for var in flatten(outputs): if isinstance(var, Variable): output_vars_dict[var.name] = var if output_spec is None: diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index ac9a3f06f8f3e..b954f5c829aa6 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -21,6 +21,7 @@ import paddle from paddle.static import InputSpec import paddle.fluid as fluid +from paddle.fluid.layers.utils import flatten from paddle.fluid.dygraph import Linear from paddle.fluid.dygraph import declarative, ProgramTranslator from paddle.fluid.dygraph.io import INFER_MODEL_SUFFIX, INFER_PARAMS_SUFFIX, INFER_PARAMS_INFO_SUFFIX @@ -153,6 +154,21 @@ def forward(self, x): return y, loss +class LinearNetWithNestOut(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetWithNestOut, self).__init__() + self._linear_1 = Linear(in_size, out_size) + self._linear_2 = Linear(in_size, out_size) + + @declarative + def forward(self, x): + y = self._linear_1(x) + z = self._linear_2(y) + out = y + z + loss = fluid.layers.mean(out) + return y, [(z, loss), out] + + class EmptyLayer(paddle.nn.Layer): def __init__(self): super(EmptyLayer, self).__init__() @@ -299,6 +315,30 @@ def test_jit_load_no_path(self): loaded_layer = paddle.jit.load(path) +class TestSaveLoadWithNestOut(unittest.TestCase): + def setUp(self): + # enable dygraph mode + fluid.enable_dygraph() + + def test_nest_output(self): + x = fluid.dygraph.to_variable( + np.random.random((4, 8)).astype('float32')) + + net = LinearNetWithNestOut(8, 8) + dy_outs = flatten(net(x)) + net = declarative(net, input_spec=[InputSpec([None, 8], name='x')]) + + model_path = "net_with_nest_out/model" + paddle.jit.save(net, model_path) + + load_net = paddle.jit.load(model_path) + load_outs = flatten(load_net(x)) + + self.assertTrue(len(dy_outs) == 4) + for dy_out, load_out in zip(dy_outs, load_outs): + self.assertTrue(np.allclose(dy_out.numpy(), load_out.numpy())) + + class TestSaveLoadWithInputSpec(unittest.TestCase): def setUp(self): # enable dygraph mode From a5c18204e9dbfede345194712f48bbfad9db785e Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 26 Oct 2020 20:56:03 +0800 Subject: [PATCH 055/185] [Dy2stat]Join break cond with while cond in some pattern (#28171) * Join break cond with while cond * remove usless code * refine the if code * Split into BreakTransfromOptimizer * add BreakTransformOptimizer in ast_transformer * add more comment --- .../dygraph_to_static/ast_transformer.py | 2 + .../break_continue_transformer.py | 172 +++++++++++++----- .../fluid/dygraph/dygraph_to_static/utils.py | 22 +++ .../dygraph_to_static/test_break_continue.py | 35 ++++ 4 files changed, 188 insertions(+), 43 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py index 2c59a66f22be2..fa168a62de11a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/ast_transformer.py @@ -22,6 
+22,7 @@
 from paddle.fluid.dygraph.dygraph_to_static.assert_transformer import AssertTransformer
 from paddle.fluid.dygraph.dygraph_to_static.basic_api_transformer import BasicApiTransformer
 from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakContinueTransformer
+from paddle.fluid.dygraph.dygraph_to_static.break_continue_transformer import BreakTransformOptimizer
 from paddle.fluid.dygraph.dygraph_to_static.call_transformer import CallTransformer
 from paddle.fluid.dygraph.dygraph_to_static.cast_transformer import CastTransformer
 from paddle.fluid.dygraph.dygraph_to_static.ifelse_transformer import IfElseTransformer
@@ -75,6 +76,7 @@ def transfer_from_node_type(self, node_wrapper):
             BasicApiTransformer,  # Basic Api
             TensorShapeTransformer,  # Tensor.shape -> layers.shape(Tensor)
             ListTransformer,  # List used in control flow
+            BreakTransformOptimizer,  # optimize transformation of break in loops
             BreakContinueTransformer,  # break/continue in loops
             ReturnTransformer,  # return in functions
             LogicalTransformer,  # logical and/or/not
diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py
index c78f6e8f40319..cb0383b9f7362 100644
--- a/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py
+++ b/python/paddle/fluid/dygraph/dygraph_to_static/break_continue_transformer.py
@@ -19,6 +19,7 @@
 from paddle.fluid import unique_name
 from paddle.fluid.dygraph.dygraph_to_static.utils import index_in_list
 from paddle.fluid.dygraph.dygraph_to_static.utils import ForNodeVisitor
+from paddle.fluid.dygraph.dygraph_to_static.utils import BaseNodeVisitor
 from paddle.fluid.dygraph.dygraph_to_static.variable_trans_func import create_fill_constant_node
 
 __all__ = ['BreakContinueTransformer']
@@ -83,7 +84,7 @@ def get_for_stmt_nodes(self, node):
         return init_stmts
 
 
-class BreakContinueTransformer(gast.NodeTransformer):
+class BreakContinueTransformer(BaseNodeVisitor):
     """
     Rewrite 'break' and 'continue' key words in a if-else python way to make
     it equivalent to original control flow
@@ -103,41 +104,23 @@ class BreakContinueTransformer(gast.NodeTransformer):
     set continue to False at the beginning of each loop
 
     TODO: more details should be summarized as design document
+
+    Note: The class is inherited from BaseNodeVisitor instead of NodeTransformer,
+    because ancestor nodes will be modified in place for `Break/Continue` here.
+    In general, we recommend inheriting NodeTransformer to modify nodes!
     """
 
     def __init__(self, wrapper_root):
+        super(BreakContinueTransformer, self).__init__()
+
         self.wrapper_root = wrapper_root
         self.root = wrapper_root.node
-        self.ancestor_nodes = []
 
     def transform(self):
         self.visit(self.root)
 
-    def generic_visit(self, node):
-        # TODO: because we change ancestor nodes during visit_Break/Continue,
-        # not current node, so generic_visit of NodeTransformer will visit node
-        # which may be deleted. To prevent that node being added into
-        # transformed AST, I have to self-write a generic_visit, but this is
-        # NOT a good thing. Considering refactorying this whole class.
- for field, value in gast.iter_fields(node): - if isinstance(value, list): - for item in value: - if isinstance(item, gast.AST): - self.visit(item) - elif isinstance(value, gast.AST): - self.visit(value) - - def visit(self, node): - self.ancestor_nodes.append(node) - method = 'visit_' + node.__class__.__name__ - visitor = getattr(self, method, self.generic_visit) - ret = visitor(node) - self.ancestor_nodes.pop() - return ret - def visit_Break(self, node): - loop_node_index = self._find_ancestor_loop_index(node) + loop_node_index = _find_ancestor_loop_index(node, self.ancestor_nodes) assert loop_node_index != -1, "SyntaxError: 'break' outside loop" loop_node = self.ancestor_nodes[loop_node_index] @@ -150,7 +133,7 @@ def visit_Break(self, node): first_block_index = self._remove_stmts_after_break_continue( node, variable_name, loop_node_index) - # 3. Add 'if V' for stmts in ancestor blocks between the first one + # 3. Add 'if not V' for stmts in ancestor blocks between the first one # (exclusive) and the ancestor loop (inclusive) self._replace_if_stmt(loop_node_index, first_block_index, variable_name) @@ -165,6 +148,7 @@ def visit_Break(self, node): ctx=gast.Load(), annotation=None, type_comment=None)) + if isinstance(loop_node, gast.While): loop_node.test = gast.BoolOp( op=gast.And(), values=[loop_node.test, cond_var_node]) @@ -175,7 +159,7 @@ def visit_Break(self, node): for_to_while.transform() def visit_Continue(self, node): - loop_node_index = self._find_ancestor_loop_index(node) + loop_node_index = _find_ancestor_loop_index(node, self.ancestor_nodes) assert loop_node_index != -1, "SyntaxError: 'continue' outside loop" loop_node = self.ancestor_nodes[loop_node_index] @@ -188,7 +172,7 @@ def visit_Continue(self, node): first_block_index = self._remove_stmts_after_break_continue( node, variable_name, loop_node_index) - # 3. Add 'if V' for stmts in ancestor blocks between the first one + # 3. 
Add 'if not V' for stmts in ancestor blocks between the first one # (exclusive) and the ancestor loop (inclusive) self._replace_if_stmt(loop_node_index, first_block_index, variable_name) @@ -215,15 +199,6 @@ def _remove_stmts_after_break_continue( return first_block_index - def _replace_break_continue_in_stmt_list( - self, stmt_list, break_continue_node, break_continue_name): - i = index_in_list(stmt_list, break_continue_node) - if i == -1: - return False - assign_true_node = create_fill_constant_node(break_continue_name, True) - stmt_list[i:] = [assign_true_node] - return True - def _replace_if_stmt(self, loop_node_index, first_block_index, break_continue_name): for i in range(first_block_index - 1, loop_node_index - 1, -1): @@ -239,6 +214,15 @@ def _replace_if_stmt(self, loop_node_index, first_block_index, cur_node.orelse, son_node, break_continue_name): continue + def _replace_break_continue_in_stmt_list( + self, stmt_list, break_continue_node, break_continue_name): + i = index_in_list(stmt_list, break_continue_node) + if i == -1: + return False + assign_true_node = create_fill_constant_node(break_continue_name, True) + stmt_list[i:] = [assign_true_node] + return True + def _replace_after_node_to_if_in_stmt_list(self, stmt_list, node, break_continue_name): i = index_in_list(stmt_list, node) @@ -282,8 +266,110 @@ def _add_stmt_into_list_before_node(self, stmt_list, node, stmt_node): stmt_list.insert(i, stmt_node) return True - def _find_ancestor_loop_index(self, node): - for i in range(len(self.ancestor_nodes) - 1, -1, -1): - if isinstance(self.ancestor_nodes[i], (gast.For, gast.While)): - return i - return -1 + +def _find_ancestor_loop_index(node, ancestor_nodes): + for i in range(len(ancestor_nodes) - 1, -1, -1): + if isinstance(ancestor_nodes[i], (gast.For, gast.While)): + return i + return -1 + + +class BreakTransformOptimizer(BaseNodeVisitor): + """ + In specific pattern, the transformed code could be optimized by joining the + If.test with while.test. + + Currently supported pattern is: + ``` + while cond1: while cond1 and not cond2: + if cond2: ---> do_something() + break + do_something() + ``` + + See following example: + + >>> def foo(x): + ... i = paddle.to_tensor(1, dtype='int32') + ... while i < 10: + ... if x.mean() > 5: + ... break + ... x += i + ... i += 1 + ... return x + + The generated code after applying optimization will be: + ``` + def foo(x): + i = paddle.to_tensor(1, dtype='int32') + while i < 10 and not x.mean() > 5: + x += i + i += 1 + return x + ``` + It can avoid wrapping all ops after `break` statement into `cond_op` that + usually brings very heavy overhead. 
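+
+    Note: for a `for` loop the same join is performed after the loop is
+    rewritten into a `while` by ForToWhileTransformer (see the gast.For
+    branch of `visit_Break` below), so the pattern covers both loop kinds.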
+ """ + + def __init__(self, wrapper_root): + super(BreakTransformOptimizer, self).__init__() + + self.wrapper_root = wrapper_root + self.root = wrapper_root.node + + def transform(self): + self.visit(self.root) + + def visit_Break(self, node): + loop_node_index = _find_ancestor_loop_index(node, self.ancestor_nodes) + assert loop_node_index != -1, "SyntaxError: 'break' outside loop" + loop_node = self.ancestor_nodes[loop_node_index] + + if self._is_break_cond_pattern(node, loop_node): + cond_var_node = self._join_with_while_cond(node, loop_node) + + if isinstance(loop_node, gast.While): + loop_node.test = gast.BoolOp( + op=gast.And(), values=[loop_node.test, cond_var_node]) + elif isinstance(loop_node, gast.For): + parent_node = self.ancestor_nodes[loop_node_index - 1] + for_to_while = ForToWhileTransformer(parent_node, loop_node, + cond_var_node) + for_to_while.transform() + + def _is_break_cond_pattern(self, break_node, loop_node): + """ + Judge whether if match the pattern to join `If.test` with `while.test` + """ + # while/for -> if -> break + if len(self.ancestor_nodes) < 3 or self.ancestor_nodes[-3] != loop_node: + return False + + assert self.ancestor_nodes[-1] == break_node + parent_if_node = self.ancestor_nodes[-2] + + is_matched = False + if isinstance(parent_if_node, gast.If): + # gast.If only contains `break` + break_first_in_if = parent_if_node.body[0] == break_node and len( + parent_if_node.orelse) == 0 + # gast.If is first node of loop_node + if_first_in_loop = loop_node.body[0] == parent_if_node + + is_matched = if_first_in_loop and break_first_in_if + + return is_matched + + def _join_with_while_cond(self, break_node, loop_node): + """ + Join the `If.test` with `While.test` together. + """ + parent_if_node = self.ancestor_nodes[-2] + + cond_var_node = gast.UnaryOp(op=gast.Not(), operand=parent_if_node.test) + + # remove the gast.If node that contains the gast.Break. + assert loop_node.body[0] == parent_if_node + loop_node.body.pop(0) + + return cond_var_node diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py index 7a234580712ac..b44739ca8484b 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/utils.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/utils.py @@ -29,6 +29,28 @@ from paddle.fluid import unique_name + +class BaseNodeVisitor(gast.NodeVisitor): + """ + Implement customized NodeVisitor inherited from gast.NodeVisitor. + Ancestor nodes are traced to easily support more operations of currently + visited node. 
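+
+    For example (an illustrative sketch): inside `visit_Break` of a subclass,
+    `self.ancestor_nodes[-1]` is the Break node itself, `self.ancestor_nodes[-2]`
+    its direct parent, and `self.ancestor_nodes[-3]` its grandparent, which is
+    exactly how BreakTransformOptimizer matches the `loop -> if -> break`
+    pattern above.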
+ """ + + def __init__(self): + self.ancestor_nodes = [] + + def visit(self, node): + """Visit a node.""" + self.ancestor_nodes.append(node) + + method = 'visit_' + node.__class__.__name__ + visitor = getattr(self, method, self.generic_visit) + ret = visitor(node) + self.ancestor_nodes.pop() + return ret + + # imp is deprecated in python3 if six.PY2: import imp diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py index 6bcbc2b4a0bab..8423c056b2d83 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_break_continue.py @@ -16,6 +16,7 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph.jit import declarative @@ -157,6 +158,30 @@ def __init__(self): return foo.c +def test_optim_break_in_for(x): + x = paddle.to_tensor(x) + for i in range(10): + if x.sum() > 5: + break + x += 10086 + x += i + if i < 3: + x = x * 2 + return x + + +def test_optim_break_in_while(x): + x = paddle.to_tensor(x) + i = fluid.layers.fill_constant(shape=[1], dtype='int32', value=0) + while i < 10: + if i > 5: + break + x += 10086 + x += i + i += 1 + return x + + class TestContinueInFor(unittest.TestCase): def setUp(self): self.input = np.zeros((1)).astype('int32') @@ -226,5 +251,15 @@ def init_dygraph_func(self): self.dygraph_func = while_loop_class_var +class TestOptimBreakInFor(TestContinueInWhile): + def init_dygraph_func(self): + self.dygraph_func = test_optim_break_in_for + + +class TestOptimBreakInWhile(TestContinueInWhile): + def init_dygraph_func(self): + self.dygraph_func = test_optim_break_in_while + + if __name__ == '__main__': unittest.main() From 813b2ade342a991f3ed08ff31533a4e05b534a23 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Tue, 27 Oct 2020 11:07:44 +0800 Subject: [PATCH 056/185] Enrich the python error types of paddle & polish format (#28124) * add multiple exception type * define all exception & polish compile pystack * mapping paddle error to python exception * polish static mode error format * fix failed unittests * fix dytostatic test_error * fix check_nan_inf failed * add unittest for coverage * revert some code try to solve compile error * refactor enforce & error change * polish code & add unittest --- paddle/fluid/framework/op_call_stack.cc | 36 ++- paddle/fluid/platform/enforce.h | 289 +++++++++--------- paddle/fluid/platform/errors.cc | 2 +- paddle/fluid/platform/errors.h | 2 +- paddle/fluid/pybind/exception.cc | 47 ++- .../tests/unittests/check_nan_inf_base.py | 4 +- .../unittests/dygraph_to_static/test_error.py | 13 +- .../fluid/tests/unittests/test_assert_op.py | 12 +- .../unittests/test_c_comm_init_all_op.py | 4 +- .../fluid/tests/unittests/test_cholesky_op.py | 11 +- .../fluid/tests/unittests/test_exception.py | 46 ++- .../tests/unittests/test_fill_any_like_op.py | 3 +- .../tests/unittests/test_histogram_op.py | 7 +- .../tests/unittests/test_imperative_basic.py | 3 +- .../test_imperative_data_loader_process.py | 2 +- .../unittests/test_imperative_double_grad.py | 2 +- .../test_imperative_signal_handler.py | 8 +- .../fluid/tests/unittests/test_inverse_op.py | 11 +- .../tests/unittests/test_multinomial_op.py | 9 +- .../fluid/tests/unittests/test_multiply.py | 8 +- .../test_multiprocess_reader_exception.py | 5 +- .../unittests/test_nn_functional_hot_op.py | 4 +- .../fluid/tests/unittests/test_one_hot_op.py | 4 +- 
.../tests/unittests/test_one_hot_v2_op.py | 4 +- ..._parallel_executor_feed_persistable_var.py | 6 +- .../paddle/fluid/tests/unittests/test_pow.py | 4 +- .../tests/unittests/test_retain_graph.py | 3 +- .../fluid/tests/unittests/test_reverse_op.py | 4 +- .../test_runtime_and_compiletime_exception.py | 6 +- .../fluid/tests/unittests/test_tensor.py | 2 +- 30 files changed, 360 insertions(+), 201 deletions(-) diff --git a/paddle/fluid/framework/op_call_stack.cc b/paddle/fluid/framework/op_call_stack.cc index 80db35e0c3917..380ba74a1cb11 100644 --- a/paddle/fluid/framework/op_call_stack.cc +++ b/paddle/fluid/framework/op_call_stack.cc @@ -21,6 +21,18 @@ limitations under the License. */ namespace paddle { namespace framework { +std::string InsertIndentationIntoEachLine(const std::string &str) { + std::ostringstream sout; + size_t start_pos = 0; + size_t end_pos = 0; + while ((end_pos = str.find("\n", start_pos)) != std::string::npos) { + sout << " " << str.substr(start_pos, end_pos + 1); + start_pos = end_pos + 1; + } + sout << " " << str.substr(start_pos, end_pos); + return sout.str(); +} + void InsertCallStackInfo(const std::string &type, const AttributeMap &attrs, platform::EnforceNotMet *exception) { if (attrs.count("sub_block") != 0) { @@ -37,23 +49,37 @@ void InsertCallStackInfo(const std::string &type, const AttributeMap &attrs, std::ostringstream sout; // Step 1. Construct python call stack string if (callstack) { - sout << "\n\n Compile Traceback (most recent call last):"; + if (FLAGS_call_stack_level > 1) { + sout << "\n\n Compile Traceback (most recent call last):"; + } else { + sout << "In user code:\n"; + } for (auto &line : *callstack) { sout << "\n " << line; } } // Step 2. Construct final call stack & append error op name - sout << exception->err_str_; + if (FLAGS_call_stack_level > 1) { + sout << exception->what(); + } else { + // If callstack exists, use err_str_ instead sub_err_str_ + if (callstack) { + sout << "\n\n"; + sout << InsertIndentationIntoEachLine(exception->error_str()); + } else { + sout << exception->simple_error_str(); + } + } sout << " [operator < " << type << " > error]"; - exception->err_str_ = sout.str(); + exception->set_error_str(sout.str()); } void AppendErrorOpHint(const std::string &type, platform::EnforceNotMet *exception) { std::ostringstream sout; - sout << exception->err_str_; + sout << exception->what(); sout << " [operator < " << type << " > error]"; - exception->err_str_ = sout.str(); + exception->set_error_str(sout.str()); } } // namespace framework diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 165321d9c87ff..6a27249817027 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -39,7 +39,6 @@ limitations under the License. 
*/ #include #include -#include #include #include #include @@ -275,8 +274,10 @@ template inline std::string GetErrorSumaryString(StrType&& what, const char* file, int line) { std::ostringstream sout; - sout << "\n----------------------\nError Message " - "Summary:\n----------------------\n"; + if (FLAGS_call_stack_level > 1) { + sout << "\n----------------------\nError Message " + "Summary:\n----------------------\n"; + } sout << string::Sprintf("%s (at %s:%d)", std::forward(what), file, line) << std::endl; @@ -294,41 +295,89 @@ inline std::string GetTraceBackString(StrType&& what, const char* file, } } -inline bool is_error(bool stat) { return !stat; } - -inline void throw_on_error(bool stat, const std::string& msg) { - throw std::runtime_error(msg); +inline std::string SimplifyErrorTypeFormat(const std::string& str) { + std::ostringstream sout; + size_t type_end_pos = str.find(":", 0); + if (type_end_pos == std::string::npos) { + sout << str; + } else { + // Remove "Error:", add "()"" + sout << "(" << str.substr(0, type_end_pos - 5) << ")" + << str.substr(type_end_pos + 1); + } + return sout.str(); } +inline bool is_error(bool stat) { return !stat; } + // Note: This Macro can only be used within enforce.h -#define __THROW_ERROR_INTERNAL__(...) \ - do { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet( \ - ::paddle::string::Sprintf(__VA_ARGS__), __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ +#define __THROW_ERROR_INTERNAL__(__ERROR_SUMMARY) \ + do { \ + HANDLE_THE_ERROR \ + throw ::paddle::platform::EnforceNotMet(__ERROR_SUMMARY, __FILE__, \ + __LINE__); \ + END_HANDLE_THE_ERROR \ } while (0) /** ENFORCE EXCEPTION AND MACROS **/ struct EnforceNotMet : public std::exception { + public: EnforceNotMet(std::exception_ptr e, const char* file, int line) { try { std::rethrow_exception(e); + } catch (platform::EnforceNotMet& e) { + code_ = e.code(); + err_str_ = GetTraceBackString(e.what(), file, line); + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); } catch (std::exception& e) { err_str_ = GetTraceBackString(e.what(), file, line); + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); } } EnforceNotMet(const std::string& str, const char* file, int line) - : err_str_(GetTraceBackString(str, file, line)) {} + : err_str_(GetTraceBackString(str, file, line)) { + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); + } - EnforceNotMet(const platform::ErrorSummary& error, const char* file, int line) - : err_str_(GetTraceBackString(error.ToString(), file, line)) {} + EnforceNotMet(const ErrorSummary& error, const char* file, int line) + : code_(error.code()), + err_str_(GetTraceBackString(error.to_string(), file, line)) { + simple_err_str_ = SimplifyErrorTypeFormat(err_str_); + } - const char* what() const noexcept override { return err_str_.c_str(); } + const char* what() const noexcept override { + if (FLAGS_call_stack_level > 1) { + return err_str_.c_str(); + } else { + return simple_err_str_.c_str(); + } + } + + error::Code code() const { return code_; } + + const std::string& error_str() const { return err_str_; } + + const std::string& simple_error_str() const { return simple_err_str_; } + + void set_error_str(std::string str) { + if (FLAGS_call_stack_level > 1) { + err_str_ = str; + } else { + simple_err_str_ = str; + } + } + private: + // Used to determine the final type of exception thrown + error::Code code_ = error::LEGACY; + // Complete error message + // e.g. 
InvalidArgumentError: *** std::string err_str_; + // Simple errror message used when no C++ stack and python compile stack + // e.g. (InvalidArgument) *** + std::string simple_err_str_; }; #define PADDLE_THROW(...) \ @@ -352,21 +401,12 @@ struct EnforceNotMet : public std::exception { } \ } while (0) #else -#define PADDLE_ENFORCE(COND, ...) \ - do { \ - auto __cond__ = (COND); \ - if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ - try { \ - ::paddle::platform::throw_on_error( \ - __cond__, \ - ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString()); \ - } catch (...) { \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ - __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } \ - } \ +#define PADDLE_ENFORCE(COND, ...) \ + do { \ + auto __cond__ = (COND); \ + if (UNLIKELY(::paddle::platform::is_error(__cond__))) { \ + __THROW_ERROR_INTERNAL__(::paddle::platform::ErrorSummary(__VA_ARGS__)); \ + } \ } while (0) #endif @@ -384,40 +424,46 @@ struct EnforceNotMet : public std::exception { * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) */ -#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ - do { \ - if (UNLIKELY(nullptr == (__VAL))) { \ - __THROW_ERROR_INTERNAL__( \ - "%s\n [Hint: " #__VAL " should not be null.]", \ - ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString()); \ - } \ +#define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ + do { \ + if (UNLIKELY(nullptr == (__VAL))) { \ + auto __summary__ = ::paddle::platform::ErrorSummary(__VA_ARGS__); \ + auto __message__ = ::paddle::string::Sprintf( \ + "%s\n [Hint: " #__VAL " should not be null.]", \ + __summary__.error_message()); \ + __THROW_ERROR_INTERNAL__( \ + ::paddle::platform::ErrorSummary(__summary__.code(), __message__)); \ + } \ } while (0) -#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) \ - do { \ - auto __val1 = (__VAL1); \ - auto __val2 = (__VAL2); \ - using __TYPE1__ = decltype(__val1); \ - using __TYPE2__ = decltype(__val2); \ - using __COMMON_TYPE1__ = \ - ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \ - using __COMMON_TYPE2__ = \ - ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \ - bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \ - static_cast<__COMMON_TYPE2__>(__val2)); \ - if (UNLIKELY(!__is_not_error)) { \ - constexpr bool __kCanToString__ = \ - ::paddle::platform::details::CanToString<__TYPE1__>::kValue && \ - ::paddle::platform::details::CanToString<__TYPE2__>::kValue; \ - __THROW_ERROR_INTERNAL__( \ - "%s\n [Hint: Expected %s " #__CMP \ - " %s, but received %s " #__INV_CMP " %s.]", \ - ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString(), #__VAL1, \ - #__VAL2, ::paddle::platform::details::BinaryCompareMessageConverter< \ - __kCanToString__>::Convert(#__VAL1, __val1), \ - ::paddle::platform::details::BinaryCompareMessageConverter< \ - __kCanToString__>::Convert(#__VAL2, __val2)); \ - } \ +#define __PADDLE_BINARY_COMPARE(__VAL1, __VAL2, __CMP, __INV_CMP, ...) 
\ + do { \ + auto __val1 = (__VAL1); \ + auto __val2 = (__VAL2); \ + using __TYPE1__ = decltype(__val1); \ + using __TYPE2__ = decltype(__val2); \ + using __COMMON_TYPE1__ = \ + ::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \ + using __COMMON_TYPE2__ = \ + ::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \ + bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \ + static_cast<__COMMON_TYPE2__>(__val2)); \ + if (UNLIKELY(!__is_not_error)) { \ + auto __summary__ = ::paddle::platform::ErrorSummary(__VA_ARGS__); \ + constexpr bool __kCanToString__ = \ + ::paddle::platform::details::CanToString<__TYPE1__>::kValue && \ + ::paddle::platform::details::CanToString<__TYPE2__>::kValue; \ + auto __message__ = ::paddle::string::Sprintf( \ + "%s\n [Hint: Expected %s " #__CMP \ + " %s, but received %s " #__INV_CMP " %s.]", \ + __summary__.error_message(), #__VAL1, #__VAL2, \ + ::paddle::platform::details::BinaryCompareMessageConverter< \ + __kCanToString__>::Convert(#__VAL1, __val1), \ + ::paddle::platform::details::BinaryCompareMessageConverter< \ + __kCanToString__>::Convert(#__VAL2, __val2)); \ + __THROW_ERROR_INTERNAL__( \ + ::paddle::platform::ErrorSummary(__summary__.code(), __message__)); \ + } \ } while (0) #define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ @@ -458,26 +504,28 @@ struct EnforceNotMet : public std::exception { * Examples: * GET_DATA_SAFELY(ctx.Input("X"), "Input", "X", "Mul"); */ -#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \ - (([&]() -> std::add_lvalue_reference::type { \ - auto* __ptr = (__PTR); \ - if (UNLIKELY(nullptr == __ptr)) { \ - __THROW_ERROR_INTERNAL__( \ - "%s\n [Hint: pointer " #__PTR " should not be null.]", \ - paddle::platform::errors::NotFound( \ - "Unable to get %s data of %s %s in operator %s. " \ - "Possible reasons are:\n" \ - " 1. The %s is not the %s of operator %s;\n" \ - " 2. The %s has no corresponding variable passed in;\n" \ - " 3. The %s corresponding variable is not initialized.", \ - paddle::platform::demangle( \ - typeid(std::add_lvalue_reference::type) \ - .name()), \ - __ROLE, __NAME, __OP_TYPE, __NAME, __ROLE, __OP_TYPE, __NAME, \ - __NAME) \ - .ToString()); \ - } \ - return *__ptr; \ +#define GET_DATA_SAFELY(__PTR, __ROLE, __NAME, __OP_TYPE) \ + (([&]() -> std::add_lvalue_reference::type { \ + auto* __ptr = (__PTR); \ + if (UNLIKELY(nullptr == __ptr)) { \ + auto __summary__ = paddle::platform::errors::NotFound( \ + "Unable to get %s data of %s %s in operator %s. " \ + "Possible reasons are:\n" \ + " 1. The %s is not the %s of operator %s;\n" \ + " 2. The %s has no corresponding variable passed in;\n" \ + " 3. The %s corresponding variable is not initialized.", \ + paddle::platform::demangle( \ + typeid(std::add_lvalue_reference::type) \ + .name()), \ + __ROLE, __NAME, __OP_TYPE, __NAME, __ROLE, __OP_TYPE, __NAME, \ + __NAME); \ + auto __message__ = ::paddle::string::Sprintf( \ + "%s\n [Hint: pointer " #__PTR " should not be null.]", \ + __summary__.error_message()); \ + __THROW_ERROR_INTERNAL__( \ + ::paddle::platform::ErrorSummary(__summary__.code(), __message__)); \ + } \ + return *__ptr; \ })()) /* @@ -584,13 +632,13 @@ struct EOFException : public std::exception { END_HANDLE_THE_ERROR \ } while (0) -#define PADDLE_THROW_BAD_ALLOC(...) \ - do { \ - HANDLE_THE_ERROR \ - throw ::paddle::memory::allocation::BadAlloc( \ - ::paddle::platform::ErrorSummary(__VA_ARGS__).ToString(), __FILE__, \ - __LINE__); \ - END_HANDLE_THE_ERROR \ +#define PADDLE_THROW_BAD_ALLOC(...) 
\ + do { \ + HANDLE_THE_ERROR \ + throw ::paddle::memory::allocation::BadAlloc( \ + ::paddle::platform::ErrorSummary(__VA_ARGS__).to_string(), __FILE__, \ + __LINE__); \ + END_HANDLE_THE_ERROR \ } while (0) /** CUDA PADDLE ENFORCE FUNCTIONS AND MACROS **/ @@ -687,10 +735,6 @@ inline std::string build_nvidia_error_msg(cudaError_t e) { return sout.str(); } -inline void throw_on_error(cudaError_t e, const std::string& msg) { - throw std::runtime_error(msg); -} - /** curand ERROR **/ inline bool is_error(curandStatus_t stat) { return stat != CURAND_STATUS_SUCCESS; @@ -734,11 +778,6 @@ inline std::string build_nvidia_error_msg(curandStatus_t stat) { return msg + curandGetErrorString(stat) + " "; } -inline void throw_on_error(curandStatus_t stat, const std::string& msg) { - throw thrust::system_error(cudaErrorLaunchFailure, thrust::cuda_category(), - msg); -} - /***** CUDNN ERROR *****/ inline bool is_error(cudnnStatus_t stat) { return stat != CUDNN_STATUS_SUCCESS; @@ -749,10 +788,6 @@ inline std::string build_nvidia_error_msg(cudnnStatus_t stat) { return msg + platform::dynload::cudnnGetErrorString(stat) + " "; } -inline void throw_on_error(cudnnStatus_t stat, const std::string& msg) { - throw std::runtime_error(msg); -} - /***** CUBLAS ERROR *****/ inline bool is_error(cublasStatus_t stat) { return stat != CUBLAS_STATUS_SUCCESS; @@ -788,10 +823,6 @@ inline std::string build_nvidia_error_msg(cublasStatus_t stat) { return msg + cublasGetErrorString(stat) + " "; } -inline void throw_on_error(cublasStatus_t stat, const std::string& msg) { - throw std::runtime_error(msg); -} - /***** CUSOLVER ERROR *****/ inline bool is_error(cusolverStatus_t stat) { return stat != CUSOLVER_STATUS_SUCCESS; @@ -817,15 +848,12 @@ inline const char* cusolverGetErrorString(cusolverStatus_t stat) { return "Unknown cusolver status"; } } + inline std::string build_nvidia_error_msg(cusolverStatus_t stat) { std::string msg(" Cublas error, "); return msg + cusolverGetErrorString(stat) + " "; } -inline void throw_on_error(cusolverStatus_t stat, const std::string& msg) { - throw std::runtime_error(msg); -} - /****** NCCL ERROR ******/ #if !defined(__APPLE__) && defined(PADDLE_WITH_NCCL) inline bool is_error(ncclResult_t nccl_result) { @@ -836,10 +864,6 @@ inline std::string build_nvidia_error_msg(ncclResult_t nccl_result) { std::string msg(" Nccl error, "); return msg + platform::dynload::ncclGetErrorString(nccl_result) + " "; } - -inline void throw_on_error(ncclResult_t nccl_result, const std::string& msg) { - throw std::runtime_error(msg); -} #endif // not(__APPLE__) and PADDLE_WITH_NCCL namespace details { @@ -866,27 +890,18 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); } // namespace details -#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ - do { \ - auto __cond__ = (COND); \ - using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ - constexpr auto __success_type__ = \ - ::paddle::platform::details::CudaStatusType< \ - __CUDA_STATUS_TYPE__>::kSuccess; \ - if (UNLIKELY(__cond__ != __success_type__)) { \ - try { \ - ::paddle::platform::throw_on_error( \ - __cond__, \ - ::paddle::platform::errors::External( \ - ::paddle::platform::build_nvidia_error_msg(__cond__)) \ - .ToString()); \ - } catch (...) 
{ \ - HANDLE_THE_ERROR \ - throw ::paddle::platform::EnforceNotMet(std::current_exception(), \ - __FILE__, __LINE__); \ - END_HANDLE_THE_ERROR \ - } \ - } \ +#define PADDLE_ENFORCE_CUDA_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::CudaStatusType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = ::paddle::platform::errors::External( \ + ::paddle::platform::build_nvidia_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ } while (0) #undef DEFINE_CUDA_STATUS_TYPE diff --git a/paddle/fluid/platform/errors.cc b/paddle/fluid/platform/errors.cc index 85beefa59de5b..94a182f965678 100644 --- a/paddle/fluid/platform/errors.cc +++ b/paddle/fluid/platform/errors.cc @@ -68,7 +68,7 @@ std::string error_name(Code code) { } } -std::string ErrorSummary::ToString() const { +std::string ErrorSummary::to_string() const { std::string result(error_name(code())); result += ": "; result += error_message(); diff --git a/paddle/fluid/platform/errors.h b/paddle/fluid/platform/errors.h index a2f2e7c130ca2..6bcd5cf39f2e0 100644 --- a/paddle/fluid/platform/errors.h +++ b/paddle/fluid/platform/errors.h @@ -52,7 +52,7 @@ class ErrorSummary { const std::string& error_message() const { return msg_; } - std::string ToString() const; + std::string to_string() const; private: Code code_; diff --git a/paddle/fluid/pybind/exception.cc b/paddle/fluid/pybind/exception.cc index 776d480806223..3d07985ff654e 100644 --- a/paddle/fluid/pybind/exception.cc +++ b/paddle/fluid/pybind/exception.cc @@ -17,6 +17,21 @@ limitations under the License. */ namespace paddle { namespace pybind { +/* Paddle Exception mapping rules: + * - InvalidArgumentError -> ValueError + * - NotFoundError -> RuntimeError + * - OutOfRangeError -> IndexError + * - AlreadyExistsError -> RuntimeError + * - ResourceExhaustedError -> MemoryError + * - PreconditionNotMetError -> RuntimeError + * - PermissionDeniedError -> RuntimeError + * - ExecutionTimeoutError -> RuntimeError + * - UnimplementedError -> NotImplementedError + * - UnavailableError -> RuntimeError + * - FatalError -> SystemError + * - ExternalError -> OSError + */ + void BindException(pybind11::module* m) { static pybind11::exception eof(*m, "EOFException"); static pybind11::exception exc(*m, "EnforceNotMet"); @@ -26,7 +41,37 @@ void BindException(pybind11::module* m) { } catch (const platform::EOFException& e) { eof(e.what()); } catch (const platform::EnforceNotMet& e) { - exc(e.what()); + switch (e.code()) { + case paddle::platform::error::INVALID_ARGUMENT: + PyErr_SetString(PyExc_ValueError, e.what()); + break; + case paddle::platform::error::NOT_FOUND: + case paddle::platform::error::ALREADY_EXISTS: + case paddle::platform::error::PRECONDITION_NOT_MET: + case paddle::platform::error::PERMISSION_DENIED: + case paddle::platform::error::EXECUTION_TIMEOUT: + case paddle::platform::error::UNAVAILABLE: + PyErr_SetString(PyExc_RuntimeError, e.what()); + break; + case paddle::platform::error::OUT_OF_RANGE: + PyErr_SetString(PyExc_IndexError, e.what()); + break; + case paddle::platform::error::RESOURCE_EXHAUSTED: + PyErr_SetString(PyExc_MemoryError, e.what()); + break; + case paddle::platform::error::UNIMPLEMENTED: + PyErr_SetString(PyExc_NotImplementedError, e.what()); + break; + case paddle::platform::error::FATAL: + PyErr_SetString(PyExc_SystemError, e.what()); + break; + case 
paddle::platform::error::EXTERNAL: + PyErr_SetString(PyExc_OSError, e.what()); + break; + default: + exc(e.what()); + break; + } } }); diff --git a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py index c682c795019ca..1c5db616306ca 100644 --- a/python/paddle/fluid/tests/unittests/check_nan_inf_base.py +++ b/python/paddle/fluid/tests/unittests/check_nan_inf_base.py @@ -112,11 +112,11 @@ def check(use_cuda): print(type(e)) # Note. Enforce in cuda kernel may not catch in paddle, and # Exception type will be RuntimeError - assert type(e) == core.EnforceNotMet or type(e) == RuntimeError + assert type(e) == OSError or type(e) == RuntimeError try: check(use_cuda=False) assert False except Exception as e: print(e) print(type(e)) - assert type(e) == core.EnforceNotMet + assert type(e) == RuntimeError diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py index 2998ba85757e7..82f4bd763a29e 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_error.py @@ -20,7 +20,6 @@ import numpy as np import paddle import paddle.fluid as fluid -from paddle.fluid.core import EnforceNotMet from paddle.fluid.dygraph.dygraph_to_static import error from paddle.fluid.dygraph.dygraph_to_static.origin_info import unwrap @@ -76,9 +75,9 @@ def set_input(self): def set_message(self): self.expected_message = \ - ['File "{}", line 36, in func_error_in_compile_time'.format(self.filepath), + ['File "{}", line 35, in func_error_in_compile_time'.format(self.filepath), 'inner_func()', - 'File "{}", line 29, in inner_func'.format(self.filepath), + 'File "{}", line 28, in inner_func'.format(self.filepath), 'fluid.layers.fill_constant(shape=[1, 2], value=9, dtype="int")', ] @@ -130,13 +129,13 @@ def set_func(self): self.func = func_error_in_compile_time_2 def set_exception_type(self): - self.exception_type = EnforceNotMet + self.exception_type = ValueError def set_message(self): self.expected_message = \ [ - 'File "{}", line 47, in func_error_in_compile_time_2'.format(self.filepath), + 'File "{}", line 46, in func_error_in_compile_time_2'.format(self.filepath), 'x = fluid.layers.reshape(x, shape=[1, 2])' ] @@ -146,12 +145,12 @@ def set_func(self): self.func = func_error_in_runtime def set_exception_type(self): - self.exception_type = EnforceNotMet + self.exception_type = ValueError def set_message(self): self.expected_message = \ [ - 'File "{}", line 55, in func_error_in_runtime'.format(self.filepath), + 'File "{}", line 54, in func_error_in_runtime'.format(self.filepath), 'x = fluid.layers.reshape(x, shape=[1, two])' ] diff --git a/python/paddle/fluid/tests/unittests/test_assert_op.py b/python/paddle/fluid/tests/unittests/test_assert_op.py index 47dbb1092c5c6..f7ab991de56d2 100644 --- a/python/paddle/fluid/tests/unittests/test_assert_op.py +++ b/python/paddle/fluid/tests/unittests/test_assert_op.py @@ -14,6 +14,7 @@ from __future__ import print_function +import paddle import paddle.fluid as fluid import paddle.fluid.layers as layers import unittest @@ -42,7 +43,7 @@ def net_func(): shape=[1], dtype='bool', value=False) layers.Assert(condition) - with self.assertRaises(fluid.core.EnforceNotMet): + with self.assertRaises(ValueError): self.run_network(net_func) def test_assert_cond_numel_error(self): @@ -51,7 +52,7 @@ def net_func(): shape=[1, 2], dtype='bool', value=True) 
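             # Assert requires a single-element condition: a [1, 2] tensor has
             # numel == 2, so the op raises InvalidArgument, which the exception
             # mapping added in this patch surfaces in Python as ValueError.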
layers.Assert(condition, []) - with self.assertRaises(fluid.core.EnforceNotMet): + with self.assertRaises(ValueError): self.run_network(net_func) def test_assert_print_data(self): @@ -62,7 +63,7 @@ def net_func(): layers.Assert(condition, [zero, one]) print("test_assert_print_data") - with self.assertRaises(fluid.core.EnforceNotMet): + with self.assertRaises(ValueError): self.run_network(net_func) def test_assert_summary(self): @@ -72,7 +73,7 @@ def net_func(): layers.Assert(condition, (x, ), 5) print("test_assert_summary") - with self.assertRaises(fluid.core.EnforceNotMet): + with self.assertRaises(ValueError): self.run_network(net_func) def test_assert_summary_greater_than_size(self): @@ -82,9 +83,10 @@ def net_func(): layers.Assert(condition, [x], 10, name="test") print("test_assert_summary_greater_than_size") - with self.assertRaises(fluid.core.EnforceNotMet): + with self.assertRaises(ValueError): self.run_network(net_func) if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py b/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py index 042f03e19ab18..a7f4a15381b42 100644 --- a/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py +++ b/python/paddle/fluid/tests/unittests/test_c_comm_init_all_op.py @@ -15,6 +15,7 @@ from __future__ import print_function import unittest +import paddle import paddle.fluid.core as core import paddle.fluid as fluid @@ -34,7 +35,7 @@ def test_init_with_same_ring_id(self): program = fluid.Program() block = program.global_block() block.append_op(type='c_comm_init_all', attrs={'ring_id': 0}) - with self.assertRaises(core.EnforceNotMet): + with self.assertRaises(ValueError): self.exe.run(program) def test_specifying_devices(self): @@ -47,4 +48,5 @@ def test_specifying_devices(self): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_cholesky_op.py b/python/paddle/fluid/tests/unittests/test_cholesky_op.py index 2fcec657c1404..93f62b20f2997 100644 --- a/python/paddle/fluid/tests/unittests/test_cholesky_op.py +++ b/python/paddle/fluid/tests/unittests/test_cholesky_op.py @@ -118,7 +118,10 @@ def check_static_result(self, place, with_out=False): fetches = exe.run(fluid.default_main_program(), feed={"input": input_np}, fetch_list=[result]) - except fluid.core.EnforceNotMet as ex: + except RuntimeError as ex: + print("The mat is singular") + pass + except ValueError as ex: print("The mat is singular") pass @@ -135,10 +138,14 @@ def test_dygraph(self): input = fluid.dygraph.to_variable(input_np) try: result = paddle.cholesky(input) - except fluid.core.EnforceNotMet as ex: + except RuntimeError as ex: + print("The mat is singular") + pass + except ValueError as ex: print("The mat is singular") pass if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_exception.py b/python/paddle/fluid/tests/unittests/test_exception.py index 7dd6047968f53..adc7386bdeba6 100644 --- a/python/paddle/fluid/tests/unittests/test_exception.py +++ b/python/paddle/fluid/tests/unittests/test_exception.py @@ -14,9 +14,13 @@ from __future__ import print_function +import numpy +import unittest + +import paddle +import paddle.fluid as fluid import paddle.compat as cpt import paddle.fluid.core as core -import unittest class TestException(unittest.TestCase): @@ -24,7 +28,7 @@ def test_exception(self): exception = None try: core.__unittest_throw_exception__() 
- except core.EnforceNotMet as ex: + except RuntimeError as ex: self.assertIn("This is a test of exception", cpt.get_exception_message(ex)) exception = ex @@ -32,5 +36,43 @@ def test_exception(self): self.assertIsNotNone(exception) +class TestExceptionNoCStack(unittest.TestCase): + def setUp(self): + paddle.enable_static() + # test no C++ stack format + fluid.set_flags({'FLAGS_call_stack_level': 1}) + + def test_exception_in_static_mode(self): + x = fluid.layers.data(name='X', shape=[-1, 13], dtype='float32') + y = fluid.layers.data(name='Y', shape=[-1, 1], dtype='float32') + predict = fluid.layers.fc(input=x, size=1, act=None) + loss = fluid.layers.square_error_cost(input=predict, label=y) + avg_loss = fluid.layers.mean(loss) + + fluid.optimizer.SGD(learning_rate=0.01).minimize(avg_loss) + + place = fluid.CPUPlace() + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + x = numpy.random.random(size=(8, 12)).astype('float32') + y = numpy.random.random(size=(8, 1)).astype('float32') + + with self.assertRaises(ValueError): + exe.run(fluid.default_main_program(), + feed={'X': x, + 'Y': y}, + fetch_list=[avg_loss.name]) + + def test_exception_in_dynamic_mode(self): + place = fluid.CPUPlace() + with fluid.dygraph.guard(place): + x = numpy.random.random(size=(10, 2)).astype('float32') + linear = fluid.dygraph.Linear(1, 10) + data = fluid.dygraph.to_variable(x) + with self.assertRaises(ValueError): + res = linear(data) + + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py index b18b5456c12aa..5bc2d1cda180b 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_any_like_op.py @@ -87,7 +87,7 @@ def test_check_output(self): exception = None try: self.check_output(check_dygraph=False) - except core.EnforceNotMet as ex: + except ValueError as ex: exception = ex self.assertIsNotNone(exception) @@ -98,4 +98,5 @@ def init(self): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_histogram_op.py b/python/paddle/fluid/tests/unittests/test_histogram_op.py index f540b885e12ee..7da9dbd62e9f9 100644 --- a/python/paddle/fluid/tests/unittests/test_histogram_op.py +++ b/python/paddle/fluid/tests/unittests/test_histogram_op.py @@ -77,7 +77,7 @@ def net_func(): shape=[3, 4], dtype='float32', value=3.0) paddle.histogram(input=input_value, bins=-1, min=1, max=5) - with self.assertRaises(fluid.core.EnforceNotMet): + with self.assertRaises(IndexError): self.run_network(net_func) def test_min_max_error(self): @@ -88,7 +88,7 @@ def net_func(): shape=[3, 4], dtype='float32', value=3.0) paddle.histogram(input=input_value, bins=1, min=5, max=1) - with self.assertRaises(fluid.core.EnforceNotMet): + with self.assertRaises(ValueError): self.run_network(net_func) def test_min_max_range_error(self): @@ -99,7 +99,7 @@ def net_func(): shape=[3, 4], dtype='float32', value=3.0) paddle.histogram(input=input_value, bins=1, min=-np.inf, max=5) - with self.assertRaises(fluid.core.EnforceNotMet): + with self.assertRaises(ValueError): self.run_network(net_func) def test_type_errors(self): @@ -138,4 +138,5 @@ def test_check_output(self): if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_basic.py b/python/paddle/fluid/tests/unittests/test_imperative_basic.py index 
7378975aa3795..8892c08a470d4 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_basic.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_basic.py @@ -333,7 +333,7 @@ def test_empty_var(self): try: new_variable.numpy() except Exception as e: - assert type(e) == core.EnforceNotMet + assert type(e) == ValueError try: new_variable.backward() @@ -689,4 +689,5 @@ def test_without_guard(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py index 9b2d71c9f9077..2a3a1e8b0a366 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_data_loader_process.py @@ -96,7 +96,7 @@ def __reader__(): exception = None try: _reader_process_loop(loader._batch_reader, loader._data_queue) - except core.EnforceNotMet as ex: + except ValueError as ex: exception = ex self.assertIsNotNone(exception) diff --git a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py index 600ee6d10e5de..8f3116f653514 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_double_grad.py @@ -356,7 +356,7 @@ def raise_no_grad_op(self): loss.backward() def test_raise(self): - self.assertRaises(fluid.core.EnforceNotMet, self.raise_no_grad_op) + self.assertRaises(RuntimeError, self.raise_no_grad_op) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py index a0da4b0efee64..d783a2cc752d2 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_signal_handler.py @@ -50,8 +50,8 @@ def __test_process__(): set_child_signal_handler(id(self), test_process.pid) time.sleep(5) - except core.EnforceNotMet as ex: - self.assertIn("FatalError", cpt.get_exception_message(ex)) + except SystemError as ex: + self.assertIn("Fatal", cpt.get_exception_message(ex)) exception = ex self.assertIsNotNone(exception) @@ -68,7 +68,7 @@ def __test_process__(): set_child_signal_handler(id(self), test_process.pid) time.sleep(3) - except core.EnforceNotMet as ex: + except SystemError as ex: self.assertIn("Segmentation fault", cpt.get_exception_message(ex)) exception = ex @@ -86,7 +86,7 @@ def __test_process__(): set_child_signal_handler(id(self), test_process.pid) time.sleep(3) - except core.EnforceNotMet as ex: + except SystemError as ex: self.assertIn("Bus error", cpt.get_exception_message(ex)) exception = ex diff --git a/python/paddle/fluid/tests/unittests/test_inverse_op.py b/python/paddle/fluid/tests/unittests/test_inverse_op.py index fd540dcd741ee..85c4c6000a684 100644 --- a/python/paddle/fluid/tests/unittests/test_inverse_op.py +++ b/python/paddle/fluid/tests/unittests/test_inverse_op.py @@ -153,7 +153,10 @@ def check_static_result(self, place): fetches = exe.run(fluid.default_main_program(), feed={"input": input_np}, fetch_list=[result]) - except fluid.core.EnforceNotMet as ex: + except RuntimeError as ex: + print("The mat is singular") + pass + except ValueError as ex: print("The mat is singular") pass @@ -168,10 +171,14 @@ def test_dygraph(self): input = fluid.dygraph.to_variable(input_np) try: result 
= paddle.inverse(input) - except fluid.core.EnforceNotMet as ex: + except RuntimeError as ex: + print("The mat is singular") + pass + except ValueError as ex: print("The mat is singular") pass if __name__ == "__main__": + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_multinomial_op.py b/python/paddle/fluid/tests/unittests/test_multinomial_op.py index db4978930e049..b22f6b80df79a 100644 --- a/python/paddle/fluid/tests/unittests/test_multinomial_op.py +++ b/python/paddle/fluid/tests/unittests/test_multinomial_op.py @@ -186,29 +186,28 @@ def test_num_sample_less_than_0(): x = paddle.rand([4]) paddle.multinomial(x, num_samples=-2) - self.assertRaises(core.EnforceNotMet, test_num_sample_less_than_0) + self.assertRaises(ValueError, test_num_sample_less_than_0) def test_replacement_False(self): def test_samples_larger_than_categories(): x = paddle.rand([4]) paddle.multinomial(x, num_samples=5, replacement=False) - self.assertRaises(core.EnforceNotMet, - test_samples_larger_than_categories) + self.assertRaises(ValueError, test_samples_larger_than_categories) def test_input_probs_dim(self): def test_dim_larger_than_2(): x = paddle.rand([2, 3, 3]) paddle.multinomial(x) - self.assertRaises(core.EnforceNotMet, test_dim_larger_than_2) + self.assertRaises(ValueError, test_dim_larger_than_2) def test_dim_less_than_1(): x_np = np.random.random([]) x = paddle.to_tensor(x_np) paddle.multinomial(x) - self.assertRaises(core.EnforceNotMet, test_dim_less_than_1) + self.assertRaises(ValueError, test_dim_less_than_1) if __name__ == "__main__": diff --git a/python/paddle/fluid/tests/unittests/test_multiply.py b/python/paddle/fluid/tests/unittests/test_multiply.py index abd0c15dc7235..09a2007c1adb3 100755 --- a/python/paddle/fluid/tests/unittests/test_multiply.py +++ b/python/paddle/fluid/tests/unittests/test_multiply.py @@ -194,7 +194,7 @@ def test_errors(self): with program_guard(Program(), Program()): x = paddle.static.data(name='x', shape=[20, 50], dtype=np.float64) y = paddle.static.data(name='y', shape=[20], dtype=np.float64) - self.assertRaises(fluid.core.EnforceNotMet, tensor.multiply, x, y) + self.assertRaises(ValueError, tensor.multiply, x, y) np.random.seed(7) # test dynamic computation graph: dtype can not be int8 @@ -203,21 +203,21 @@ def test_errors(self): y_data = np.random.randn(200).astype(np.int8) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) - self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y) + self.assertRaises(RuntimeError, paddle.multiply, x, y) # test dynamic computation graph: inputs must be broadcastable x_data = np.random.rand(200, 5) y_data = np.random.rand(200) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) - self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y) + self.assertRaises(ValueError, paddle.multiply, x, y) # test dynamic computation graph: inputs must be broadcastable(python) x_data = np.random.rand(200, 5) y_data = np.random.rand(200) x = paddle.to_tensor(x_data) y = paddle.to_tensor(y_data) - self.assertRaises(fluid.core.EnforceNotMet, paddle.multiply, x, y) + self.assertRaises(ValueError, paddle.multiply, x, y) # test dynamic computation graph: dtype must be same x_data = np.random.randn(200).astype(np.int64) diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py index 9634f5af30a46..95e2462a2e298 100644 --- 
a/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_reader_exception.py @@ -98,7 +98,7 @@ def __impl__(): exe.run(feed=data, fetch_list=[image_p_1]) num += 1 self.assertEquals(num, batch_num) - except fluid.core.EnforceNotMet as ex: + except SystemError as ex: self.assertEquals(num, 0) raise ReaderException() else: @@ -113,7 +113,7 @@ def __impl__(): reader.reset() self.assertFalse(self.raise_exception) self.assertEquals(num, batch_num) - except fluid.core.EnforceNotMet as ex: + except SystemError as ex: self.assertTrue(self.raise_exception) self.assertEquals(num, 0) raise ReaderException() @@ -152,4 +152,5 @@ def setUp(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py b/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py index 339f689998f81..9b7ba5c4b052f 100644 --- a/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py +++ b/python/paddle/fluid/tests/unittests/test_nn_functional_hot_op.py @@ -18,6 +18,7 @@ import numpy as np import math from op_test import OpTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.nn.functional as functional @@ -152,7 +153,7 @@ def run(): fetch_list=[one_hot_out], return_numpy=False) - self.assertRaises(core.EnforceNotMet, run) + self.assertRaises(ValueError, run) class TestOneHotOpApi(unittest.TestCase): @@ -204,4 +205,5 @@ def test_bad_x(): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_op.py index 34a1087c9ff1b..e1da94c1219ca 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py @@ -18,6 +18,7 @@ import numpy as np import math from op_test import OpTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.framework as framework @@ -172,7 +173,7 @@ def run(): fetch_list=[one_hot_out], return_numpy=False) - self.assertRaises(core.EnforceNotMet, run) + self.assertRaises(ValueError, run) class TestOneHotOpError(unittest.TestCase): @@ -200,4 +201,5 @@ def test_errors(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py index 9b6c2b1fd8731..66de1b309797f 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py @@ -18,6 +18,7 @@ import numpy as np import math from op_test import OpTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core import paddle.fluid.framework as framework @@ -169,7 +170,7 @@ def run(): fetch_list=[one_hot_out], return_numpy=False) - self.assertRaises(core.EnforceNotMet, run) + self.assertRaises(ValueError, run) class TestOneHotOpApi(unittest.TestCase): @@ -220,4 +221,5 @@ def test_bad_x(): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py index 831e2e761088b..2597df7faff54 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py +++ 
b/python/paddle/fluid/tests/unittests/test_parallel_executor_feed_persistable_var.py @@ -16,6 +16,7 @@ from functools import partial import numpy import unittest +import paddle import paddle.fluid.core as core import paddle.fluid as fluid from simple_nets import init_data, simple_fc_net @@ -76,13 +77,14 @@ def test_feed_persistable_var(self): self.feed_dict['learning_rate'] = numpy.array( [1.0, 1.0]).astype("float32") run = partial(self.check_feed_persistable_var, self.feed_dict) - self.assertRaises(core.EnforceNotMet, run) + self.assertRaises(RuntimeError, run) self.feed_dict['image'] = self.img[0, :] self.feed_dict['label'] = self.label[0, :] run = partial(self.check_feed_persistable_var, self.feed_dict) - self.assertRaises(core.EnforceNotMet, run) + self.assertRaises(RuntimeError, run) if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_pow.py b/python/paddle/fluid/tests/unittests/test_pow.py index 0764cb580e40d..a468b24a79a21 100755 --- a/python/paddle/fluid/tests/unittests/test_pow.py +++ b/python/paddle/fluid/tests/unittests/test_pow.py @@ -218,8 +218,8 @@ def test_errors(self): np.random.randint(5, 10)) x = (np.random.rand(*dims) * 10).astype(np.float64) y = (np.random.rand(dims[-1] + 1) * 10).astype(np.float64) - self.assertRaises(fluid.core.EnforceNotMet, _run_power, DYNAMIC, x, y) - self.assertRaises(fluid.core.EnforceNotMet, _run_power, STATIC, x, y) + self.assertRaises(ValueError, _run_power, DYNAMIC, x, y) + self.assertRaises(ValueError, _run_power, STATIC, x, y) # test dynamic computation graph: inputs must be broadcastable dims = (np.random.randint(1, 10), np.random.randint(5, 10), diff --git a/python/paddle/fluid/tests/unittests/test_retain_graph.py b/python/paddle/fluid/tests/unittests/test_retain_graph.py index de94e0b0fcd2d..79664fe4b12fb 100644 --- a/python/paddle/fluid/tests/unittests/test_retain_graph.py +++ b/python/paddle/fluid/tests/unittests/test_retain_graph.py @@ -136,8 +136,7 @@ def run_retain(self, need_retain): def test_retain(self): self.run_retain(need_retain=True) - self.assertRaises( - fluid.core.EnforceNotMet, self.run_retain, need_retain=False) + self.assertRaises(RuntimeError, self.run_retain, need_retain=False) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_reverse_op.py b/python/paddle/fluid/tests/unittests/test_reverse_op.py index 5aaf0b85d504f..9b739eff97cbe 100644 --- a/python/paddle/fluid/tests/unittests/test_reverse_op.py +++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py @@ -17,6 +17,7 @@ import unittest import numpy as np from op_test import OpTest +import paddle import paddle.fluid as fluid from paddle.fluid import core @@ -107,7 +108,7 @@ def _run_program(): x = np.random.random(size=(10, 1, 1, 1, 1, 1, 1)).astype('int64') exe.run(train_program, feed={"label": x}) - self.assertRaises(core.EnforceNotMet, _run_program) + self.assertRaises(IndexError, _run_program) class TestReverseLoDTensorArray(unittest.TestCase): @@ -182,4 +183,5 @@ def test_raise_error(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py b/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py index 83c6ad8412210..21fdeeeb3e645 100644 --- a/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py +++ b/python/paddle/fluid/tests/unittests/test_runtime_and_compiletime_exception.py @@ -17,6 +17,7 @@ import 
unittest import numpy as np from op_test import OpTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -36,12 +37,12 @@ def _run_program(): x = np.random.random(size=(10)).astype('int64') exe.run(train_program, feed={"label": x}) - self.assertRaises(core.EnforceNotMet, _run_program) + self.assertRaises(ValueError, _run_program) class TestCompileTimeException(unittest.TestCase): def test_compile_time_exception(self): - self.assertRaises(core.EnforceNotMet, self.build_model) + self.assertRaises(ValueError, self.build_model) def build_model(self): train_program = fluid.Program() @@ -53,4 +54,5 @@ def build_model(self): if __name__ == '__main__': + paddle.enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor.py b/python/paddle/fluid/tests/unittests/test_tensor.py index 03dffe4e5a2fe..f8f3eea78a6a2 100644 --- a/python/paddle/fluid/tests/unittests/test_tensor.py +++ b/python/paddle/fluid/tests/unittests/test_tensor.py @@ -356,7 +356,7 @@ def test_tensor_set_error(self): try: error_array = ["1", "2"] tensor.set(error_array, place) - except core.EnforceNotMet as ex: + except ValueError as ex: exception = ex self.assertIsNotNone(exception) From fdc06f21587b6d65196a65ff9edacc09442296fb Mon Sep 17 00:00:00 2001 From: Zhang Ting Date: Tue, 27 Oct 2020 11:21:33 +0800 Subject: [PATCH 057/185] add Fuse bn add act pass (#28196) * add fuse_bn_add_act pass --- paddle/fluid/framework/details/CMakeLists.txt | 2 +- .../fluid/framework/details/build_strategy.cc | 8 + .../fluid/framework/details/build_strategy.h | 1 + paddle/fluid/framework/ir/CMakeLists.txt | 1 + .../framework/ir/fuse_bn_add_act_pass.cc | 365 ++++++++++++++++++ .../fluid/framework/ir/fuse_bn_add_act_pass.h | 75 ++++ .../framework/ir/graph_pattern_detector.cc | 186 +++++++++ .../framework/ir/graph_pattern_detector.h | 72 ++++ .../fused/fused_bn_add_activation_op.cc | 2 - .../fused/fused_bn_add_activation_op.cu | 1 - .../fused/fused_bn_add_activation_op.h | 1 - paddle/fluid/pybind/pybind.cc | 25 ++ .../fluid/tests/unittests/CMakeLists.txt | 2 + ...dd_act.py => test_fuse_bn_add_act_pass.py} | 46 ++- 14 files changed, 771 insertions(+), 16 deletions(-) create mode 100644 paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc create mode 100644 paddle/fluid/framework/ir/fuse_bn_add_act_pass.h rename python/paddle/fluid/tests/unittests/{test_fused_bn_add_act.py => test_fuse_bn_add_act_pass.py} (85%) diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index 8281ec2143890..29db49a47cffa 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -107,7 +107,7 @@ cc_test(exception_holder_test SRCS exception_holder_test.cc ) set(IR_PASS_DEPS graph_viz_pass multi_devices_graph_pass multi_devices_graph_print_pass multi_devices_graph_check_pass - fuse_elewise_add_act_pass fuse_bn_act_pass + fuse_elewise_add_act_pass fuse_bn_act_pass fuse_bn_add_act_pass multi_batch_merge_pass fuse_relu_depthwise_conv_pass lock_free_optimize_pass diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 962f968c84ea4..678946fbc5133 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -164,6 +164,7 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPassWithCheck(strategy_.fuse_relu_depthwise_conv_, "fuse_relu_depthwise_conv_pass"); 
AppendPassWithCheck(strategy_.fuse_bn_act_ops_, "fuse_bn_act_pass"); + AppendPassWithCheck(strategy_.fuse_bn_add_act_ops_, "fuse_bn_add_act_pass"); #if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) AppendPassWithCheck(strategy_.enable_auto_fusion_, "fusion_group_pass"); #else @@ -390,6 +391,12 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph, "GPU, skipped."; continue; } + } else if (pass->Type() == "fuse_bn_add_act_pass") { + if (!use_cuda) { + LOG(WARNING) << "fuse_bn_add_act_pass is only supported on " + "GPU, skipped."; + continue; + } } else if (pass->Type() == "mkldnn_placement_pass") { pass->Set("mkldnn_enabled_op_types", new std::unordered_set(mkldnn_enabled_op_types_)); @@ -416,6 +423,7 @@ USE_PASS(sync_batch_norm_pass); USE_PASS(fuse_relu_depthwise_conv_pass); USE_PASS(fuse_elewise_add_act_pass); USE_PASS(fuse_bn_act_pass); +USE_PASS(fuse_bn_add_act_pass); USE_PASS(graph_viz_pass); USE_PASS(multi_batch_merge_pass); USE_PASS(reduce_mode_multi_devices_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 87b27eaa440cc..bc275cb8f3bce 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -100,6 +100,7 @@ struct BuildStrategy { // TODO(dev-paddle): fuse_elewise_add_act_ops may cause some models have // cycle. bool fuse_bn_act_ops_{false}; + bool fuse_bn_add_act_ops_{true}; bool fuse_elewise_add_act_ops_{false}; bool enable_auto_fusion_{false}; // Fuse_all_optimizer_ops and fuse_all_reduce_ops require that gradients diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index 9415fe6e61e08..f9ab60c5c7478 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -114,6 +114,7 @@ if(WITH_MKLDNN) endif() cc_library(fuse_bn_act_pass SRCS fuse_bn_act_pass.cc DEPS pass graph_pattern_detector ) +cc_library(fuse_bn_add_act_pass SRCS fuse_bn_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_elewise_add_act_pass SRCS fuse_elewise_add_act_pass.cc DEPS pass graph_pattern_detector ) cc_library(fuse_relu_depthwise_conv_pass SRCS fuse_relu_depthwise_conv_pass.cc DEPS pass graph_pattern_detector ) diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc new file mode 100644 index 0000000000000..774f655c7bb6d --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.cc @@ -0,0 +1,365 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
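+
+// In brief (see the implementation below): this pass matches the subgraph
+// act(batch_norm(x) + z) in the forward graph, and the corresponding grad
+// subgraph in the backward graph, and rewrites each match into the
+// fused_bn_add_activation / fused_bn_add_activation_grad operators. Only
+// relu is fused at present; the pattern matches only FP16 variables with
+// the NHWC data layout, and the rewrite is compiled in only when built
+// with CUDA and cuDNN >= 7.4.1.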
+
+#include "paddle/fluid/framework/ir/fuse_bn_add_act_pass.h"
+#include 
+#include 
+#include "paddle/fluid/framework/framework.pb.h"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/platform/enforce.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/platform/cudnn_helper.h"
+#endif
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+void FuseBatchNormAddActPass::ApplyImpl(ir::Graph *graph) const {
+#ifdef PADDLE_WITH_CUDA
+#if CUDNN_VERSION_MIN(7, 4, 1)
+  // forward
+  std::unordered_set act_types = {"relu"};
+  graph = FuseBatchNormAddAct(graph, act_types);
+  // backward
+  std::unordered_set act_grad_types = {"relu_grad"};
+  graph = FuseBatchNormAddActGrad(graph, act_grad_types);
+#endif
+#endif
+}
+
+// act(bn(x) + z)
+ir::Graph *FuseBatchNormAddActPass::FuseBatchNormAddAct(
+    ir::Graph *graph, const std::unordered_set &act_types) const {
+  PADDLE_ENFORCE_NE(
+      graph, nullptr,
+      platform::errors::InvalidArgument(
+          "The input graph of FuseBatchNormAddAct should not be nullptr."));
+  FusePassBase::Init("bn_add_act", graph);
+
+  GraphPatternDetector gpd;
+  auto *x = gpd.mutable_pattern()
+                ->NewNode("bn_add_act/x")
+                ->AsInput()
+                ->assert_is_op_input("batch_norm", "X")
+                ->assert_var_dtype(proto::VarType::FP16);
+  patterns::BatchNormAddAct bn_add_act_pattern(gpd.mutable_pattern(),
+                                               "bn_add_act");
+
+  bn_add_act_pattern(x, act_types);
+
+  int found_bn_add_act_count = 0;
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    VLOG(4) << "handle FuseBatchNormAddAct fuse";
+    // BN inputs
+    GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, bn_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, bn_add_act_pattern);
+    // BN outputs
+    GET_IR_NODE_FROM_SUBGRAPH(bn_mean_out, bn_mean_out, bn_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_variance_out, bn_variance_out,
+                              bn_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance,
+                              bn_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, bn_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_reserve_space, bn_reserve_space,
+                              bn_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(bn_out, bn_out, bn_add_act_pattern);
+    // Add inputs
+    GET_IR_NODE_FROM_SUBGRAPH(elewise_add_in, elewise_add_in,
+                              bn_add_act_pattern);
+    // Add outputs
+    GET_IR_NODE_FROM_SUBGRAPH(elewise_add_out, elewise_add_out,
+                              bn_add_act_pattern);
+    // ACT output
+    GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, bn_add_act_pattern);
+    // ops
+    GET_IR_NODE_FROM_SUBGRAPH(batch_norm, batch_norm, bn_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(elewise_add, elewise_add, bn_add_act_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(act, act, bn_add_act_pattern);
+
+    std::string bn_x_n = subgraph.at(x)->Name();
+    std::string elewise_add_in_n = elewise_add_in->Name();
+    std::string bn_scale_n = bn_scale->Name();
+    std::string bn_bias_n = bn_bias->Name();
+    std::string bn_mean_out_n = bn_mean_out->Name();
+    std::string bn_variance_out_n = bn_variance_out->Name();
+    std::string bn_saved_variance_n = bn_saved_variance->Name();
+    std::string bn_saved_mean_n = bn_saved_mean->Name();
+    std::string bn_reserve_space_n = bn_reserve_space->Name();
+    std::string bn_out_n = bn_out->Name();
+    std::string elewise_add_out_n = elewise_add_out->Name();
+    std::string act_out_n = act_out->Name();
+
+    Node *fused_bn_add_act_node = CreateFusedBatchNormAddActNode(
+        g, act, elewise_add, batch_norm, bn_x_n, elewise_add_in_n, bn_scale_n,
+        bn_bias_n, bn_mean_out_n, bn_variance_out_n, bn_saved_variance_n,
+
bn_saved_mean_n, bn_reserve_space_n, act_out_n); + + VLOG(4) << "\n\t " << bn_x_n << ", " << bn_scale_n << ", " << bn_bias_n + << " -> " << batch_norm->Name() << " -> " << bn_mean_out_n << ", " + << bn_variance_out_n << ", " << bn_saved_variance_n << ", " + << bn_saved_mean_n << ", " << bn_reserve_space_n << " and " + << bn_out_n << "\n" + << "\t " << bn_out_n << " and " << elewise_add_in_n << " -> " + << elewise_add->Name() << " -> " << elewise_add_out_n << "\n" + << "\t " << elewise_add_out_n << " -> " << act->Name() << " -> " + << act_out_n; + + ReLinkNodes(g, batch_norm, elewise_add, act, fused_bn_add_act_node); + found_bn_add_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_bn_add_act_count); + return graph; +} + +Node *FuseBatchNormAddActPass::CreateFusedBatchNormAddActNode( + Graph *g, const Node *act, const Node *elewise_add, const Node *bn, + const std::string &bn_x_n, const std::string &elewise_add_in_n, + const std::string &bn_scale_n, const std::string &bn_bias_n, + const std::string &bn_mean_out_n, const std::string &bn_variance_out_n, + const std::string &bn_saved_variance_n, const std::string &bn_saved_mean_n, + const std::string &bn_reserve_space_n, const std::string &act_out_n) const { + OpDesc desc; + desc.SetInput("X", std::vector({bn_x_n})); + desc.SetInput("Z", std::vector({elewise_add_in_n})); + desc.SetInput("Scale", std::vector({bn_scale_n})); + desc.SetInput("Bias", std::vector({bn_bias_n})); + + desc.SetOutput("Y", std::vector({act_out_n})); + desc.SetOutput("MeanOut", std::vector({bn_mean_out_n})); + desc.SetOutput("VarianceOut", std::vector({bn_variance_out_n})); + desc.SetOutput("SavedMean", std::vector({bn_saved_mean_n})); + desc.SetOutput("SavedVariance", + std::vector({bn_saved_variance_n})); + desc.SetOutput("ReserveSpace", + std::vector({bn_reserve_space_n})); + desc.SetType("fused_bn_add_activation"); + + desc.SetAttr("act_type", act->Name()); + // Set attrs + for (auto &n : {act->Op(), elewise_add->Op(), bn->Op()}) { + for (auto &m : n->GetAttrMap()) { + desc.SetAttr(m.first, m.second); + } + } + + auto fused_bn_add_act_node = g->CreateOpNode(&desc); + return fused_bn_add_act_node; +} + +// the backward of act(bn(x) + z) +ir::Graph *FuseBatchNormAddActPass::FuseBatchNormAddActGrad( + ir::Graph *graph, + const std::unordered_set &act_grad_types) const { + PADDLE_ENFORCE_NE( + graph, nullptr, + platform::errors::InvalidArgument( + "The input graph of FuseBatchNormAddActGrad should not be nullptr.")); + FusePassBase::Init("bn_add_act_grad", graph); + + GraphPatternDetector gpd; + auto *d_act_out = + gpd.mutable_pattern() + ->NewNode("bn_add_act_grad/x") + ->AsInput() + ->assert_is_ops_input(act_grad_types, GradVarName("Out")) + ->assert_var_dtype(proto::VarType::FP16); + patterns::BatchNormAddActGrad bn_add_act_grad_pattern(gpd.mutable_pattern(), + "bn_add_act_grad"); + bn_add_act_grad_pattern(d_act_out, act_grad_types); + + int found_bn_add_act_count = 0; + + auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph, + Graph *g) { + VLOG(4) << "handle FuseBatchNormAddActGrad fuse"; + GET_IR_NODE_FROM_SUBGRAPH(act_grad, act_grad, bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(elewise_add_grad, elewise_add_grad, + bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(batch_norm_grad, batch_norm_grad, + bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(act_out, act_out, bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_act_x, d_act_x, bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_bn_out, d_bn_out, 
bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_x, bn_x, bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_scale, bn_scale, bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_bias, bn_bias, bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_saved_mean, bn_saved_mean, + bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_saved_variance, bn_saved_variance, + bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(bn_reserve_space, bn_reserve_space, + bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_bn_x, d_bn_x, bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_bn_scale, d_bn_scale, bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_bn_bias, d_bn_bias, bn_add_act_grad_pattern); + GET_IR_NODE_FROM_SUBGRAPH(d_elewise_add_in, d_elewise_add_in, + bn_add_act_grad_pattern); + + std::string d_act_out_n = subgraph.at(d_act_out)->Name(); // Y@GRAD + std::string act_out_n = act_out->Name(); // Y + std::string d_act_x_n = d_act_x->Name(); + std::string bn_x_n = bn_x->Name(); + std::string bn_scale_n = bn_scale->Name(); + std::string bn_bias_n = bn_bias->Name(); + std::string bn_saved_mean_n = bn_saved_mean->Name(); + std::string bn_saved_variance_n = bn_saved_variance->Name(); + std::string bn_reserve_space_n = bn_reserve_space->Name(); + std::string d_bn_out_n = d_bn_out->Name(); + std::string d_bn_x_n = d_bn_x->Name(); + std::string d_bn_scale_n = d_bn_scale->Name(); + std::string d_bn_bias_n = d_bn_bias->Name(); + std::string d_elewise_add_in_n = d_elewise_add_in->Name(); + + OpDesc desc; + desc.SetType("fused_bn_add_activation_grad"); + desc.SetInput("X", {bn_x_n}); + desc.SetInput("Y", std::vector({act_out_n})); + desc.SetInput(GradVarName("Y"), std::vector({d_act_out_n})); + desc.SetInput("Scale", std::vector({bn_scale_n})); + desc.SetInput("Bias", std::vector({bn_bias_n})); + desc.SetInput("SavedMean", std::vector({bn_saved_mean_n})); + desc.SetInput("SavedVariance", + std::vector({bn_saved_variance_n})); + desc.SetInput("ReserveSpace", + std::vector({bn_reserve_space_n})); + desc.SetOutput(GradVarName("X"), std::vector({d_bn_x_n})); + desc.SetOutput(GradVarName("Z"), + std::vector({d_elewise_add_in_n})); + desc.SetOutput(GradVarName("Scale"), + std::vector({d_bn_scale_n})); + desc.SetOutput(GradVarName("Bias"), + std::vector({d_bn_bias_n})); + std::string act = act_grad->Name(); + act = act.substr(0, act.length() - 5); // remove "_grad" + desc.SetAttr("act_type", act); + + for (auto &n : + {act_grad->Op(), elewise_add_grad->Op(), batch_norm_grad->Op()}) { + for (auto &m : n->GetAttrMap()) { + desc.SetAttr(m.first, m.second); + } + } + + auto fused_node = g->CreateOpNode(&desc); + + VLOG(4) << "\n\t " << d_act_out_n << " and " << act_out_n << " -> " + << act_grad->Name() << " -> " << d_act_x_n << "\n\t "; + VLOG(4) << d_act_x_n << " -> " << elewise_add_grad->Name() << " -> " + << d_elewise_add_in_n << "," << d_bn_out_n << "\n\t "; + VLOG(4) << bn_x_n << ", " << d_bn_out_n << ", " << bn_scale_n << ", " + << bn_bias_n << ", " << bn_saved_mean_n << ", " + << bn_saved_variance_n << " and " << bn_reserve_space_n << " -> " + << batch_norm_grad->Name() << " -> " << d_bn_x_n << ", " + << d_bn_scale_n << " and " << d_bn_bias_n; + + ReLinkNodes(g, act_grad, elewise_add_grad, batch_norm_grad, fused_node); + found_bn_add_act_count++; + }; + + gpd(graph, handler); + + AddStatis(found_bn_add_act_count); + return graph; +} + +void FuseBatchNormAddActPass::ReLinkNodes(Graph *graph, Node *op_1, Node *op_2, + Node *op_3, + Node *fused_op) const { // delete act + // 
link inputs of op_1 to fused_op + for (auto &in : op_1->inputs) { + fused_op->inputs.emplace_back(in); + in->outputs = this->ReplaceNode(op_1, fused_op, in->outputs); + } + + std::unordered_set nodes2delete; + + LinkOutputsToFuseOp(op_1, op_2, fused_op, &nodes2delete); + LinkOutputsToFuseOp(op_2, op_3, fused_op, &nodes2delete); + LinkInputsToFuseOp(op_2, fused_op, &nodes2delete); + LinkInputsToFuseOp(op_3, fused_op, &nodes2delete); + + for (auto &out : op_3->outputs) { + IR_OP_VAR_LINK(fused_op, out); + } + + nodes2delete.insert(std::move(op_1)); + nodes2delete.insert(std::move(op_2)); + nodes2delete.insert(std::move(op_3)); + + GraphSafeRemoveNodes(graph, nodes2delete); +} + +void FuseBatchNormAddActPass::LinkOutputsToFuseOp( + Node *op_1, Node *op_2, Node *fused_op, + std::unordered_set *nodes2delete) const { + // if the outputs of op_1 are inputs of op_2, add the outputs to nodes2delete + // otherwise link the outputs to fused_op + for (auto &out : op_1->outputs) { + auto result_iter = + std::find_if(op_2->inputs.begin(), op_2->inputs.end(), + [&out](const Node *node) -> bool { return node == out; }); + + if (result_iter == op_2->inputs.end()) { + IR_OP_VAR_LINK(fused_op, out); + } else { + nodes2delete->emplace(out); + } + } +} + +void FuseBatchNormAddActPass::LinkInputsToFuseOp( + Node *op, Node *fused_op, + std::unordered_set *nodes2delete) const { + // if the inputs of the op are outputs of previous op, which means + // these inputs have been added to nodes2delete before, skip the inputs, + // otherwise link the inputs of the op to fused_op + for (auto &in : op->inputs) { + if (nodes2delete->count(in)) { + continue; + } + fused_op->inputs.emplace_back(in); + in->outputs = this->ReplaceNode(op, fused_op, in->outputs); + } +} + +std::vector FuseBatchNormAddActPass::ReplaceNode( + Node *cur_node, Node *new_node, const std::vector &nodes) const { + std::vector new_list(nodes.size()); + bool has_replaced = false; + std::transform(nodes.begin(), nodes.end(), new_list.begin(), + [&](Node *node) -> Node * { + if (node == cur_node) { + has_replaced = true; + return new_node; + } + return node; + }); + PADDLE_ENFORCE_EQ(has_replaced, true, + platform::errors::NotFound("Not found %s in the node list.", + cur_node->Name())); + return new_list; +} + +} // namespace ir +} // namespace framework +} // namespace paddle + +REGISTER_PASS(fuse_bn_add_act_pass, + paddle::framework::ir::FuseBatchNormAddActPass); diff --git a/paddle/fluid/framework/ir/fuse_bn_add_act_pass.h b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.h new file mode 100644 index 0000000000000..243a5b1b8df6d --- /dev/null +++ b/paddle/fluid/framework/ir/fuse_bn_add_act_pass.h @@ -0,0 +1,75 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
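+
+// Usage note: this pass is registered as "fuse_bn_add_act_pass" (see the
+// REGISTER_PASS call at the end of the .cc file) and is driven by
+// BuildStrategy::fuse_bn_add_act_ops_, which defaults to true; see
+// details/build_strategy.cc for where it is appended to the pass pipeline.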
+ +#pragma once + +#include +#include +#include +#include + +#include "paddle/fluid/framework/ir/fuse_pass_base.h" +#include "paddle/fluid/framework/ir/graph.h" +#include "paddle/fluid/framework/ir/graph_pattern_detector.h" +#include "paddle/fluid/framework/ir/pass.h" + +namespace paddle { +namespace framework { +namespace ir { + +/* + * Fuse the BatchNorm, add and activation. + */ +class Graph; +class Node; + +class FuseBatchNormAddActPass : public FusePassBase { + public: + virtual ~FuseBatchNormAddActPass() {} + + protected: + void ApplyImpl(ir::Graph *graph) const override; + + ir::Graph *FuseBatchNormAddAct( + ir::Graph *graph, const std::unordered_set &act_types) const; + + ir::Graph *FuseBatchNormAddActGrad( + ir::Graph *graph, + const std::unordered_set &act_grad_types) const; + + void LinkOutputsToFuseOp( + Node *op_1, Node *op_2, Node *fused_op, + std::unordered_set *nodes2delete) const; + + void LinkInputsToFuseOp(Node *op, Node *fused_op, + std::unordered_set *nodes2delete) const; + + std::vector ReplaceNode(Node *cur_node, Node *new_node, + const std::vector &nodes) const; + + void ReLinkNodes(Graph *graph, Node *op_1, Node *op_2, Node *op_3, + Node *fused_op) const; + Node *CreateFusedBatchNormAddActNode( + Graph *g, const Node *act, const Node *add, const Node *bn, + const std::string &bn_x_n, const std::string &add_y_n, + const std::string &bn_scale_n, const std::string &bn_bias_n, + const std::string &bn_mean_out_n, const std::string &bn_variance_out_n, + const std::string &bn_saved_variance_n, + const std::string &bn_saved_mean_n, const std::string &bn_reserve_space_n, + const std::string &act_out_n) const; +}; + +} // namespace ir +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 3127a3fd8a7fe..5ffaf28fe92f1 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -93,6 +93,7 @@ void GraphPatternDetector::operator()(Graph *graph, auto subgraphs = DetectPatterns(); UniquePatterns(&subgraphs); + SortSubgraphs(&subgraphs); RemoveOverlappedMatch(&subgraphs); ValidateByNodeRole(&subgraphs); @@ -302,6 +303,46 @@ void GraphPatternDetector::UniquePatterns( *subgraphs = result; } +void GraphPatternDetector::SortSubgraphs( + std::vector *subgraphs) { + if (subgraphs->empty()) return; + bool has_bn_add_act = false; + for (auto &subgraph : *subgraphs) { + for (auto &item : subgraph) { + if (item.first->name().find("bn_add_act") != std::string::npos) { + has_bn_add_act = true; + break; + } + } + } + if (!has_bn_add_act) { + return; + } + + std::sort( + subgraphs->begin(), subgraphs->end(), + [](const GraphPatternDetector::subgraph_t &a, + const GraphPatternDetector::subgraph_t &b) { + for (auto &item : a) { + if (item.first->name().find("bn_add_act") != std::string::npos && + item.first->name().find("bn_reserve_space") != + std::string::npos) { + auto it_b = b.find(item.first); + if (it_b != b.end()) { + if (item.second->Name() != it_b->second->Name()) { + return item.second->Name() < it_b->second->Name(); + } else { + return false; + } + } else { + return false; + } + } + } + return false; + }); +} + void GraphPatternDetector::RemoveOverlappedMatch( std::vector *subgraphs) { std::vector result; @@ -1208,6 +1249,151 @@ PDNode *patterns::BatchNormActOneDNN::operator()(const std::string &act_type) { return act_out; } +PDNode *patterns::BatchNormAddAct::operator()( + paddle::framework::ir::PDNode 
*bn_x_var, + std::unordered_set act_types) { + bn_x_var->assert_is_op_input("batch_norm", "X") + ->assert_var_dtype(proto::VarType::FP16); + auto *bn_scale_var = pattern->NewNode(bn_scale_repr()) + ->assert_is_op_input("batch_norm", "Scale"); + auto *bn_bias_var = pattern->NewNode(bn_bias_repr()) + ->assert_is_op_input("batch_norm", "Bias"); + + auto *bn = pattern->NewNode(batch_norm_repr()) + ->assert_is_op("batch_norm") + ->assert_is_not_op_input("MomentumTensor") + ->assert_op_attr("is_test", false) + ->assert_op_attr("use_global_stats", false) + ->assert_op_attr("data_layout", "NHWC"); + + auto *bn_mean_out_var = pattern->NewNode(bn_mean_out_repr()) + ->assert_is_op_output("batch_norm", "MeanOut"); + auto *bn_variance_out_var = + pattern->NewNode(bn_variance_out_repr()) + ->assert_is_op_output("batch_norm", "VarianceOut"); + auto *bn_saved_variance_var = + pattern->NewNode(bn_saved_variance_repr()) + ->assert_is_op_output("batch_norm", "SavedVariance"); + auto *bn_saved_mean_var = + pattern->NewNode(bn_saved_mean_repr()) + ->assert_is_op_output("batch_norm", "SavedMean"); + auto *bn_reserve_space = + pattern->NewNode(bn_reserve_space_repr()) + ->assert_is_op_output("batch_norm", "ReserveSpace"); + auto *bn_out_var = pattern->NewNode(bn_out_repr()) + ->assert_is_op_output("batch_norm", "Y") + ->assert_var_dtype(proto::VarType::FP16); + + bn_out_var->assert_is_op_input("elementwise_add"); + + auto *elewise_add = + pattern->NewNode(elewise_add_repr())->assert_is_op("elementwise_add"); + + auto *elewise_add_in_var = pattern->NewNode(elewise_add_in_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_input("elementwise_add") + ->assert_var_dtype(proto::VarType::FP16); + + auto *elewise_add_out_var = + pattern->NewNode(elewise_add_out_repr()) + ->assert_is_op_output("elementwise_add", "Out") + ->assert_has_n_outputs(1); + + elewise_add_out_var->AsIntermediate()->assert_is_ops_input(act_types); + + auto *act = pattern->NewNode(act_repr())->assert_is_ops(act_types); + + auto *act_out_var = + pattern->NewNode(act_out_repr())->assert_is_ops_output(act_types, "Out"); + + bn->LinksFrom({bn_x_var, bn_scale_var, bn_bias_var}) + .LinksTo({bn_mean_out_var, bn_variance_out_var, bn_saved_variance_var, + bn_saved_mean_var, bn_reserve_space, bn_out_var}); + elewise_add->LinksFrom({elewise_add_in_var, bn_out_var}) + .LinksTo({elewise_add_out_var}); + act->LinksFrom({elewise_add_out_var}).LinksTo({act_out_var}); + + return act_out_var; +} + +PDNode *patterns::BatchNormAddActGrad::operator()( + paddle::framework::ir::PDNode *d_act_out_var, + std::unordered_set act_grad_types) { + auto *act_grad = + pattern->NewNode(act_grad_repr())->assert_is_ops(act_grad_types); + auto *elewise_add_grad = pattern->NewNode(elewise_add_grad_repr()) + ->assert_is_op("elementwise_add_grad"); + auto *bn_grad = pattern->NewNode(batch_norm_grad_repr()) + ->assert_is_op("batch_norm_grad") + ->assert_op_attr("use_global_stats", false) + ->assert_op_attr("data_layout", "NHWC"); + + auto *act_out_var = pattern->NewNode(act_out_repr()) + ->assert_is_ops_input(act_grad_types, "Out"); + auto *d_act_x_var = + pattern->NewNode(d_act_x_repr()) + ->assert_is_ops_output(act_grad_types, GradVarName("X")) + ->assert_has_n_outputs(1); // d_act_x + + d_act_x_var->AsIntermediate()->assert_is_op_input("elementwise_add_grad"); + + auto *d_elewise_add_in_var = + pattern->NewNode(d_elewise_add_in_repr()) + ->assert_is_not_ctrl_var() + ->assert_is_op_output("elementwise_add_grad") + ->assert_var_dtype(proto::VarType::FP16); // d_add_in_1 + auto 
*d_bn_out_var =
+      pattern->NewNode(d_bn_out_repr())
+          ->assert_is_not_ctrl_var()
+          ->assert_is_op_output("elementwise_add_grad")
+          ->assert_var_dtype(proto::VarType::FP16);  // d_add_in_2
+
+  d_bn_out_var->assert_is_op_input("batch_norm_grad", GradVarName("Y"));
+
+  auto *bn_x_var = pattern->NewNode(bn_x_repr())
+                       ->assert_is_op_input("batch_norm_grad", "X")
+                       ->assert_var_dtype(proto::VarType::FP16);
+  auto *bn_scale_var = pattern->NewNode(bn_scale_repr())
+                           ->assert_is_op_input("batch_norm_grad", "Scale");
+  auto *bn_bias_var = pattern->NewNode(bn_bias_repr())
+                          ->assert_is_op_input("batch_norm_grad", "Bias");
+  auto *bn_saved_mean_var =
+      pattern->NewNode(bn_saved_mean_repr())
+          ->assert_is_op_input("batch_norm_grad", "SavedMean");
+  auto *bn_saved_variance_var =
+      pattern->NewNode(bn_saved_variance_repr())
+          ->assert_is_op_input("batch_norm_grad", "SavedVariance");
+
+  auto *bn_reserve_space =
+      pattern->NewNode(bn_reserve_space_repr())
+          ->assert_is_op_input("batch_norm_grad", "ReserveSpace");
+  auto *d_bn_x_var =
+      pattern->NewNode(d_bn_x_repr())
+          ->assert_is_not_ctrl_var()
+          ->assert_is_op_output("batch_norm_grad", GradVarName("X"))
+          ->assert_var_dtype(proto::VarType::FP16);
+  auto *d_bn_scale_var =
+      pattern->NewNode(d_bn_scale_repr())
+          ->assert_is_not_ctrl_var()
+          ->assert_is_op_output("batch_norm_grad", GradVarName("Scale"));
+  auto *d_bn_bias_var =
+      pattern->NewNode(d_bn_bias_repr())
+          ->assert_is_not_ctrl_var()
+          ->assert_is_op_output("batch_norm_grad", GradVarName("Bias"));
+
+  act_grad->LinksFrom({d_act_out_var, act_out_var}).LinksTo({d_act_x_var});
+
+  elewise_add_grad->LinksFrom({d_act_x_var})
+      .LinksTo({d_elewise_add_in_var, d_bn_out_var});
+
+  bn_grad
+      ->LinksFrom({bn_x_var, d_bn_out_var, bn_scale_var, bn_bias_var,
+                   bn_saved_mean_var, bn_saved_variance_var, bn_reserve_space})
+      .LinksTo({d_bn_x_var, d_bn_scale_var, d_bn_bias_var});
+
+  return bn_grad;
+}
+
 PDNode *patterns::ElewiseAddAct::operator()(
     paddle::framework::ir::PDNode *ele_x_var,
     std::unordered_set act_types) {
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index c44c7b4059eb0..77a1b03407439 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -294,6 +294,12 @@ class GraphPatternDetector {
   // Remove duplicate patterns.
   void UniquePatterns(std::vector* subgraphs);
 
+  // Sort subgraphs by the specified node so that the removed forward and
+  // backward subgraphs correspond when two subgraphs overlap. Note: this
+  // function is currently only used for bn_add_act; refer to PR28196 for
+  // details.
+  void SortSubgraphs(std::vector* subgraphs);
+
   // Remove overlapped match subgraphs, when overlapped, keep the previous one.
   // The intermediate PDNodes will be removed, so can't shared by multiple
   // patterns.
@@ -685,6 +691,72 @@ struct BatchNormActOneDNN : public PatternBase { PATTERN_DECL_NODE(act_out); }; +// The following pattern is used to fuse batch_norm, elewise_add, and act +// formula: act(bn(x) + z) +// op: batch_norm + elewise_add + act +struct BatchNormAddAct : public PatternBase { + BatchNormAddAct(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "bn_add_act") {} + + PDNode* operator()(PDNode* x, std::unordered_set acts); + + // declare operator node's name + PATTERN_DECL_NODE(batch_norm); + PATTERN_DECL_NODE(elewise_add); + PATTERN_DECL_NODE(act); + // declare variable node's name + // BN inputs + PATTERN_DECL_NODE(bn_scale); + PATTERN_DECL_NODE(bn_bias); + // BN outputs + PATTERN_DECL_NODE(bn_mean_out); + PATTERN_DECL_NODE(bn_variance_out); + PATTERN_DECL_NODE(bn_saved_variance); + PATTERN_DECL_NODE(bn_saved_mean); + PATTERN_DECL_NODE(bn_reserve_space); + PATTERN_DECL_NODE(bn_out); + // Elewise_Add input + PATTERN_DECL_NODE(elewise_add_in); + // Elewise_Add output + PATTERN_DECL_NODE(elewise_add_out); + // ACT output + PATTERN_DECL_NODE(act_out); +}; + +// the backward of act(bn(x) + z) +// op: batch_norm_grad + elewise_add_grad + act_grad +struct BatchNormAddActGrad : public PatternBase { + BatchNormAddActGrad(PDPattern* pattern, const std::string& name_scope) + : PatternBase(pattern, name_scope, "bn_add_act_grad") {} + + // act_grad: in["Out", "Out@GRAD"], out["X@GRAD"] + // elewise_add_grad: in["Out@GRAD"], out["X@GRAD", "Y@GRAD"] + // bn_grad: in["X", "Z", "Y@GRAD", "Scale", "Bias", "SavedMean", + // "SavedVariance", + // "ReserveSpace"], + // out["X@GRAD", "Z@GRAD", "Scale@GRAD", "Bias@GRAD"] + PDNode* operator()(PDNode* x, std::unordered_set act_grad_types); + + // declare operator node's name + PATTERN_DECL_NODE(act_grad); + PATTERN_DECL_NODE(elewise_add_grad); + PATTERN_DECL_NODE(batch_norm_grad); + // declare variable node's name + PATTERN_DECL_NODE(act_out); + PATTERN_DECL_NODE(d_act_x); + PATTERN_DECL_NODE(d_elewise_add_in); + PATTERN_DECL_NODE(d_bn_out); + PATTERN_DECL_NODE(bn_x); + PATTERN_DECL_NODE(bn_scale); + PATTERN_DECL_NODE(bn_bias); + PATTERN_DECL_NODE(bn_saved_mean); + PATTERN_DECL_NODE(bn_saved_variance); + PATTERN_DECL_NODE(bn_reserve_space); + PATTERN_DECL_NODE(d_bn_x); + PATTERN_DECL_NODE(d_bn_scale); + PATTERN_DECL_NODE(d_bn_bias); +}; + // The following patterns are used to fuse elewise_add and act // formula: act(ele_add(x, y)) // op: elementwise_add + act diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc index 5b3ed03bb6419..9f446b48b4728 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cc @@ -186,8 +186,6 @@ void FusedBatchNormAddActGradOp::InferShape( // check input OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FusedBatchNormAddActGradOp"); - OP_INOUT_CHECK(ctx->HasInput("Z"), "Input", "Z", - "FusedBatchNormAddActGradOp"); OP_INOUT_CHECK(ctx->HasInput("Scale"), "Input", "Scale", "FusedBatchNormAddActGradOp"); OP_INOUT_CHECK(ctx->HasInput("SavedMean"), "Input", "SavedMean", diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 7f1d297cda3fa..c92b13b5f5847 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -188,7 +188,6 @@ class FusedBatchNormAddActGradKernel std::string 
act_type = ctx.Attr("act_type");
 
     const auto *x = ctx.Input("X");
-    const auto *z = ctx.Input("Z");
     const auto *y = ctx.Input("Y");
     const auto *d_y = ctx.Input(framework::GradVarName("Y"));
     const auto *scale = ctx.Input("Scale");
diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
index 5c7df96e60dd8..d5e5ae9bda642 100644
--- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
+++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.h
@@ -61,7 +61,6 @@ class FusedBatchNormAddActGradOpMaker : public framework::SingleGradOpMaker {
   void Apply(GradOpPtr op) const override {
     op->SetType(this->ForwardOpType() + "_grad");
     op->SetInput("X", this->Input("X"));
-    op->SetInput("Z", this->Input("Z"));
     op->SetInput("Y", this->Output("Y"));
     op->SetInput(framework::GradVarName("Y"), this->OutputGrad("Y"));
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 8ff7e90065330..736669fa4ef92 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -2500,6 +2500,31 @@ All parameter, weight, gradient are variables in Paddle.
                         build_strategy = static.BuildStrategy()
                         build_strategy.fuse_bn_act_ops = True
                     )DOC")
+      .def_property(
+          "fuse_bn_add_act_ops",
+          [](const BuildStrategy &self) { return self.fuse_bn_add_act_ops_; },
+          [](BuildStrategy &self, bool b) {
+            PADDLE_ENFORCE_NE(self.IsFinalized(), true,
+                              platform::errors::PreconditionNotMet(
+                                  "BuildStrategy has been finalized, cannot be "
+                                  "configured again."));
+            self.fuse_bn_add_act_ops_ = b;
+          },
+          R"DOC((bool, optional): fuse_bn_add_act_ops indicates whether
+                to fuse batch_norm, elementwise_add and activation_op;
+                it may make the execution faster. Default is True
+
+                Examples:
+                    .. code-block:: python
+
+                        import paddle
+                        import paddle.static as static
+
+                        paddle.enable_static()
+
+                        build_strategy = static.BuildStrategy()
+                        build_strategy.fuse_bn_add_act_ops = True
+                    )DOC")
       .def_property(
           "enable_auto_fusion",
           [](const BuildStrategy &self) { return self.enable_auto_fusion_; },
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 101242808b22f..4cd9d9e530d87 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -331,6 +331,7 @@ list(REMOVE_ITEM TEST_OPS test_basic_gru_unit_op)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_api)
 list(REMOVE_ITEM TEST_OPS test_basic_lstm_unit_op)
 list(REMOVE_ITEM TEST_OPS test_fuse_bn_act_pass)
+list(REMOVE_ITEM TEST_OPS test_fuse_bn_add_act_pass)
 list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_mnist)
 list(REMOVE_ITEM TEST_OPS test_imperative_static_runner_while)
 list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op)
@@ -515,6 +516,7 @@ py_test_modules(test_parallel_executor_transformer_auto_growth MODULES test_para
 py_test_modules(test_data_norm_op MODULES test_data_norm_op)
 py_test_modules(test_fuse_bn_act_pass MODULES test_fuse_bn_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000)
+py_test_modules(test_fuse_bn_add_act_pass MODULES test_fuse_bn_add_act_pass ENVS FLAGS_cudnn_deterministic=1 FLAGS_cudnn_batchnorm_spatial_persistent=1 FLAGS_conv_workspace_size_limit=1000)
 
 # NOTE: These unittests will appear NaN steadily in windows CI. After analysis,
After analysis,
# it is found that windows CI will run all the training unittests with the ON_INFER option turned on,
diff --git a/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py
similarity index 85%
rename from python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py
rename to python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py
index 45c27552743d3..316c40971aaac 100644
--- a/python/paddle/fluid/tests/unittests/test_fused_bn_add_act.py
+++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py
@@ -21,6 +21,8 @@
 import paddle.fluid as fluid
 from paddle.fluid import core

+paddle.enable_static()
+

 @unittest.skipIf(not core.is_compiled_with_cuda(),
                  "Paddle core is not compiled with CUDA")
@@ -163,12 +165,16 @@ def check(self, place, use_cuda):
         iters = 5
         batch_size = 16

-        # build_fused_program
+        # build_fused_program: turn on fuse_bn_add_act_ops
         main_program = fluid.Program()
         startup_program = fluid.Program()
-        x, y, loss = self.build_fused_program(main_program, startup_program,
-                                              use_cuda)
+        x, y, loss = self.build_origin_program(main_program, startup_program,
+                                               use_cuda)
         feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
+        build_strategy_fused = fluid.BuildStrategy()
+        build_strategy_fused.fuse_bn_add_act_ops = True
+        binary_fused = fluid.CompiledProgram(main_program).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy_fused)
         train_reader = paddle.batch(
             paddle.dataset.mnist.train(), batch_size=batch_size)
         exe = fluid.Executor(place)
@@ -178,17 +184,16 @@ def check(self, place, use_cuda):
             exe.run(startup_program)
             for _ in range(iters):
                 data = next(train_reader())
-                loss_v = exe.run(main_program,
+                loss_v = exe.run(binary_fused,
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
                 loss_vals_fused.append(loss_v[0][0])

-        # build_origin_program
-        main_program = fluid.Program()
-        startup_program = fluid.Program()
-        x, y, loss = self.build_origin_program(main_program, startup_program,
-                                               use_cuda)
-        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
+        # build_origin_program: turn off fuse_bn_add_act_ops
+        build_strategy = fluid.BuildStrategy()
+        build_strategy.fuse_bn_add_act_ops = False
+        binary = fluid.CompiledProgram(main_program).with_data_parallel(
+            loss_name=loss.name, build_strategy=build_strategy)
         train_reader = paddle.batch(
             paddle.dataset.mnist.train(), batch_size=batch_size)
         loss_vals = []
@@ -197,7 +202,7 @@ def check(self, place, use_cuda):
             exe.run(startup_program)
             for _ in range(iters):
                 data = next(train_reader())
-                loss_v = exe.run(main_program,
+                loss_v = exe.run(binary,
                                  feed=feeder.feed(data),
                                  fetch_list=[loss])
                 loss_vals.append(loss_v[0][0])
@@ -210,6 +215,25 @@ def test_fuse_bn_add_act(self):
         place = fluid.CUDAPlace(0)
         self.check(place, use_cuda=True)

+    def test_fuse_bn_add_act_API(self):
+        # build_fused_program: use fused_bn_add_act python API
+        main_program = fluid.Program()
+        startup_program = fluid.Program()
+        place = fluid.CUDAPlace(0)
+        x, y, loss = self.build_fused_program(
+            main_program, startup_program, use_cuda=True)
+        feeder = fluid.DataFeeder(feed_list=[x, y], place=place)
+        train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=16)
+        exe = fluid.Executor(place)
+        scope = fluid.Scope()
+        with fluid.scope_guard(scope):
+            exe.run(startup_program)
+            for _ in range(5):
+                data = next(train_reader())
+                loss_v = exe.run(main_program,
+                                 feed=feeder.feed(data),
+                                 fetch_list=[loss])
+

 if __name__ == '__main__':
     unittest.main()
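
Putting patch 057 together — the new graph pattern, the fused operator plumbing, and the fuse_bn_add_act_ops build-strategy switch — a minimal end-to-end sketch looks roughly as follows. This is illustrative code, not part of the patch: the toy network and sizes are assumed, and the rewrite only takes effect on CUDA builds.

    import paddle
    import paddle.fluid as fluid

    paddle.enable_static()

    main_program = fluid.Program()
    startup_program = fluid.Program()
    with fluid.program_guard(main_program, startup_program):
        x = fluid.data(name='x', shape=[None, 8], dtype='float32')
        y = fluid.layers.fc(x, size=8)
        z = fluid.layers.fc(x, size=8)
        # batch_norm + elementwise_add + act is the pattern the new pass matches
        out = fluid.layers.relu(fluid.layers.batch_norm(y) + z)
        loss = fluid.layers.mean(out)
        fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

    build_strategy = fluid.BuildStrategy()
    build_strategy.fuse_bn_add_act_ops = True  # the switch added by this patch
    binary = fluid.CompiledProgram(main_program).with_data_parallel(
        loss_name=loss.name, build_strategy=build_strategy)

From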
96ae48b75122c9228ee0473632bdfd46c9d7219b Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 27 Oct 2020 14:20:41 +0800 Subject: [PATCH 058/185] Change CI dockerfile (#28236) --- tools/dockerfile/Dockerfile.ubuntu | 3 + tools/dockerfile/ci_dockerfile.sh | 95 +++++++++++++++++------------- 2 files changed, 58 insertions(+), 40 deletions(-) diff --git a/tools/dockerfile/Dockerfile.ubuntu b/tools/dockerfile/Dockerfile.ubuntu index 4f8b092ceea65..a4d458021ab9c 100644 --- a/tools/dockerfile/Dockerfile.ubuntu +++ b/tools/dockerfile/Dockerfile.ubuntu @@ -207,6 +207,8 @@ RUN wget -q https://launchpad.net/ubuntu/+archive/primary/+sourcefiles/binutils/ cd binutils-2.27 && \ ./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz +RUN apt-get install libprotobuf-dev -y + # Older versions of patchelf limited the size of the files being processed and were fixed in this pr. # https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa # So install a newer version here. @@ -224,4 +226,5 @@ RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \ make -j8 && make install && \ ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache + EXPOSE 22 diff --git a/tools/dockerfile/ci_dockerfile.sh b/tools/dockerfile/ci_dockerfile.sh index eea7bfda9af73..fb9dc2c2659d8 100644 --- a/tools/dockerfile/ci_dockerfile.sh +++ b/tools/dockerfile/ci_dockerfile.sh @@ -1,52 +1,67 @@ #!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
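# The helpers below stamp out concrete CI Dockerfiles from shared templates:
# each one copies a generic Dockerfile, substitutes the base-image placeholder
# with sed, then uses the file's last line number as an anchor to append extra
# RUN steps (gcc 8.2, hadoop, NCCL). A standalone sketch of the same idiom,
# with illustrative file and image names that are not taken from this repo:
#
#   out="Dockerfile.generated"
#   sed "s/BASE_IMAGE/10.1-cudnn7-devel-ubuntu16.04/g" Dockerfile.template >"${out}"
#   last=$(wc -l "${out}" | awk '{print $1}')
#   sed -i "${last}i RUN echo extra-build-step" "${out}"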
+ function make_ubuntu_dockerfile(){ dockerfile_name="Dockerfile.cuda10_cudnn7_gcc82_ubuntu16" - sed 's//10.1-cudnn7-devel-ubuntu16.04/g' ./Dockerfile.ubuntu >${dockerfile_name} - sed -i 's#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g' ${dockerfile_name} - dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'` + sed "s//10.1-cudnn7-devel-ubuntu16.04/g" ./Dockerfile.ubuntu >${dockerfile_name} + sed -i "s#liblzma-dev#liblzma-dev openmpi-bin openmpi-doc libopenmpi-dev#g" ${dockerfile_name} + dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} - sed -i 's##WORKDIR /usr/bin \ - COPY tools/dockerfile/build_scripts /build_scripts \ - RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \ - RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \ - RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \ - RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \ - RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \ - RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \ - ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g' ${dockerfile_name} - sed -i 's#bash /build_scripts/install_nccl2.sh#wget --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb && \ - dpkg -i nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb && \ - apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 --allow-change-held-packages #g' ${dockerfile_name} + sed -i "s##WORKDIR /usr/bin \\ + COPY tools/dockerfile/build_scripts /build_scripts \\ + RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \\ + RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \\ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \\ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \\ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \\ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \\ + ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name} + sed -i "s#bash /build_scripts/install_nccl2.sh#wget --no-proxy https://nccl2-deb.cdn.bcebos.com/nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ + RUN dpkg -i nccl-repo-ubuntu1604-2.7.8-ga-cuda10.1_1-1_amd64.deb \\ + RUN apt-get install -y libnccl2=2.7.8-1+cuda10.1 libnccl-dev=2.7.8-1+cuda10.1 --allow-change-held-packages #g" ${dockerfile_name} } function make_centos_dockerfile(){ dockerfile_name="Dockerfile.cuda9_cudnn7_gcc48_py35_centos6" - sed 's//10.2-cudnn7-devel-centos6/g' Dockerfile.centos >${dockerfile_name} - sed -i 's#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g' ${dockerfile_name} - dockerfile_line=`wc -l ${dockerfile_name}|awk '{print $1}'` - sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so && \ - ln -s /usr/local/lib/libnccl.so /usr/local/cuda/lib64/ && \ - rm -rf /usr/include/NvInfer*" ${dockerfile_name} - sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz && \ - tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} - sed -i 's#RUN bash build_scripts/install_nccl2.sh##g' ${dockerfile_name} - sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-2.7.8-1+cuda10.2.x86_64.rpm && \ - wget --no-check-certificate -q 
https://nccl2-deb.cdn.bcebos.com/libnccl-devel-2.7.8-1+cuda10.2.x86_64.rpm && \ - wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-static-2.7.8-1+cuda10.2.x86_64.rpm && \ - rpm -ivh libnccl-2.7.8-1+cuda10.2.x86_64.rpm && \ - rpm -ivh libnccl-devel-2.7.8-1+cuda10.2.x86_64.rpm && \ - rpm -ivh libnccl-static-2.7.8-1+cuda10.2.x86_64.rpm && rm -f /usr/local/include/nccl.h " ${dockerfile_name} - sed -i 's##WORKDIR /usr/bin \ - COPY tools/dockerfile/build_scripts /build_scripts \ - RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \ - RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \ - RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \ - RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \ - RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \ - RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \ - ENV PATH=/usr/local/gcc-8.2/bin:$PATH #g' ${dockerfile_name} + sed "s//10.2-cudnn7-devel-centos6/g" Dockerfile.centos >${dockerfile_name} + sed -i "s#COPY build_scripts /build_scripts#COPY tools/dockerfile/build_scripts ./build_scripts#g" ${dockerfile_name} + dockerfile_line=$(wc -l ${dockerfile_name}|awk '{print $1}') + sed -i "${dockerfile_line}i RUN ln -s /usr/lib64/libz.so /usr/local/lib/libz.so \\ + RUN ln -s /usr/local/lib/libnccl.so /usr/local/cuda/lib64/ \\ + RUN rm -rf /usr/include/NvInfer*" ${dockerfile_name} + sed -i $"${dockerfile_line}i RUN wget --no-check-certificate -q https://paddle-edl.bj.bcebos.com/hadoop-2.7.7.tar.gz \\ + RUN tar -xzf hadoop-2.7.7.tar.gz && mv hadoop-2.7.7 /usr/local/" ${dockerfile_name} + sed -i "s#RUN bash build_scripts/install_nccl2.sh##g" ${dockerfile_name} + sed -i "${dockerfile_line}i RUN wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-2.7.8-1+cuda10.2.x86_64.rpm \\ + RUN wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-devel-2.7.8-1+cuda10.2.x86_64.rpm \\ + RUN wget --no-check-certificate -q https://nccl2-deb.cdn.bcebos.com/libnccl-static-2.7.8-1+cuda10.2.x86_64.rpm \\ + RUN rpm -ivh libnccl-2.7.8-1+cuda10.2.x86_64.rpm \\ + RUN rpm -ivh libnccl-devel-2.7.8-1+cuda10.2.x86_64.rpm \\ + RUN rpm -ivh libnccl-static-2.7.8-1+cuda10.2.x86_64.rpm && rm -f /usr/local/include/nccl.h " ${dockerfile_name} + sed -i "s##WORKDIR /usr/bin \\ + COPY tools/dockerfile/build_scripts /build_scripts \\ + RUN bash /build_scripts/install_gcc.sh gcc82 \&\& rm -rf /build_scripts \\ + RUN cp gcc gcc.bak \&\& cp g++ g++.bak \&\& rm gcc \&\& rm g++ \\ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc \\ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ \\ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc \\ + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ \\ + ENV PATH=/usr/local/gcc-8.2/bin:\$PATH #g" ${dockerfile_name} } @@ -55,4 +70,4 @@ function main() { make_centos_dockerfile } -main $@ +main "$@" From b1eb28d74b72cce629744cde0b3e17bf6e713489 Mon Sep 17 00:00:00 2001 From: liym27 <33742067+liym27@users.noreply.github.com> Date: Tue, 27 Oct 2020 15:20:51 +0800 Subject: [PATCH 059/185] [Dy2Stat-log] Call warnings.warn() to display the warning-message only once when calling StaticFunc.__call__ or ProgramTranslator().get_output (#28260) --- .../dygraph/dygraph_to_static/program_translator.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 6d9bfc909a1bb..9c3f572eb9748 100644 --- 
a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -20,6 +20,7 @@ import six import textwrap import threading +import warnings import weakref from paddle.fluid import framework @@ -298,7 +299,11 @@ def __call__(self, *args, **kwargs): # 1. call dygraph function directly if not enable `declarative` if not self._program_trans.enable_to_static: - logging_utils.warn( + # NOTE(liym27): + # Here calls `warnings.warn` but not `logging_utils.warn` because by default warnings.warn(message) + # will show up **only once**. StaticFunction.__call__ will run many times, it is appropriate to + # display this warning message only once. + warnings.warn( "The decorator '@paddle.jit.to_static' does NOT work when setting ProgramTranslator.enable to False. " "We will just return dygraph output. If you would like to get static graph output, please call API " "ProgramTranslator.enable(True)") @@ -831,7 +836,9 @@ def func(x): ), "Input dygraph_func is not a callable in ProgramTranslator.get_output" if not self.enable_to_static: - logging_utils.warn( + # Here calls `warnings.warn` but not `logging_utils.warn` because by default warnings.warn(message) + # will show up **only once**. + warnings.warn( "The ProgramTranslator.get_output doesn't work when setting ProgramTranslator.enable to False. " "We will just return dygraph output. " "Please call ProgramTranslator.enable(True) if you would like to get static output." From cdadc8f01948669cb4ed6409435c0492fb4b4c67 Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Tue, 27 Oct 2020 15:43:15 +0800 Subject: [PATCH 060/185] refine temporal_shift_op for performance optimization using gpu kernel config (#28114) --- paddle/fluid/operators/temporal_shift_op.cu | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/temporal_shift_op.cu b/paddle/fluid/operators/temporal_shift_op.cu index a292f16fe20d1..b61d9aeff7d4c 100644 --- a/paddle/fluid/operators/temporal_shift_op.cu +++ b/paddle/fluid/operators/temporal_shift_op.cu @@ -11,6 +11,7 @@ #include "paddle/fluid/operators/temporal_shift_op.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_config.h" namespace paddle { namespace operators { @@ -112,11 +113,11 @@ class TemporalShiftOpCUDAKernel : public framework::OpKernel { T* output_data = output->mutable_data({nt, c, h, w}, ctx.GetPlace()); int pixelNum = nt * chw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 8 : grid_dim; + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum); - KeTemporalShiftFw< - T><<>>( + KeTemporalShiftFw<<>>( input_data, output_data, ntchw, tchw, chw, hw, w, t, c, shift_ratio); } }; @@ -148,11 +149,11 @@ class TemporalShiftGradOpCUDAKernel : public framework::OpKernel { static_cast(0)); int pixelNum = nt * chw; - int grid_dim = (pixelNum + 512 - 1) / 512; - grid_dim = grid_dim > 8 ? 
8 : grid_dim;
+    platform::GpuLaunchConfig config =
+        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), pixelNum);

-    KeTemporalShiftBw<
-        T><<>>(
+    KeTemporalShiftBw<T><<>>(
         output_grad_data, input_grad_data, ntchw, tchw, chw, hw, w, t, c,
         shift_ratio);
   }

From 6905608ceaa9376c61079a6f17f1838d098e9043 Mon Sep 17 00:00:00 2001
From: wangchaochaohu 
Date: Tue, 27 Oct 2020 15:43:49 +0800
Subject: [PATCH 061/185] refine yolo box Op for performance optimization
 (#28155)

---
 paddle/fluid/operators/detection/yolo_box_op.cu | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu
index 01edf7b41b2a8..65dc73ef38323 100644
--- a/paddle/fluid/operators/detection/yolo_box_op.cu
+++ b/paddle/fluid/operators/detection/yolo_box_op.cu
@@ -15,7 +15,7 @@ limitations under the License. */
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/detection/yolo_box_op.h"
 #include "paddle/fluid/operators/math/math_function.h"
-
+#include "paddle/fluid/platform/gpu_launch_config.h"
 namespace paddle {
 namespace operators {

@@ -108,11 +108,11 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel<T> {
     math::SetConstant<platform::CUDADeviceContext, T> set_zero;
     set_zero(dev_ctx, boxes, static_cast<T>(0));
     set_zero(dev_ctx, scores, static_cast<T>(0));
+    platform::GpuLaunchConfig config =
+        platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), n * box_num);

-    int grid_dim = (n * box_num + 512 - 1) / 512;
-    grid_dim = grid_dim > 8 ? 8 : grid_dim;
-
-    KeYoloBoxFw<<>>(
+    KeYoloBoxFw<<>>(
         input_data, imgsize_data, boxes_data, scores_data, conf_thresh,
         anchors_data, n, h, w, an_num, class_num, box_num, input_size_h,
         input_size_w, clip_bbox, scale, bias);

From 495a9ceb955e596a752e754954cb30f811728403 Mon Sep 17 00:00:00 2001
From: wangchaochaohu 
Date: Tue, 27 Oct 2020 20:14:18 +0800
Subject: [PATCH 062/185] fix the input error of size Op (#28272)

---
 python/paddle/fluid/layers/nn.py              |  6 +--
 .../fluid/tests/unittests/test_numel_op.py    |  2 +-
 .../fluid/tests/unittests/test_size_op.py     | 52 +++++++++++++++++++
 3 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py
index ac762944b3a68..d5157abf1a992 100755
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -11363,10 +11363,10 @@ def size(input):
     """

     if in_dygraph_mode():
-        return core.ops.size(x)
+        return core.ops.size(input)
     check_variable_and_dtype(
-        x, 'x', ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'],
-        "size")
+        input, 'input',
+        ['bool', 'float16', 'float32', 'float64', 'int32', 'int64'], "size")
     helper = LayerHelper('size', **locals())
     out = helper.create_variable_for_type_inference(dtype='int64')
     helper.append_op(type='size', inputs={'Input': input}, outputs={'Out': out})
diff --git a/python/paddle/fluid/tests/unittests/test_numel_op.py b/python/paddle/fluid/tests/unittests/test_numel_op.py
index d106484d91e2f..d3b9509795783 100644
--- a/python/paddle/fluid/tests/unittests/test_numel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_numel_op.py
@@ -48,7 +48,7 @@ def init(self):
         self.shape = (0, )


-class TestNumelOoAPI(unittest.TestCase):
+class TestNumelAPI(unittest.TestCase):
     def test_numel_static(self):
         main_program = fluid.Program()
         startup_program = fluid.Program()
diff --git a/python/paddle/fluid/tests/unittests/test_size_op.py b/python/paddle/fluid/tests/unittests/test_size_op.py
index aec63caa005f6..09cd35391bae0 100644
---
a/python/paddle/fluid/tests/unittests/test_size_op.py +++ b/python/paddle/fluid/tests/unittests/test_size_op.py @@ -14,6 +14,8 @@ import unittest import numpy as np +import paddle +import paddle.fluid as fluid from op_test import OpTest @@ -53,5 +55,55 @@ def config(self): self.shape = [2**10] +class TestSizeAPI(unittest.TestCase): + def test_size_static(self): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + shape1 = [2, 1, 4, 5] + shape2 = [1, 4, 5] + x_1 = paddle.fluid.data(shape=shape1, dtype='int32', name='x_1') + x_2 = paddle.fluid.data(shape=shape2, dtype='int32', name='x_2') + input_1 = np.random.random(shape1).astype("int32") + input_2 = np.random.random(shape2).astype("int32") + out_1 = paddle.fluid.layers.size(x_1) + out_2 = paddle.fluid.layers.size(x_2) + exe = paddle.static.Executor(place=paddle.CPUPlace()) + res_1, res_2 = exe.run(feed={ + "x_1": input_1, + "x_2": input_2, + }, + fetch_list=[out_1, out_2]) + assert (np.array_equal( + res_1, np.array([np.size(input_1)]).astype("int64"))) + assert (np.array_equal( + res_2, np.array([np.size(input_2)]).astype("int64"))) + + def test_size_imperative(self): + paddle.disable_static(paddle.CPUPlace()) + input_1 = np.random.random([2, 1, 4, 5]).astype("int32") + input_2 = np.random.random([1, 4, 5]).astype("int32") + x_1 = paddle.to_tensor(input_1) + x_2 = paddle.to_tensor(input_2) + out_1 = paddle.fluid.layers.size(x_1) + out_2 = paddle.fluid.layers.size(x_2) + assert (np.array_equal(out_1.numpy().item(0), np.size(input_1))) + assert (np.array_equal(out_2.numpy().item(0), np.size(input_2))) + paddle.enable_static() + + def test_error(self): + main_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(main_program, startup_program): + + def test_x_type(): + shape = [1, 4, 5] + input_1 = np.random.random(shape).astype("int32") + out_1 = paddle.fluid.layers.size(input_1) + + self.assertRaises(TypeError, test_x_type) + + if __name__ == '__main__': + paddle.enable_static() unittest.main() From 7fcb32ddf3d1e6bee30902639ce1b0d8858dc320 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Tue, 27 Oct 2020 08:01:31 -0500 Subject: [PATCH 063/185] fill_constant op supports NINF (#28270) --- paddle/fluid/operators/fill_constant_op.h | 2 ++ .../paddle/fluid/tests/unittests/test_fill_constant_op.py | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index 239083f88d9c6..cce28cae97500 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -50,6 +50,8 @@ class FillConstantKernel : public framework::OpKernel { // handle NaN/Inf first, which cannot be read from stream. 
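    // The branch below matches the special literals verbatim: the value
    // arrives through the string attribute "str_value", and the generic
    // stringstream parse in the else branch cannot round-trip inf/nan.
    // The new "-inf" case is what a Python-side np.NINF ends up as (see
    // the test_ninf unit test added further down in this patch).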
if (str_value == "inf") { value = static_cast(std::numeric_limits::infinity()); + } else if (str_value == "-inf") { + value = static_cast(-std::numeric_limits::infinity()); } else if (str_value == "nan") { value = static_cast(std::numeric_limits::quiet_NaN()); } else { diff --git a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py index babfcdb9040df..c305f71aa5365 100644 --- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py +++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py @@ -340,6 +340,12 @@ def test_inf(self): res = fluid.layers.fill_constant([1], 'float32', np.inf) self.assertTrue(np.isinf(res.numpy().item(0))) + def test_ninf(self): + with fluid.dygraph.guard(): + res = fluid.layers.fill_constant([1], 'float32', np.NINF) + self.assertTrue(np.isinf(res.numpy().item(0))) + self.assertEqual(np.NINF, res.numpy().item(0)) + class TestFillConstantOpError(unittest.TestCase): def test_errors(self): From 41d26a828790e578291875ce4b0245450fc7f5ed Mon Sep 17 00:00:00 2001 From: wangxinxin08 <69842442+wangxinxin08@users.noreply.github.com> Date: Wed, 28 Oct 2020 10:02:55 +0800 Subject: [PATCH 064/185] update matrix nms op to api 2.0 (#28265) * update matrix nms op to api 2.0 * modify code according to review --- .../operators/detection/matrix_nms_op.cc | 24 ++++++++++++++++++- paddle/fluid/pybind/op_function_generator.cc | 1 + .../tests/unittests/test_matrix_nms_op.py | 3 ++- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/detection/matrix_nms_op.cc b/paddle/fluid/operators/detection/matrix_nms_op.cc index f7d45bc85bf6b..713c2dc7fe9c1 100644 --- a/paddle/fluid/operators/detection/matrix_nms_op.cc +++ b/paddle/fluid/operators/detection/matrix_nms_op.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. limitations under the License. 
*/

#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/detection/nms_util.h"

namespace paddle {
namespace operators {

@@ -59,6 +60,9 @@ class MatrixNMSOp : public framework::OperatorWithKernel {
     }
     ctx->SetOutputDim("Out", {box_dims[1], box_dims[2] + 2});
     ctx->SetOutputDim("Index", {box_dims[1], 1});
+    if (ctx->HasOutput("RoisNum")) {
+      ctx->SetOutputDim("RoisNum", {-1});
+    }
     if (!ctx->IsRuntime()) {
       ctx->SetLoDLevel("Out", std::max(ctx->GetLoDLevel("BBoxes"), 1));
       ctx->SetLoDLevel("Index", std::max(ctx->GetLoDLevel("BBoxes"), 1));
@@ -259,8 +263,10 @@ class MatrixNMSKernel : public framework::OpKernel<T> {
     std::vector<size_t> offsets = {0};
     std::vector<T> detections;
     std::vector<int> indices;
+    std::vector<int> num_per_batch;
     detections.reserve(out_dim * num_boxes * batch_size);
     indices.reserve(num_boxes * batch_size);
+    num_per_batch.reserve(batch_size);
     for (int i = 0; i < batch_size; ++i) {
       scores_slice = scores->Slice(i, i + 1);
       scores_slice.Resize({score_dims[1], score_dims[2]});
@@ -272,6 +278,7 @@ class MatrixNMSKernel : public framework::OpKernel<T> {
           background_label, nms_top_k, keep_top_k, normalized, score_threshold,
           post_threshold, use_gaussian, gaussian_sigma);
       offsets.push_back(offsets.back() + num_out);
+      num_per_batch.emplace_back(num_out);
     }

     int64_t num_kept = offsets.back();
@@ -285,6 +292,12 @@ class MatrixNMSKernel : public framework::OpKernel<T> {
       std::copy(indices.begin(), indices.end(), index->data<int>());
     }

+    if (ctx.HasOutput("RoisNum")) {
+      auto* rois_num = ctx.Output<Tensor>("RoisNum");
+      rois_num->mutable_data<int>({batch_size}, ctx.GetPlace());
+      std::copy(num_per_batch.begin(), num_per_batch.end(),
+                rois_num->data<int>());
+    }
     framework::LoD lod;
     lod.emplace_back(offsets);
     outs->set_lod(lod);
@@ -355,6 +368,8 @@ class MatrixNMSOpMaker : public framework::OpProtoAndCheckerMaker {
              "(LoDTensor) A 2-D LoDTensor with shape [No, 1] represents the "
              "index of selected bbox. The index is the absolute index cross "
              "batches.");
+    AddOutput("RoisNum", "(Tensor), Number of RoIs in each image.")
+        .AsDispensable();
     AddComment(R"DOC(
This operator does multi-class matrix non maximum suppression (NMS) on batched
boxes and scores.
@@ -369,7 +384,9 @@ This operator supports multi-class and batched inputs. It
applies NMS independently for each class. The output is a 2-D LoDTensor, for each
image, the offsets in the first dimension of the LoDTensor are called LoD, the number of
offsets is N + 1, where N is the batch size. If LoD[i + 1] - LoD[i] == 0,
-means there is no detected bbox for this image.
+means there is no detected bbox for this image. Now this operator has one more
+output, which is RoisNum. The size of RoisNum is N, RoisNum[i] means the number of
+detected bboxes for this image.
For more information on Matrix NMS, please refer to: https://arxiv.org/abs/2003.10152 @@ -387,3 +404,8 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(matrix_nms, ops::MatrixNMSKernel, ops::MatrixNMSKernel); +REGISTER_OP_VERSION(matrix_nms) + .AddCheckpoint( + R"ROC(Upgrade matrix_nms: add a new output [RoisNum].)ROC", + paddle::framework::compatible::OpVersionDesc().NewOutput( + "RoisNum", "The number of RoIs in each image.")); diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 8288f1852c27b..7f2736a9b1d41 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -74,6 +74,7 @@ std::map> op_outs_map = { {"unique", {"Out", "Index", "Indices", "Counts"}}, {"generate_proposals", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, {"collect_fpn_proposals", {"FpnRois", "RoisNum"}}, + {"matrix_nms", {"Out", "Index", "RoisNum"}}, {"distribute_fpn_proposals", {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, diff --git a/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py b/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py index cf756ae838448..2bbacc316f6e6 100644 --- a/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_matrix_nms_op.py @@ -201,7 +201,8 @@ def setUp(self): self.inputs = {'BBoxes': boxes, 'Scores': scores} self.outputs = { 'Out': (nmsed_outs, [lod]), - 'Index': (index_outs[:, None], [lod]) + 'Index': (index_outs[:, None], [lod]), + 'RoisNum': np.array(lod).astype('int32') } self.attrs = { 'background_label': 0, From 8f87c7eac496d4a1f96906c45684f7718e36eb36 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Wed, 28 Oct 2020 10:09:48 +0800 Subject: [PATCH 065/185] fix judge bug of errorlevel on cmd (#28271) * fix judge bug of errorlevel * fix some error --- paddle/scripts/paddle_build.bat | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 207651b0f23f3..6725abefa8c2b 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -17,7 +17,7 @@ rem Paddle CI Task On Windows Platform rem ================================================= @ECHO ON -SETLOCAL +setlocal rem -------clean up environment----------- set work_dir=%cd% @@ -56,11 +56,12 @@ if %error_code% NEQ 0 ( goto :mkbuild ) +setlocal enabledelayedexpansion git show-ref --verify --quiet refs/heads/last_pr if %ERRORLEVEL% EQU 0 ( git diff HEAD last_pr --stat --name-only git diff HEAD last_pr --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" - if %ERRORLEVEL% EQU 0 ( + if !ERRORLEVEL! EQU 0 ( rmdir build /s/q ) git branch -D last_pr @@ -218,6 +219,7 @@ goto:eof :cmake_error echo 7 > %cache_dir%\error_code.txt +type %cache_dir%\error_code.txt echo Cmake failed, will exit! exit /b 7 @@ -263,6 +265,7 @@ goto:eof :build_error echo 7 > %cache_dir%\error_code.txt +type %cache_dir%\error_code.txt echo Build Paddle failed, will exit! exit /b 7 @@ -306,6 +309,7 @@ goto:eof :test_whl_pacakage_error echo 1 > %cache_dir%\error_code.txt +type %cache_dir%\error_code.txt echo Test import paddle failed, will exit! 
exit /b 1

@@ -340,7 +344,8 @@ ctest.exe -E "(%disable_ut_quickly%)" --output-on-failure -C Release -j 8 --repe
goto:eof

:unit_test_error
echo 8 > %cache_dir%\error_code.txt
+type %cache_dir%\error_code.txt
for /F %%# in ('wmic os get localdatetime^|findstr 20') do set end=%%#
set end=%end:~4,10%
call :timestamp "%start%" "%end%" "1 card TestCases Total"
@@ -365,6 +370,7 @@ goto:eof

:test_inference_error
echo 1 > %cache_dir%\error_code.txt
+type %cache_dir%\error_code.txt
echo Testing fluid library for inference failed!
exit /b 1
@@ -374,8 +380,10 @@ echo ========================================
echo Step 6. Check whether deleting a unit test ...
echo ========================================

+@ECHO OFF
cd /d %work_dir%\build
-echo set -ex> check_change_of_unittest.sh
+echo set -e> check_change_of_unittest.sh
+echo set +x>> check_change_of_unittest.sh
echo GITHUB_API_TOKEN=%GITHUB_API_TOKEN% >> check_change_of_unittest.sh
echo GIT_PR_ID=%AGILE_PULL_ID% >> check_change_of_unittest.sh
echo BRANCH=%BRANCH%>> check_change_of_unittest.sh
@@ -420,7 +428,6 @@ echo unittest_spec_diff=`python $(pwd)/../tools/diff_unittest.py $(pwd)/UNITTEST
echo if [ "$unittest_spec_diff" != "" ]; then>> check_change_of_unittest.sh
echo # approval_user_list: XiaoguangHu01 46782768,luotao1 6836917,phlrain 43953930,lanxianghit 47554610, zhouwei25 52485244, kolinwei 22165420>> check_change_of_unittest.sh
echo approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000`>> check_change_of_unittest.sh
-echo set +x>> check_change_of_unittest.sh
echo if [ "$approval_line" != "" ]; then>> check_change_of_unittest.sh
echo APPROVALS=`echo ${approval_line} ^|python $(pwd)/../tools/check_pr_approval.py 1 22165420 52485244 6836917`>> check_change_of_unittest.sh
echo echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}">> check_change_of_unittest.sh
@@ -444,11 +451,11 @@ goto:eof

:check_change_of_unittest_error
echo 1 > %cache_dir%\error_code.txt
+type %cache_dir%\error_code.txt
exit /b 1

:timestamp
-echo on
setlocal enabledelayedexpansion
set start=%~1
set dd=%start:~2,2%
@@ -502,6 +509,7 @@ taskkill /f /im rc.exe 2>NUL
wmic process where name="op_function_generator.exe" call terminate 2>NUL
taskkill /f /im python.exe 2>NUL
echo 0 > %cache_dir%\error_code.txt
+type %cache_dir%\error_code.txt
echo Windows CI run successfully!
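rem error_code.txt is the status contract between this script and the CI
rem wrapper: every exit path writes its numeric code there first, and this
rem patch additionally types the file into the console log (presumably so
rem the status stays recoverable when the console output is cut off). A
rem code of 0, written just above, marks a fully successful run.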
exit /b 0 From d932b5618fb4d03d3cc461dd86f84b5549510ec0 Mon Sep 17 00:00:00 2001 From: LiuChiachi <709153940@qq.com> Date: Wed, 28 Oct 2020 10:11:10 +0800 Subject: [PATCH 066/185] set random seed to in test_export_deploy_model (#28274) --- python/paddle/tests/test_model.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/tests/test_model.py b/python/paddle/tests/test_model.py index 1cdb7e4e827a9..a3b33d6f253be 100644 --- a/python/paddle/tests/test_model.py +++ b/python/paddle/tests/test_model.py @@ -541,7 +541,7 @@ def test_summary_error(self): def test_export_deploy_model(self): self.set_seed() - np.random.seed(2020) + np.random.seed(201) for dynamic in [True, False]: paddle.disable_static() if dynamic else None prog_translator = ProgramTranslator() From c11d9b3035e756ca45ae53c9e3ba2f7e303bc7f2 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Wed, 28 Oct 2020 04:04:02 +0100 Subject: [PATCH 067/185] [oneDNN ] conv2d fwd&bwd optimization (#27871) --- .../fluid/operators/mkldnn/conv_mkldnn_op.cc | 88 +++++++++++-------- paddle/fluid/platform/mkldnn_helper.h | 4 + paddle/fluid/platform/mkldnn_reuse.h | 18 ++++ .../mkldnn/test_conv2d_bf16_mkldnn_op.py | 2 + .../unittests/mkldnn/test_conv2d_mkldnn_op.py | 2 + .../mkldnn/test_fusion_gru_bf16_mkldnn_op.py | 2 + .../paddle/fluid/tests/unittests/op_test.py | 10 +++ 7 files changed, 88 insertions(+), 38 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index f44ce8c56733a..b333b42c0142d 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -211,22 +211,8 @@ class ConvMKLDNNHandlerT * ('any') which lets a primitive (convolution in this case) choose * the memory format preferred for best performance */ - // TODO(jczaja): This is workaround to make grad op UT's numerical - // gradient computation proper as this op is called directly without - // fetch op following it , so numercial grad is computed (in python) - // using block formats which will give wrong results - const std::string data_format = ctx.Attr("data_format"); - auto chosen_memory_format = - is_test ? MKLDNNMemoryFormat::any - : platform::data_format_to_memory_format(data_format); - - // Check the format for user's special output - if (chosen_memory_format != MKLDNNMemoryFormat::any) { - if (is_conv3d) { - chosen_memory_format = platform::MKLDNNFormatForSize( - src_tz.size(), chosen_memory_format); - } - } + auto chosen_memory_format = MKLDNNMemoryFormat::any; + auto data_type = mkldnn::memory::data_type::f32; if (ctx.Attr("mkldnn_data_type") == "bfloat16" || std::is_same::value) @@ -351,14 +337,16 @@ class ConvMKLDNNHandlerT std::shared_ptr AcquireResidualMemory( const framework::Tensor* residual_param) { - const T* residual_data = residual_param->data(); + void* residual_data = + residual_param->type() == framework::DataTypeTrait::DataType() + ? 
to_void_cast(residual_param->data()) + : to_void_cast(residual_param->data()); auto user_residual_md = platform::MKLDNNMemDesc( framework::vectorize(residual_param->dims()), framework::ToMKLDNNDataType(residual_param->type()), residual_param->format()); - return this->AcquireMemoryFromPrimitive(user_residual_md, - to_void_cast(residual_data), + return this->AcquireMemoryFromPrimitive(user_residual_md, residual_data, "@user_residual_data_mem_p"); } @@ -973,22 +961,8 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { * the memory format preferred for best performance */ - // TODO(jczaja): Once GRAD NHWC is working then format 'any' - // should be used exclusively. But till forward pass enforce - // NCHW for training we need to have NCHW here as well - // to avoid performance degradation in relu_grad and pool2d_grad - std::string data_format = ctx.Attr("data_format"); - auto chosen_memory_format = - platform::data_format_to_memory_format(data_format); - + auto chosen_memory_format = MKLDNNMemoryFormat::any; weights_format = MKLDNNMemoryFormat::any; - // Check the format for user's special output - if (chosen_memory_format != MKLDNNMemoryFormat::any) { - if (is_conv3d) { - chosen_memory_format = - platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); - } - } auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); @@ -1055,9 +1029,12 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { const size_t size = handler.GetDiffWeightsMemorySize(); filter_grad_data = filter_grad->mutable_data(ctx.GetPlace(), size); + // For convoluition with groups write filter grad into + // oneDNN buffer and then we reorder it into filter_grad tensor auto diff_weights_memory_p = - handler.AcquireDiffWeightsMemoryFromWeightsPrimitive( - reinterpret_cast(filter_grad_data)); + g > 1 ? handler.AcquireDiffWeightsMemoryFromWeightsPrimitive() + : handler.AcquireDiffWeightsMemoryFromWeightsPrimitive( + reinterpret_cast(filter_grad_data)); auto conv_bwd_weights_p = handler.AcquireConvolutionBackwardWeights(); @@ -1072,8 +1049,43 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // in OneDNN groups in convolution are treated as separate dimension // which is not the case in paddlepaddle auto filter_fmt = GetMKLDNNFormat(*diff_weights_memory_p); - filter_grad->set_format(platform::MKLDNNFormatForSize( - g > 1 ? weights_tz.size() - 1 : weights_tz.size(), filter_fmt)); + + // For convolution with groups convert from blocked to NCHW + // otherwise there will be problems in next operators working on this data + if (g > 1) { + memory::data_type in_type = + framework::ToMKLDNNDataType(filter_grad->type()); + // for 3d conv with groups (six dimensional data reorder to goidhw) + // for 2d conv with groups (five dimensional data reorder to goihw) + mkldnn::memory::format_tag out_format = + weights_tz.size() == 6 ? 
mkldnn::memory::format_tag::goidhw + : mkldnn::memory::format_tag::goihw; + const std::string key = + platform::CreateKey(weights_tz, filter_fmt, out_format, in_type); + + platform::ReorderMKLDNNHandler handler(weights_tz, filter_grad->type(), + in_type, dev_ctx, mkldnn_engine, + key); + auto reorder_dst_memory_p = + handler.AcquireDstMemory(filter_grad, out_format, ctx.GetPlace()); + + auto reorder_p = + handler.AcquireReorder(reorder_dst_memory_p, diff_weights_memory_p); + + reorder_p->execute(astream, *diff_weights_memory_p, + *reorder_dst_memory_p); + astream.wait(); + + // So here we have a data in goihw , which can be interpreted as OIHW + // (OIDHW for conv3d) + // because filter_grad shape is set for OIHW (OIDHW for conv3d) + mkldnn::memory::format_tag target_format = + weights_tz.size() == 6 ? mkldnn::memory::format_tag::oidhw + : mkldnn::memory::format_tag::oihw; + filter_grad->set_format(target_format); + } else { + filter_grad->set_format(filter_fmt); + } } if (input_grad) { auto weights_memory_p = handler.AcquireWeightsMemoryFromDataPrimitive( diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index d8dd166f325c8..67b68183cc847 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -289,6 +289,10 @@ inline mkldnn::memory::format_tag GetMKLDNNFormat( strides[3] >= strides[4] && strides[4] >= strides[1]) { return mkldnn::memory::format_tag::Acdeb16a; } + if (strides[0] >= strides[1] && strides[1] >= strides[2] && + strides[2] >= strides[3] && strides[3] >= strides[4]) { + return mkldnn::memory::format_tag::Abcde16a; + } } else if (inner_blks[0] == 16 && inner_idxs[0] == 1) { if (strides[0] >= strides[1] && strides[1] >= strides[2] && strides[2] >= strides[3] && strides[3] >= strides[4]) { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 740ac1d81f8f9..2d9e4333ac95e 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -346,6 +346,18 @@ class MKLDNNHandler { return mem_p; } + std::shared_ptr AcquireMemoryFromPrimitive( + mkldnn::memory::desc md, const std::string& suffix) { + const auto local_key = key_ + suffix; + auto mem_p = + std::static_pointer_cast(dev_ctx_.GetBlob(local_key)); + if (mem_p == nullptr) { + mem_p = std::make_shared(md, engine_); + dev_ctx_.SetBlob(local_key, mem_p); + } + return mem_p; + } + // This incarnation of AcquireMemory can call user function eg. 
custom reorder // or preprocessing routine if needed std::shared_ptr AcquireMemory( @@ -1199,6 +1211,12 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { conv_bwd_weights_pd_->diff_weights_desc(), ptr, "@diff_weights_mem_p"); } + std::shared_ptr AcquireDiffWeightsMemoryFromWeightsPrimitive( + void) { + return this->AcquireMemoryFromPrimitive( + conv_bwd_weights_pd_->diff_weights_desc(), "@diff_weights_mem_p"); + } + std::shared_ptr AcquireDiffDstMemoryFromDataPrimitive( const std::shared_ptr user_memory_p, std::vector& pipeline) { // NOLINT diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py index efd0e95dd384f..7ab738ea577fc 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_bf16_mkldnn_op.py @@ -216,4 +216,6 @@ def init_group(self): if __name__ == '__main__': + from paddle import enable_static + enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py index eb906684f0fb1..50d53864789f3 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_conv2d_mkldnn_op.py @@ -233,4 +233,6 @@ def init_group(self): if __name__ == '__main__': + from paddle import enable_static + enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py index 83b636650ab41..90140a3474fed 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_bf16_mkldnn_op.py @@ -110,4 +110,6 @@ def set_confs(self): if __name__ == "__main__": + from paddle import enable_static + enable_static() unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 649c12ea50f88..a572d556a396b 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -1320,6 +1320,13 @@ def check_grad_with_place(self, cache_list = None if hasattr(self, "cache_name_list"): cache_list = self.cache_name_list + + # oneDNN numeric gradient should use CPU kernel + use_onednn = False + if "use_mkldnn" in op_attrs and op_attrs["use_mkldnn"] == True: + op_attrs["use_mkldnn"] = False + use_onednn = True + self.op = create_op( self.scope, self.op_type, @@ -1328,6 +1335,9 @@ def check_grad_with_place(self, op_attrs, cache_list=cache_list) + if use_onednn: + op_attrs["use_mkldnn"] = True + if no_grad_set is None: no_grad_set = set() else: From 0b678d401bc43c336bdd41143151222fe99c13a1 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Wed, 28 Oct 2020 04:04:39 +0100 Subject: [PATCH 068/185] - sum (#28233) test=develop --- paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 414312fe97ecb..bdff665f0f626 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -80,8 +80,6 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { auto& input0 = in_vars[0]->Get(); in_place = 
(input0.numel() > 0) && (input0.data() == output_data); - MKLDNNMemoryFormat input_format = input0.format(); - for (size_t i = 0; i < in_vars.size(); i++) { auto& input_it = in_vars[i]->Get(); if (input_it.numel() == 0) { @@ -89,6 +87,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } const T* input_data = input_it.data(); + MKLDNNMemoryFormat input_format = input_it.format(); auto src_md = memory::desc(src_tz, memory::data_type::f32, input_format); auto src_mem = memory(src_md, mkldnn_engine, to_void_cast(input_data)); @@ -115,7 +114,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { std::shared_ptr reorder_p; std::shared_ptr target_mem; if (in_place) { - output_format = input_format; + output_format = input0.format(); target_mem.reset( new memory({{src_tz}, memory::data_type::f32, output_format}, mkldnn_engine, output_data)); From b63e0ccb4a029784b38b9cb2d0d963250c0c0fda Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Wed, 28 Oct 2020 11:07:03 +0800 Subject: [PATCH 069/185] fix load check_point bug of LinearWarmup (#28280) --- .../tests/unittests/test_lr_scheduler.py | 50 +++++++++++++------ python/paddle/optimizer/lr.py | 23 ++++++++- 2 files changed, 57 insertions(+), 16 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py index 0cdc413c2f68c..8c6383cd6ef52 100644 --- a/python/paddle/fluid/tests/unittests/test_lr_scheduler.py +++ b/python/paddle/fluid/tests/unittests/test_lr_scheduler.py @@ -284,11 +284,19 @@ def linear_warmup_lr(epoch_num, start_lr, end_lr, verbose=False): - if epoch_num < warmup_steps: + tmp = epoch_num - warmup_steps + if tmp < 0: return start_lr + (end_lr - start_lr) * (float(epoch_num) / float(warmup_steps)) + elif paddle.in_dynamic_mode(): + if tmp < 3: + return 0.5 + elif tmp < 6: + return 0.2 + else: + return 0.1 else: - return learning_rate + return 0.5 def multi_step_lr(epoch_num, @@ -407,6 +415,9 @@ def _test_dygraph(self, python_func, paddle_api, kwarg, place): paddle.disable_static(place) x = np.random.uniform(-1, 1, [10, 10]).astype("float32") linear = paddle.nn.Linear(10, 10) + if paddle_api.__name__ == "LinearWarmup": + kwarg['learning_rate'] = paddle.optimizer.lr.PiecewiseDecay( + [3, 6], [0.5, 0.2, 0.1]) scheduler = paddle_api(**kwarg) adam = paddle.optimizer.Adam( learning_rate=scheduler, parameters=linear.parameters()) @@ -420,12 +431,26 @@ def _test_dygraph(self, python_func, paddle_api, kwarg, place): adam.clear_grad() current_lr = adam.get_lr() expected_lr = python_func(epoch, **kwarg) - if paddle_api.__name__ != "CosineAnnealingDecay": - self.assertEqual(current_lr, expected_lr) - scheduler.step() - else: + if paddle_api.__name__ == "CosineAnnealingDecay": self.assertAlmostEqual(current_lr, expected_lr) scheduler.step(epoch + 1) + elif paddle_api.__name__ == "LinearWarmup": + self.assertAlmostEqual(current_lr, expected_lr) + state_dict = adam.state_dict() + scheduler1 = paddle.optimizer.lr.LinearWarmup(**kwarg) + adam1 = paddle.optimizer.Adam( + learning_rate=scheduler1, parameters=linear.parameters()) + adam1.set_state_dict(state_dict) + self.assertEqual(scheduler.last_epoch, scheduler1.last_epoch) + self.assertEqual(scheduler.last_lr, scheduler1.last_lr) + self.assertEqual(scheduler.learning_rate.last_lr, + scheduler1.learning_rate.last_lr) + self.assertEqual(scheduler.learning_rate.last_epoch, + scheduler1.learning_rate.last_epoch) + scheduler.step() + else: + 
self.assertEqual(current_lr, expected_lr) + scheduler.step() def test_scheduler(self): with self.assertRaises(NotImplementedError): @@ -464,8 +489,7 @@ def test_scheduler(self): "decay_steps": 20, "end_lr": 0, "power": 1.0, - "cycle": False, - "verbose": True + "cycle": False }), (polynomial_lr, paddle.optimizer.lr.PolynomialDecay, { "learning_rate": 0.5, "decay_steps": 20, @@ -475,10 +499,9 @@ def test_scheduler(self): "verbose": False }), (linear_warmup_lr, paddle.optimizer.lr.LinearWarmup, { 'learning_rate': 0.5, - 'warmup_steps': 20, + 'warmup_steps': 10, 'start_lr': 0, - 'end_lr': 0.5, - "verbose": True + 'end_lr': 0.5 }), (exponential_lr, paddle.optimizer.lr.ExponentialDecay, { "learning_rate": 0.5, "gamma": 0.9, @@ -486,8 +509,7 @@ def test_scheduler(self): }), (multi_step_lr, paddle.optimizer.lr.MultiStepDecay, { "learning_rate": 0.5, "milestones": [3, 6, 9, 15, 20], - "gamma": 0.8, - "verbose": True + "gamma": 0.8 }), (step_lr, paddle.optimizer.lr.StepDecay, { "learning_rate": 0.5, "step_size": 2, @@ -510,7 +532,7 @@ def test_scheduler(self): for place in places: paddle.enable_static() - #self._test_static(python_func, paddle_api, kwarg, place) + self._test_static(python_func, paddle_api, kwarg, place) paddle.disable_static(place) self._test_dygraph(python_func, paddle_api, kwarg, place) paddle.enable_static() diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 051d3cf18f9f0..80b4b2a9d0562 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -365,7 +365,6 @@ def __init__(self, boundaries, values, last_epoch=-1, verbose=False): last_epoch=last_epoch, verbose=verbose) def get_lr(self): - for i in range(len(self.boundaries)): if self.last_epoch < self.boundaries[i]: return self.values[i] @@ -750,14 +749,34 @@ def __init__(self, end_lr, start_lr) super(LinearWarmup, self).__init__(start_lr, last_epoch, verbose) + def state_dict(self): + """ + Returns the state of the LinearWarmup scheduler as a :class:`dict`. + + It is a subset of ``self.__dict__`` . + """ + state_dict = super(LinearWarmup, self).state_dict() + if isinstance(self.learning_rate, LRScheduler): + state_dict["LinearWarmup_LR"] = self.learning_rate.state_dict() + return state_dict + + def set_state_dict(self, state_dict): + """ + Loads state_dict for LinearWarmup scheduler. 
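
        A minimal save/restore sketch (``linear``, the wrapped ``decay``
        scheduler and the ``adam`` optimizer are assumed to be constructed
        as in the unit test above):

        .. code-block:: python

            state = adam.state_dict()  # now also carries "LinearWarmup_LR"
            scheduler2 = paddle.optimizer.lr.LinearWarmup(
                learning_rate=decay, warmup_steps=10, start_lr=0, end_lr=0.5)
            adam2 = paddle.optimizer.Adam(
                learning_rate=scheduler2, parameters=linear.parameters())
            adam2.set_state_dict(state)  # restores wrapper and wrapped state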
+ """ + super(LinearWarmup, self).set_state_dict(state_dict) + if isinstance(self.learning_rate, LRScheduler): + self.learning_rate.set_state_dict(state_dict["LinearWarmup_LR"]) + def get_lr(self): if self.last_epoch < self.warmup_steps: return (self.end_lr - self.start_lr) * float( self.last_epoch) / float(self.warmup_steps) + self.start_lr else: if isinstance(self.learning_rate, LRScheduler): + lr_value = self.learning_rate() self.learning_rate.step() - return self.learning_rate() + return lr_value return self.learning_rate From 8f83d5d8759925ea17dac2e00c10329e774af3d4 Mon Sep 17 00:00:00 2001 From: pangyoki Date: Wed, 28 Oct 2020 11:36:10 +0800 Subject: [PATCH 070/185] fix AMP auto_cast and grad_scaler En doc (#28177) * fix AMP auto_cast and grad_scaler En doc * fix indentation problem * change Conv2d to Conv2D --- python/paddle/amp/auto_cast.py | 20 ++++++-- python/paddle/amp/grad_scaler.py | 86 ++++++++++++++++---------------- 2 files changed, 60 insertions(+), 46 deletions(-) diff --git a/python/paddle/amp/auto_cast.py b/python/paddle/amp/auto_cast.py index 63c7d999fde77..441bc31b93684 100644 --- a/python/paddle/amp/auto_cast.py +++ b/python/paddle/amp/auto_cast.py @@ -23,13 +23,17 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None): If enabled, the input data type (float32 or float16) of each operator is decided by autocast algorithm for better performance. - Commonly, it is used together with `AmpScaler` to achieve Auto-Mixed-Precision in + Commonly, it is used together with `GradScaler` to achieve Auto-Mixed-Precision in imperative mode. Args: enable(bool, optional): Enable auto-mixed-precision or not. Default is True. - custom_white_list(set|list, optional): The custom white_list. - custom_black_list(set|list, optional): The custom black_list. + custom_white_list(set|list, optional): The custom white_list. It's the set of ops that support + fp16 calculation and are considered numerically-safe and performance-critical. These ops + will be converted to fp16. + custom_black_list(set|list, optional): The custom black_list. The set of ops that support fp16 + calculation and are considered numerically-dangerous and whose effects may also be + observed in downstream ops. These ops will not be converted to fp16. Examples: @@ -48,5 +52,15 @@ def auto_cast(enable=True, custom_white_list=None, custom_black_list=None): conv = conv2d(data) print(conv.dtype) # FP32 + with paddle.amp.auto_cast(custom_black_list={'conv2d'}): + conv = conv2d(data) + print(conv.dtype) # FP32 + + a = paddle.rand([2,3]) + b = paddle.rand([2,3]) + with paddle.amp.auto_cast(custom_white_list={'elementwise_add'}): + c = a + b + print(c.dtype) # FP16 + """ return amp_guard(enable, custom_white_list, custom_black_list) diff --git a/python/paddle/amp/grad_scaler.py b/python/paddle/amp/grad_scaler.py index e3cd05dcb30a8..5ae04042c87ce 100644 --- a/python/paddle/amp/grad_scaler.py +++ b/python/paddle/amp/grad_scaler.py @@ -19,12 +19,12 @@ class GradScaler(AmpScaler): """ - GradScaler is used for Auto-Mixed-Precision training/inferring in dynamic graph - mode. It controls the scaling of loss, helps avoiding numerical overflow. + GradScaler is used for Auto-Mixed-Precision training in dynamic graph mode. + It controls the scaling of loss, helps avoiding numerical overflow. The object of this class has two methods `scale()`, `minimize()`. `scale()` is used to multiply the loss by a scale ratio. - `minimize()` is similar as `Optimizer.minimize()`, performs parameters updating. 
+ `minimize()` is similar as `optimizer.minimize()`, performs parameters updating. Commonly, it is used together with `paddle.amp.auto_cast` to achieve Auto-Mixed-Precision in dynamic graph mode. @@ -42,24 +42,24 @@ class GradScaler(AmpScaler): accumulated steps with nan or inf gradients. Default is 2. use_dynamic_loss_scaling(bool, optional): Whether to use dynamic loss scaling. If False, fixed loss_scaling is used. If True, the loss scaling is updated dynamicly. Default is True. Returns: - An AmpScaler object. + An GradScaler object. Examples: - .. code-block:: python + .. code-block:: python - import paddle + import paddle - model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) - optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) - scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - data = paddle.rand([10, 3, 32, 32]) - with paddle.amp.auto_cast(): - conv = model(data) - loss = paddle.mean(conv) - scaled = scaler.scale(loss) # scale the loss - scaled.backward() # do backward - scaler.minimize(optimizer, scaled) # update parameters + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.minimize(optimizer, scaled) # update parameters """ def __init__(self, @@ -68,7 +68,7 @@ def __init__(self, incr_ratio=2.0, decr_ratio=0.5, incr_every_n_steps=1000, - decr_every_n_nan_or_inf=1, + decr_every_n_nan_or_inf=2, use_dynamic_loss_scaling=True): super(GradScaler, self).__init__(enable, init_loss_scaling, incr_ratio, decr_ratio, incr_every_n_steps, @@ -88,24 +88,24 @@ def scale(self, var): Examples: .. code-block:: python - import paddle - - model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) - optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) - scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - data = paddle.rand([10, 3, 32, 32]) - with paddle.amp.auto_cast(): - conv = model(data) - loss = paddle.mean(conv) - scaled = scaler.scale(loss) # scale the loss - scaled.backward() # do backward - scaler.minimize(optimizer, scaled) # update parameters + import paddle + + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.minimize(optimizer, scaled) # update parameters """ return super(GradScaler, self).scale(var) def minimize(self, optimizer, *args, **kwargs): """ - This function is similar as `Optimizer.minimize()`, which performs parameters updating. + This function is similar as `optimizer.minimize()`, which performs parameters updating. If the scaled gradients of parameters contains NAN or INF, the parameters updating is skipped. Otherwise, it first unscales the scaled gradients of parameters, then updates the parameters. @@ -115,22 +115,22 @@ def minimize(self, optimizer, *args, **kwargs): Args: optimizer(Optimizer): The optimizer used to update parameters. args: Arguments, which will be forward to `optimizer.minimize()`. 
- kwargs: Keyword arguments, which will be forward to `Optimizer.minimize()`. + kwargs: Keyword arguments, which will be forward to `optimizer.minimize()`. Examples: .. code-block:: python - import paddle - - model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) - optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) - scaler = paddle.amp.GradScaler(init_loss_scaling=1024) - data = paddle.rand([10, 3, 32, 32]) - with paddle.amp.auto_cast(): - conv = model(data) - loss = paddle.mean(conv) - scaled = scaler.scale(loss) # scale the loss - scaled.backward() # do backward - scaler.minimize(optimizer, scaled) # update parameters + import paddle + + model = paddle.nn.Conv2D(3, 2, 3, bias_attr=True) + optimizer = paddle.optimizer.SGD(learning_rate=0.01, parameters=model.parameters()) + scaler = paddle.amp.GradScaler(init_loss_scaling=1024) + data = paddle.rand([10, 3, 32, 32]) + with paddle.amp.auto_cast(): + conv = model(data) + loss = paddle.mean(conv) + scaled = scaler.scale(loss) # scale the loss + scaled.backward() # do backward + scaler.minimize(optimizer, scaled) # update parameters """ return super(GradScaler, self).minimize(optimizer, *args, **kwargs) From 4dc8c44ba163733eddf9edbae36b93a4f7374501 Mon Sep 17 00:00:00 2001 From: Chengmo Date: Wed, 28 Oct 2020 14:12:55 +0800 Subject: [PATCH 071/185] =?UTF-8?q?=E3=80=90Paddle.Fleet=E3=80=91Fix=20fle?= =?UTF-8?q?etrun=20heter=20(#28252)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fix fleetrun heter ps on paddlecloud --- python/paddle/distributed/fleet/launch_utils.py | 11 ++++++----- .../meta_optimizers/parameter_server_optimizer.py | 14 ++++++++++++++ .../fleet/parameter_server/ir/trainer_pass.py | 8 ++++---- .../test_dist_fleet_a_sync_optimizer_auto.py | 1 + .../test_dist_fleet_a_sync_optimizer_auto_async.py | 1 + .../test_dist_fleet_a_sync_optimizer_auto_geo.py | 2 +- .../test_dist_fleet_a_sync_optimizer_geo.py | 4 ++-- 7 files changed, 29 insertions(+), 12 deletions(-) diff --git a/python/paddle/distributed/fleet/launch_utils.py b/python/paddle/distributed/fleet/launch_utils.py index ec4b0342f2414..2ae5747af9e7c 100644 --- a/python/paddle/distributed/fleet/launch_utils.py +++ b/python/paddle/distributed/fleet/launch_utils.py @@ -603,7 +603,7 @@ def cloud_ps_heter_env_set(args): avilable_ports = os.getenv("TRAINER_PORTS", "").split(",") assert len( avilable_ports - ) > 3, "set paddle_ports_num >= 2 in config.ini for paddlecloud job submit" + ) >= 2, "set paddle_ports_num >= 2 in config.ini for paddlecloud job submit" # hard code for paddlecloud custom-framework trainers_num = len(paddle_pserver_endpoints.split(",")) @@ -894,7 +894,7 @@ def start_pod_server(self, args, pod): "TRAINING_ROLE": "PSERVER", "PADDLE_TRAINERS_NUM": str(self.worker_num), "POD_IP": cur_server.endpoint.split(":")[0], - "PADDLE_WITH_GLOO": "1", + "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")), "PADDLE_GLOO_RENDEZVOUS": "3", "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, "PADDLE_GLOO_HTTP_ENDPOINT": self.http_port @@ -958,7 +958,7 @@ def start_pod_worker(self, args, pod): self.heter_worker_endpoints, "TRAINING_ROLE": "TRAINER", "PADDLE_TRAINER_ID": str(cur_worker.rank), - "PADDLE_WITH_GLOO": "1", + "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")), "PADDLE_GLOO_RENDEZVOUS": "3", "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, "FLAGS_selected_gpus": "0", @@ -1014,7 +1014,8 @@ def start_pod_heter_worker(self, args, pod): elif fluid.core.is_compiled_with_xpu(): 
heter_device_num = fluid.core.get_xpu_device_count() device_list = [str(x) for x in range(0, heter_device_num)] - assert heter_device_num != 0 + if heter_device_num == 0: + return for idx, cur_heter_worker in enumerate(pod.heter_workers): device_id = str(device_list[idx % heter_device_num]) @@ -1027,7 +1028,7 @@ def start_pod_heter_worker(self, args, pod): "TRAINING_ROLE": "HETER_TRAINER", "PADDLE_TRAINERS_NUM": str(self.worker_num), "POD_IP": cur_heter_worker.endpoint.split(":")[0], - "PADDLE_WITH_GLOO": "1", + "PADDLE_WITH_GLOO": str(os.getenv("PADDLE_WITH_GLOO", "1")), "PADDLE_GLOO_RENDEZVOUS": "3", "PADDLE_GLOO_FS_PATH": self.gloo_rendezvous_dir, "FLAGS_selected_gpus": "0", diff --git a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py index 83345cb6f623e..10b0c82c0eef9 100644 --- a/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py +++ b/python/paddle/distributed/fleet/meta_optimizers/parameter_server_optimizer.py @@ -16,7 +16,9 @@ from paddle.fluid import core import subprocess import re +import os import platform +from ..base.private_helper_function import wait_server_ready class ParameterServerOptimizer(MetaOptimizerBase): @@ -96,6 +98,18 @@ def _build_trainer_programs(self, compiled_config): compiled_config.set_origin_ps_main_program(_main) compiled_config.set_origin_ps_startup_program(_startup) + launch_barrier = self.user_defined_strategy.a_sync_configs[ + "launch_barrier"] + launch_barrier_flag = int(os.getenv("FLAGS_LAUNCH_BARRIER", "1")) + if launch_barrier and launch_barrier_flag: + # for trainer wait server ready + wait_server_ready(self.role_maker._get_pserver_endpoints()) + + # for ps-heter mode, wait heter worker ready + if self.role_maker._is_heter_parameter_server_mode and self.role_maker._is_worker( + ): + wait_server_ready(self.role_maker._get_heter_worker_endpoints()) + return _main, _startup def _build_pserver_programs(self, compiled_config): diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py index 3f826da3ae2be..8749b939de22d 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/ir/trainer_pass.py @@ -14,6 +14,7 @@ # limitations under the License. 
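For orientation, the launch barrier introduced in parameter_server_optimizer.py above can be switched off in two ways, per strategy or per environment. A minimal sketch (it mirrors the unit tests later in this patch and is not itself part of the diff):

    import os
    import paddle.distributed.fleet as fleet

    os.environ["FLAGS_LAUNCH_BARRIER"] = "0"  # env-level switch read by the optimizer
    strategy = fleet.DistributedStrategy()
    strategy.a_sync = True
    strategy.a_sync_configs = {"k_steps": 100, "launch_barrier": False}  # config-level switch

With either switch off, _build_trainer_programs skips the wait_server_ready() calls at trainer start.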
from __future__ import print_function +import os import six import collections import warnings @@ -549,11 +550,10 @@ def create_heter_program(program, config, heter_program, heter_ops, "pserver_id": config.get_role_id(), "Fanin": config.get_trainers(), "distributed_mode": config.get_distributed_mode(), - "rpc_get_thread_num": 12, - "rpc_send_thread_num": 12, - "rpc_prefetch_thread_num": 12 + "rpc_get_thread_num": int(os.getenv("CPU_NUM", 32)), + "rpc_send_thread_num": int(os.getenv("CPU_NUM", 32)), + "rpc_prefetch_thread_num": int(os.getenv("CPU_NUM", 32)) } - # append the listen_and_serv op heter_program.global_block().append_op( type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=attrs) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py index b8393f1e28a94..35577c2712169 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto.py @@ -54,6 +54,7 @@ def test_a_sync_optimizer1(self): input=prediction, label=input_y) avg_cost = paddle.fluid.layers.mean(x=cost) + os.environ["FLAGS_LAUNCH_BARRIER"] = "0" strategy = paddle.distributed.fleet.DistributedStrategy() strategy.auto = True optimizer = paddle.fluid.optimizer.Adam(learning_rate=0.01) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py index 49b34f059e8d8..415a8092b1b9b 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py @@ -66,6 +66,7 @@ def test_a_sync_optimizer3(self): input=prediction, label=input_y) avg_cost = paddle.fluid.layers.mean(x=cost) + os.environ["FLAGS_LAUNCH_BARRIER"] = "0" strategy = paddle.distributed.fleet.DistributedStrategy() strategy.auto = True optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py index 334a4e028b2c4..ec975ec1fa806 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py @@ -53,7 +53,7 @@ def test_a_sync_optimizer2(self): cost = paddle.fluid.layers.cross_entropy( input=prediction, label=input_y) avg_cost = paddle.fluid.layers.mean(x=cost) - + os.environ["FLAGS_LAUNCH_BARRIER"] = "0" strategy = paddle.distributed.fleet.DistributedStrategy() strategy.auto = True optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) diff --git a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py index db73069bf7d42..71937f70ef8d4 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py +++ b/python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py @@ -56,7 +56,7 @@ def test_a_sync_optimizer_trainer(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - strategy.a_sync_configs = {"k_steps": 100} + strategy.a_sync_configs = {"k_steps": 100, "launch_barrier": False} optimizer = paddle.fluid.optimizer.SGD(learning_rate=0.01) optimizer = 
fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) @@ -99,7 +99,7 @@ def test_a_sync_optimizer_pserver(self): strategy = paddle.distributed.fleet.DistributedStrategy() strategy.a_sync = True - strategy.a_sync_configs = {"k_steps": 100} + strategy.a_sync_configs = {"k_steps": 100, "launch_barrier": False} optimizer = paddle.optimizer.SGD(learning_rate=0.01) optimizer = fleet.distributed_optimizer(optimizer, strategy=strategy) optimizer.minimize(avg_cost) From 2853f0c4f979b95015466113376448abc2daef4d Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 28 Oct 2020 02:29:04 -0500 Subject: [PATCH 072/185] Set static shape for shape tensor with constant [part 1] (#28275) * set static shape for shape tensor with constant * remove debug code * fix typo * add ut * refine code * refine example --- python/paddle/fluid/layers/nn.py | 1 + python/paddle/fluid/layers/utils.py | 53 ++++++++++++++++++- ...tatic_shape_inferrence_for_shape_tensor.py | 31 +++++++++++ 3 files changed, 84 insertions(+), 1 deletion(-) create mode 100644 python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d5157abf1a992..adde9cbd19ffa 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -15145,6 +15145,7 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0, helper.append_op( type="uniform_random", inputs=inputs, attrs=attrs, outputs={"Out": out}) + utils.try_set_static_shape_tensor(out, shape) return out diff --git a/python/paddle/fluid/layers/utils.py b/python/paddle/fluid/layers/utils.py index 2095c9957e75b..0d278d493bc11 100644 --- a/python/paddle/fluid/layers/utils.py +++ b/python/paddle/fluid/layers/utils.py @@ -17,7 +17,7 @@ import copy import six import numpy as np -from ..framework import Variable +from ..framework import Variable, in_dygraph_mode from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..layer_helper import LayerHelper from sys import version_info @@ -378,3 +378,54 @@ def check_shape(shape): raise TypeError( "All elements in ``shape`` must be integers when it's a list or tuple" ) + + +def try_set_static_shape_tensor(tensor, shape): + """Try to set static shape of tensor from a shape tensor. + + For example, + + import paddle + paddle.enable_static() + data = paddle.static.data(name="x", shape=[-1, 2], dtype='float32') + shape = paddle.shape(data) # shape should be [-1, 2] instead of [-1, -1] + x = paddle.uniform(shape) + print(x.shape) + # (-1, 2) + + """ + if not in_dygraph_mode(): + # static mode, and shape is not all inferred (contains -1) + if -1 in tensor.shape: + if isinstance(shape, Variable): + shape = try_get_constant_shape_from_tensor(shape) + if shape: + tensor.desc.set_shape(shape) + + +def try_get_constant_shape_from_tensor(shape_tensor): + """Try to get shape from a tensor with constant value. 
+ + For example, + + import paddle + paddle.enable_static() + data = paddle.static.data(name="x", shape=[-1, 2], dtype='float32') + shape = paddle.shape(data) # shape should be [-1, 2] instead of [-1, -1] + x = paddle.uniform(shape) + print(x.shape) + # (-1, 2) + + """ + if not in_dygraph_mode(): + try: + if shape_tensor.op is not None: + generate_op = shape_tensor.op + if generate_op.type == 'shape': + var = shape_tensor.block.vars[generate_op.input_arg_names[ + 0]] + return var.shape + except: + return None + + return None diff --git a/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py b/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py new file mode 100644 index 0000000000000..2c6d646baf593 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_static_shape_inferrence_for_shape_tensor.py @@ -0,0 +1,31 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle +import unittest + + +class StaticShapeInferrenceTest(unittest.TestCase): + def test_static_graph(self): + paddle.enable_static() + data = paddle.fluid.layers.data( + name="x", shape=[-1, 2], dtype='float32') + shape = paddle.fluid.layers.shape(data) # shape should be [-1, 2] + x = paddle.fluid.layers.uniform_random(shape) + self.assertEqual(x.shape, data.shape) + paddle.disable_static() + + +if __name__ == '__main__': + unittest.main() From e1e666a05fe257781bd028a1e8b7a4a5332d5286 Mon Sep 17 00:00:00 2001 From: Tao Luo Date: Wed, 28 Oct 2020 15:38:29 +0800 Subject: [PATCH 073/185] fix conv mkldnn build error (#28288) --- paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index b333b42c0142d..5bba3c6d6ed6b 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -191,7 +191,6 @@ class ConvMKLDNNHandlerT UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm, data_dims, strides, ksize); - const bool is_conv3d = strides.size() == 3U; std::transform(dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) { return i - 1; }); From a98c69b6c65a9ed926cdb3ba2cf38dc305fa72dc Mon Sep 17 00:00:00 2001 From: Zhong Hui Date: Wed, 28 Oct 2020 18:34:32 +0800 Subject: [PATCH 074/185] fix dygraph gather api fix dygraph gather api --- python/paddle/fluid/layers/nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index adde9cbd19ffa..760f5ce58bf26 100755 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -8311,7 +8311,7 @@ def gather(input, index, overwrite=True): output = fluid.layers.gather(x, index) """ if in_dygraph_mode(): - return core.ops.gather(input, index, None) + return core.ops.gather(input, index, None, 'overwrite', overwrite) check_variable_and_dtype( 
            input, 'x',

From 6cebd71454e095125739dbb2e4b860f6b9cce3a2 Mon Sep 17 00:00:00 2001
From: chentianyu03
Date: Wed, 28 Oct 2020 18:56:07 +0800
Subject: [PATCH 075/185] add + - * / @ [] operator to ComplexVariable (#28217)

* add + - * / @ [] operator to ComplexVariable, also add unittest

* fix circular reference bug

* fit for py2.7

* remove reverse operators which are not supported now
---
 python/paddle/fluid/dygraph/math_op_patch.py  |  9 +-
 python/paddle/fluid/framework.py              |  3 +
 .../test_complex_elementwise_layers.py        | 13 +++
 .../tests/unittests/test_complex_getitem.py   | 96 +++++++++++++++++++
 .../tests/unittests/test_complex_matmul.py    | 11 +++
 python/paddle/incubate/complex/__init__.py    |  3 +
 .../incubate/complex/tensor_op_patch.py       | 53 ++++++++++
 7 files changed, 187 insertions(+), 1 deletion(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_complex_getitem.py
 create mode 100644 python/paddle/incubate/complex/tensor_op_patch.py

diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py
index 68206f6286085..d1781fdb010e3 100644
--- a/python/paddle/fluid/dygraph/math_op_patch.py
+++ b/python/paddle/fluid/dygraph/math_op_patch.py
@@ -15,7 +15,7 @@
 from __future__ import print_function

 from .. import core
-from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator
+from ..framework import Variable, convert_np_dtype_to_dtype_, _varbase_creator, ComplexVariable
 from ..layers.layer_function_generator import OpProtoHolder
 from . import no_grad

@@ -149,6 +149,13 @@ def _binary_creator_(method_name, reverse=False, scalar_method=None):
     def __impl__(self, other_var):
+        # Tensor and ComplexVariable operator
+        if isinstance(other_var, ComplexVariable):
+            # import paddle inside the closure to avoid a circular import
+            import paddle
+            math_op = getattr(paddle.incubate.complex.tensor, op_type)
+            return math_op(self, other_var)
+
         # FIXME(zjl): elementwise_div between integers cannot be converted to scale,
         # which may lose accuracy. This is a hot fix for release 1.6.
if scalar_method is not None and not ( diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 904622caf45fc..317cae815f48a 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1826,6 +1826,9 @@ def __init__(self, real, imag): self._dtype = "complex128" self._shape = self.real.shape + def __getitem__(self, idx): + return ComplexVariable(self.real[idx], self.imag[idx]) + @property def dtype(self): return self._dtype diff --git a/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py b/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py index 0ced775689e7f..adf597704f59f 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py +++ b/python/paddle/fluid/tests/unittests/test_complex_elementwise_layers.py @@ -47,23 +47,36 @@ def compare(self, x, y): self.assertTrue(np.allclose(self.calc(x, y, "mul", place), x * y)) self.assertTrue(np.allclose(self.calc(x, y, "div", place), x / y)) + def compare_op(self, x, y): + for place in self._places: + with dg.guard(place): + var_x = dg.to_variable(x) + var_y = dg.to_variable(y) + self.assertTrue(var_x + var_y, x + y) + self.assertTrue(var_x - var_y, x - y) + self.assertTrue(var_x * var_y, x * y) + self.assertTrue(var_x / var_y, x / y) + def test_complex_xy(self): x = rand([2, 3, 4, 5]).astype(self._dtype) + 1j * rand( [2, 3, 4, 5]).astype(self._dtype) y = rand([2, 3, 4, 5]).astype(self._dtype) + 1j * rand( [2, 3, 4, 5]).astype(self._dtype) self.compare(x, y) + self.compare_op(x, y) def test_complex_x_real_y(self): x = rand([2, 3, 4, 5]).astype(self._dtype) + 1j * rand( [2, 3, 4, 5]).astype(self._dtype) y = rand([4, 5]).astype(self._dtype) self.compare(x, y) + self.compare_op(x, y) def test_real_x_complex_y(self): x = rand([2, 3, 4, 5]).astype(self._dtype) y = rand([5]).astype(self._dtype) + 1j * rand([5]).astype(self._dtype) self.compare(x, y) + self.compare_op(x, y) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_complex_getitem.py b/python/paddle/fluid/tests/unittests/test_complex_getitem.py new file mode 100644 index 0000000000000..d6b54bbdc4fde --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_complex_getitem.py @@ -0,0 +1,96 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
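To make the new surface concrete before the tests: a binary operator between a plain dygraph Tensor and a ComplexVariable now dispatches to paddle.incubate.complex.tensor, and ComplexVariable gains indexing. A minimal sketch with made-up data (it mirrors the tests in this patch and is not itself part of the diff):

    import numpy as np
    import paddle.fluid as fluid
    import paddle.fluid.dygraph as dg

    x_np = np.random.randn(2, 3) + 1j * np.random.randn(2, 3)
    y_np = np.random.randn(2, 3)
    with dg.guard(fluid.CPUPlace()):
        x = dg.to_variable(x_np)  # ComplexVariable
        y = dg.to_variable(y_np)  # ordinary VarBase
        z = y * x + x             # routed through paddle.incubate.complex.tensor
        row = z[0]                # the new ComplexVariable.__getitem__
        np.testing.assert_allclose(row.numpy(), (y_np * x_np + x_np)[0])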
+ +import unittest +import paddle +import numpy as np +import paddle.fluid as fluid +import paddle.fluid.dygraph as dg + + +class TestComplexGetitemLayer(unittest.TestCase): + def setUp(self): + self._places = [fluid.CPUPlace()] + if fluid.core.is_compiled_with_cuda(): + self._places.append(fluid.CUDAPlace(0)) + + def test_case1(self): + x_np = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4) + x_np_slice = x_np[0] + + for place in self._places: + with dg.guard(place): + x_var = dg.to_variable(x_np) + x_var_slice = x_var[0] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + + def test_case2(self): + x_np = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4) + x_np_slice = x_np[0][1] + + for place in self._places: + with dg.guard(place): + x_var = dg.to_variable(x_np) + x_var_slice = x_var[0][1] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + + def test_case3(self): + x_np = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4) + x_np_slice = x_np[0][1][2] + + for place in self._places: + with dg.guard(place): + x_var = dg.to_variable(x_np) + x_var_slice = x_var[0][1][2] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + + def test_case4(self): + x_np = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4) + x_np_slice = x_np[0][1][0:3] + + for place in self._places: + with dg.guard(place): + x_var = dg.to_variable(x_np) + x_var_slice = x_var[0][1][0:3] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + + def test_case5(self): + x_np = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4) + x_np_slice = x_np[0][1][0:4:2] + + for place in self._places: + with dg.guard(place): + x_var = dg.to_variable(x_np) + x_var_slice = x_var[0][1][0:4:2] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + + def test_case6(self): + x_np = np.random.randn(2, 3, 4) + 1j * np.random.randn(2, 3, 4) + x_np_slice = x_np[0][1:3][0:4:2] + + for place in self._places: + with dg.guard(place): + x_var = dg.to_variable(x_np) + x_var_slice = x_var[0][1:3][0:4:2] + + np.testing.assert_allclose(x_var_slice.numpy(), x_np_slice) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_complex_matmul.py b/python/paddle/fluid/tests/unittests/test_complex_matmul.py index 90d50bdfb27ab..9accbdfca5518 100644 --- a/python/paddle/fluid/tests/unittests/test_complex_matmul.py +++ b/python/paddle/fluid/tests/unittests/test_complex_matmul.py @@ -34,6 +34,15 @@ def compare(self, x, y): np_result = np.matmul(x, y) self.assertTrue(np.allclose(result.numpy(), np_result)) + def compare_op(self, x, y): + for place in self._places: + with dg.guard(place): + x_var = dg.to_variable(x) + y_var = dg.to_variable(y) + result = x_var.matmul(y_var) + np_result = np.matmul(x, y) + self.assertTrue(np.allclose(result.numpy(), np_result)) + def test_complex_xy(self): x = np.random.random( (2, 3, 4, 5)).astype("float32") + 1J * np.random.random( @@ -42,6 +51,7 @@ def test_complex_xy(self): (2, 3, 5, 4)).astype("float32") + 1J * np.random.random( (2, 3, 5, 4)).astype("float32") self.compare(x, y) + self.compare_op(x, y) def test_complex_x(self): x = np.random.random( @@ -49,6 +59,7 @@ def test_complex_x(self): (2, 3, 4, 5)).astype("float32") y = np.random.random((2, 3, 5, 4)).astype("float32") self.compare(x, y) + self.compare_op(x, y) def test_complex_y(self): x = np.random.random((2, 3, 4, 5)).astype("float32") diff --git a/python/paddle/incubate/complex/__init__.py 
b/python/paddle/incubate/complex/__init__.py index 34e46296f5b73..ff61c52ca3640 100644 --- a/python/paddle/incubate/complex/__init__.py +++ b/python/paddle/incubate/complex/__init__.py @@ -13,6 +13,9 @@ # limitations under the License. from . import tensor +from .tensor_op_patch import monkey_patch_math_complex from .tensor import * __all__ = tensor.__all__ + [] + +monkey_patch_math_complex() diff --git a/python/paddle/incubate/complex/tensor_op_patch.py b/python/paddle/incubate/complex/tensor_op_patch.py new file mode 100644 index 0000000000000..eb7dbd2a3bc1a --- /dev/null +++ b/python/paddle/incubate/complex/tensor_op_patch.py @@ -0,0 +1,53 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from ...fluid import framework +from . import tensor + + +def monkey_patch_math_complex(): + # complexVariable do not support scaler type now, so here not contains + # reverse methods, such as "__radd__", "__rsub__", "__rmul__", "__rdiv__", + # "__rtruediv__", "__rmatmul__". + complex_methods = [ + ('__add__', _binary_creator_('__add__', "elementwise_add", False)), + ('__sub__', _binary_creator_('__sub__', "elementwise_sub", False)), + ('__mul__', _binary_creator_('__mul__', "elementwise_mul", False)), + ('__div__', _binary_creator_('__div__', "elementwise_div", False)), + ('__truediv__', _binary_creator_('__truediv__', "elementwise_div", + False)), + ('__matmul__', _binary_creator_('__matmul__', "matmul", False)), + ] + + for method in complex_methods: + method_name = method[0] + method_impl = method[1] + if method_impl: + setattr(framework.ComplexVariable, method_name, method_impl) + + for method in tensor.__all__: + method_impl = getattr(tensor, method) + if method_impl: + setattr(framework.ComplexVariable, method, method_impl) + + +# for binary operator such as elementwise +def _binary_creator_(method_name, op_type, reverse=False): + def __impl__(self, other_var): + math_op = getattr(tensor, op_type) + return math_op(self, other_var) + + __impl__.__name__ = method_name + return __impl__ From 89530384008b023dc1e8c51e5a8e7e710718efff Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 28 Oct 2020 08:50:39 -0500 Subject: [PATCH 076/185] Fix transpose in conv cudnn kernel when addto enabled (#28295) --- paddle/fluid/operators/conv_cudnn_op.cu | 14 ++++++- .../unittests/test_inplace_addto_strategy.py | 39 +++++++++++-------- 2 files changed, 35 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index f8b76f387cc19..3f03df04ea376 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -293,8 +293,12 @@ class CUDNNConvOpKernel : public framework::OpKernel { // ------------------- cudnn conv forward --------------------- ScalingParamType alpha = 1.0f; - ScalingParamType beta = ctx.Attr("use_addto") ? 
1.0f : 0.0f; - VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); + ScalingParamType beta = 0.0f; + + // NOTE(zhiqiu): inplace addto is not supportted in double grad yet. + // ScalingParamType beta = ctx.Attr("use_addto") ? 1.0f : 0.0f; + // VLOG(4) << "Conv: use_addto = " << ctx.Attr("use_addto"); + for (int i = 0; i < groups; i++) { workspace_handle.RunFunc( [&](void* workspace_ptr) { @@ -387,6 +391,12 @@ class CUDNNConvGradOpKernel : public framework::OpKernel { if (input_grad) { ResizeToChannelFirst( ctx, input_grad, &transformed_input_grad_channel); + // NOTE(zhiqiu): If inplace_addto strategy is enabled, we need to copy + // the data of input_grad to transformed_input_grad_channel. + if (ctx.Attr("use_addto")) { + TransToChannelFirst( + ctx, input_grad, &transformed_input_grad_channel); + } } } else { transformed_input_channel.ShareDataWith(*input); diff --git a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py index 0c43d5693456c..b9089448d53f1 100644 --- a/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py +++ b/python/paddle/fluid/tests/unittests/test_inplace_addto_strategy.py @@ -30,22 +30,21 @@ def __init__(self, filter_size, stride=1, groups=1, - act=None, - use_cudnn=False): + data_format="NCHW"): super(ConvBNLayer, self).__init__() - self._conv = fluid.dygraph.Conv2D( - num_channels=num_channels, - num_filters=num_filters, - filter_size=filter_size, + self._conv = paddle.nn.Conv2D( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, stride=stride, padding=(filter_size - 1) // 2, groups=groups, - act=None, bias_attr=False, - use_cudnn=use_cudnn) + data_format=data_format) - self._batch_norm = fluid.dygraph.BatchNorm(num_filters, act=act) + self._batch_norm = paddle.nn.BatchNorm( + num_filters, data_layout=data_format) def forward(self, inputs): y = self._conv(inputs) @@ -53,19 +52,20 @@ def forward(self, inputs): return y -def create_program(): +def create_program(data_format="NCHW"): main = fluid.Program() startup = fluid.Program() with fluid.program_guard(main, startup): x = fluid.data(name='img', shape=[-1, 3, 224, 224]) x.stop_gradient = False + if data_format == "NHWC": + x = paddle.transpose(x, [0, 2, 3, 1]) x = fluid.layers.prelu(x, mode="channel") conv = ConvBNLayer( num_channels=3, num_filters=3, filter_size=1, - act='relu', - use_cudnn=True) + data_format=data_format) y = conv(x) + x loss = fluid.layers.reduce_sum(y) @@ -77,7 +77,7 @@ def create_program(): class TestInplaceAddto(unittest.TestCase): - def test_result(self): + def check_result(self, data_format="NCHW"): def run_program(enable_addto): np.random.seed(10) paddle.seed(10) @@ -85,7 +85,7 @@ def run_program(enable_addto): if fluid.core.is_compiled_with_cuda(): fluid.set_flags({"FLAGS_cudnn_deterministic": True}) fluid.set_flags({"FLAGS_max_inplace_grad_add": 2}) - loss, main, startup, w = create_program() + loss, main, startup, w = create_program(data_format=data_format) place = fluid.CUDAPlace(0) if fluid.core.is_compiled_with_cuda( ) else fluid.CPUPlace() exe = fluid.Executor(place) @@ -98,7 +98,7 @@ def run_program(enable_addto): exe.run(startup) img = np.random.uniform(-128, 128, [8, 3, 224, 224]).astype(np.float32) - for i in range(2): + for i in range(10): res = exe.run(compiled, feed={'img': img}, fetch_list=[loss.name, w.name]) @@ -106,9 +106,16 @@ def run_program(enable_addto): res1, w1 = run_program(True) res2, w2 = run_program(False) - print(res1, res2) + 
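        # Context for the check below: res1/res2 come from running the same
        # program with inplace addto enabled and disabled. The copy added in
        # conv_cudnn_op.cu above (input_grad into the channel-first buffer
        # before accumulation) is what makes the two agree for NHWC as well.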
self.assertTrue(np.array_equal(res1, res2)) + def test_nchw(self): + self.check_result() + + def test_nhwc(self): + self.check_result("NHWC") + if __name__ == "__main__": + paddle.enable_static() unittest.main() From 03511689670ee3d4652fd36d8e610703b3c6fb7b Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Thu, 29 Oct 2020 09:36:34 +0800 Subject: [PATCH 077/185] Fix lr scheduler step error in hapi when use static mode (#28297) * fix lr scheduler * fix code style --- python/paddle/hapi/model.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index ff962fb1c1d5c..aa99d698bc7b2 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -454,7 +454,8 @@ def _run(self, inputs, labels=None): rets.insert(i, feed[name]) # step learning rate scheduler on each batch end - if self.model._optimizer and \ + if self.model._optimizer and self.mode == 'train' and \ + hasattr(self.model._optimizer, '_learning_rate') and \ isinstance(self.model._optimizer._learning_rate, paddle.optimizer.lr.LRScheduler): self.model._optimizer._learning_rate.step() From 842a4e5abd2c766e10368fac1251a16e2c389a00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Thu, 29 Oct 2020 10:01:56 +0800 Subject: [PATCH 078/185] fix analyzer_capi_tester, test=develop (#28289) --- paddle/fluid/inference/tests/api/analyzer_capi_tester.cc | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc index fd20581123c10..0b2be0076fdb1 100644 --- a/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_capi_tester.cc @@ -90,8 +90,6 @@ TEST(PD_AnalysisConfig, profile_mkldnn) { bool quantizer_enable = PD_MkldnnQuantizerEnabled(config); EXPECT_TRUE(quantizer_enable); PD_EnableMkldnnBfloat16(config); - bool bfloat16_enable = PD_MkldnnBfloat16Enabled(config); - EXPECT_TRUE(bfloat16_enable); PD_SetMkldnnCacheCapacity(config, 0); PD_SetModel(config, prog_file.c_str(), params_file.c_str()); PD_DeleteAnalysisConfig(config); From e8f2614da5e57b114efeaceaff8f8488575bd8c4 Mon Sep 17 00:00:00 2001 From: Guanghua Yu <742925032@qq.com> Date: Thu, 29 Oct 2020 10:54:43 +0800 Subject: [PATCH 079/185] Enhance multiclass_nms op to support LoD for dygraph mode (#28276) * Enhance multiclass_nms to support LoD for dygraph mode * fix some error in multiclass_nms * update GetLodFromRoisNum to GetNmsLodFromRoisNum --- .../operators/detection/multiclass_nms_op.cc | 77 ++++++++++- paddle/fluid/pybind/op_function_generator.cc | 2 + .../tests/unittests/test_multiclass_nms_op.py | 122 ++++++++++++++++++ tools/static_mode_white_list.pyc | Bin 0 -> 21082 bytes 4 files changed, 198 insertions(+), 3 deletions(-) create mode 100644 tools/static_mode_white_list.pyc diff --git a/paddle/fluid/operators/detection/multiclass_nms_op.cc b/paddle/fluid/operators/detection/multiclass_nms_op.cc index 0e835a62839b4..7927410ef3786 100644 --- a/paddle/fluid/operators/detection/multiclass_nms_op.cc +++ b/paddle/fluid/operators/detection/multiclass_nms_op.cc @@ -21,6 +21,16 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; +inline std::vector GetNmsLodFromRoisNum(const Tensor* rois_num) { + std::vector rois_lod; + auto* rois_num_data = rois_num->data(); + 
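+  // Build cumulative RoI offsets ("lod") from the per-image counts, e.g.
+  // rois_num = [3, 5] gives rois_lod = [0, 3, 8].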
rois_lod.push_back(static_cast(0)); + for (int i = 0; i < rois_num->numel(); ++i) { + rois_lod.push_back(rois_lod.back() + static_cast(rois_num_data[i])); + } + return rois_lod; +} + class MultiClassNMSOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -321,6 +331,8 @@ class MultiClassNMSKernel : public framework::OpKernel { auto* outs = ctx.Output("Out"); bool return_index = ctx.HasOutput("Index") ? true : false; auto index = ctx.Output("Index"); + bool has_roisnum = ctx.HasInput("RoisNum") ? true : false; + auto rois_num = ctx.Input("RoisNum"); auto score_dims = scores->dims(); auto score_size = score_dims.size(); auto& dev_ctx = ctx.template device_context(); @@ -332,7 +344,12 @@ class MultiClassNMSKernel : public framework::OpKernel { int64_t out_dim = box_dim + 2; int num_nmsed_out = 0; Tensor boxes_slice, scores_slice; - int n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1; + int n = 0; + if (has_roisnum) { + n = score_size == 3 ? batch_size : rois_num->numel(); + } else { + n = score_size == 3 ? batch_size : boxes->lod().back().size() - 1; + } for (int i = 0; i < n; ++i) { std::map> indices; if (score_size == 3) { @@ -341,7 +358,12 @@ class MultiClassNMSKernel : public framework::OpKernel { boxes_slice = boxes->Slice(i, i + 1); boxes_slice.Resize({score_dims[2], box_dim}); } else { - auto boxes_lod = boxes->lod().back(); + std::vector boxes_lod; + if (has_roisnum) { + boxes_lod = GetNmsLodFromRoisNum(rois_num); + } else { + boxes_lod = boxes->lod().back(); + } if (boxes_lod[i] == boxes_lod[i + 1]) { all_indices.push_back(indices); batch_starts.push_back(batch_starts.back()); @@ -380,7 +402,12 @@ class MultiClassNMSKernel : public framework::OpKernel { offset = i * score_dims[2]; } } else { - auto boxes_lod = boxes->lod().back(); + std::vector boxes_lod; + if (has_roisnum) { + boxes_lod = GetNmsLodFromRoisNum(rois_num); + } else { + boxes_lod = boxes->lod().back(); + } if (boxes_lod[i] == boxes_lod[i + 1]) continue; scores_slice = scores->Slice(boxes_lod[i], boxes_lod[i + 1]); boxes_slice = boxes->Slice(boxes_lod[i], boxes_lod[i + 1]); @@ -403,6 +430,15 @@ class MultiClassNMSKernel : public framework::OpKernel { } } } + if (ctx.HasOutput("NmsRoisNum")) { + auto* nms_rois_num = ctx.Output("NmsRoisNum"); + nms_rois_num->mutable_data({n}, ctx.GetPlace()); + int* num_data = nms_rois_num->data(); + for (int i = 1; i <= n; i++) { + num_data[i - 1] = batch_starts[i] - batch_starts[i - 1]; + } + nms_rois_num->Resize({n}); + } framework::LoD lod; lod.emplace_back(batch_starts); @@ -535,6 +571,34 @@ class MultiClassNMS2OpMaker : public MultiClassNMSOpMaker { } }; +class MultiClassNMS3Op : public MultiClassNMS2Op { + public: + MultiClassNMS3Op(const std::string& type, + const framework::VariableNameMap& inputs, + const framework::VariableNameMap& outputs, + const framework::AttributeMap& attrs) + : MultiClassNMS2Op(type, inputs, outputs, attrs) {} + + void InferShape(framework::InferShapeContext* ctx) const override { + MultiClassNMS2Op::InferShape(ctx); + + ctx->SetOutputDim("NmsRoisNum", {-1}); + } +}; + +class MultiClassNMS3OpMaker : public MultiClassNMS2OpMaker { + public: + void Make() override { + MultiClassNMS2OpMaker::Make(); + AddInput("RoisNum", + "(Tensor) The number of RoIs in shape (B)," + "B is the number of images") + .AsDispensable(); + AddOutput("NmsRoisNum", "(Tensor), The number of NMS RoIs in each image") + .AsDispensable(); + } +}; + } // namespace operators } // namespace paddle @@ -551,3 +615,10 
@@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker); REGISTER_OP_CPU_KERNEL(multiclass_nms2, ops::MultiClassNMSKernel, ops::MultiClassNMSKernel); + +REGISTER_OPERATOR( + multiclass_nms3, ops::MultiClassNMS3Op, ops::MultiClassNMS3OpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(multiclass_nms3, ops::MultiClassNMSKernel, + ops::MultiClassNMSKernel); diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 7f2736a9b1d41..cac44173c1772 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -52,6 +52,7 @@ std::map> op_ins_map = { {"hierarchical_sigmoid", {"X", "W", "Label", "PathTable", "PathCode", "Bias"}}, {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}}, + {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}}, }; // NOTE(zhiqiu): Like op_ins_map. @@ -78,6 +79,7 @@ std::map> op_outs_map = { {"distribute_fpn_proposals", {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, + {"multiclass_nms3", {"Out", "NmsRoisNum"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index 34c19b88bcdba..3158d78db63dc 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -571,6 +571,128 @@ def test_scores_Variable(): self.assertRaises(TypeError, test_scores_Variable) +class TestMulticlassNMS3Op(TestMulticlassNMS2Op): + def setUp(self): + self.set_argument() + N = 7 + M = 1200 + C = 21 + BOX_SIZE = 4 + background = 0 + nms_threshold = 0.3 + nms_top_k = 400 + keep_top_k = 200 + score_threshold = self.score_threshold + + scores = np.random.random((N * M, C)).astype('float32') + + scores = np.apply_along_axis(softmax, 1, scores) + scores = np.reshape(scores, (N, M, C)) + scores = np.transpose(scores, (0, 2, 1)) + + boxes = np.random.random((N, M, BOX_SIZE)).astype('float32') + boxes[:, :, 0:2] = boxes[:, :, 0:2] * 0.5 + boxes[:, :, 2:4] = boxes[:, :, 2:4] * 0.5 + 0.5 + + det_outs, lod = batched_multiclass_nms(boxes, scores, background, + score_threshold, nms_threshold, + nms_top_k, keep_top_k) + det_outs = np.array(det_outs) + + nmsed_outs = det_outs[:, :-1].astype('float32') if len( + det_outs) else det_outs + index_outs = det_outs[:, -1:].astype('int') if len( + det_outs) else det_outs + self.op_type = 'multiclass_nms3' + self.inputs = {'BBoxes': boxes, 'Scores': scores} + self.outputs = { + 'Out': (nmsed_outs, [lod]), + 'Index': (index_outs, [lod]), + 'NmsRoisNum': np.array(lod).astype('int32') + } + self.attrs = { + 'background_label': 0, + 'nms_threshold': nms_threshold, + 'nms_top_k': nms_top_k, + 'keep_top_k': keep_top_k, + 'score_threshold': score_threshold, + 'nms_eta': 1.0, + 'normalized': True, + } + + def test_check_output(self): + self.check_output() + + +class TestMulticlassNMS3OpNoOutput(TestMulticlassNMS3Op): + def set_argument(self): + # Here set 2.0 to test the case there is no outputs. 
+ # In practical use, 0.0 < score_threshold < 1.0 + self.score_threshold = 2.0 + + +class TestMulticlassNMS3LoDInput(TestMulticlassNMS2LoDInput): + def setUp(self): + self.set_argument() + M = 1200 + C = 21 + BOX_SIZE = 4 + box_lod = [[1200]] + background = 0 + nms_threshold = 0.3 + nms_top_k = 400 + keep_top_k = 200 + score_threshold = self.score_threshold + normalized = False + + scores = np.random.random((M, C)).astype('float32') + + scores = np.apply_along_axis(softmax, 1, scores) + + boxes = np.random.random((M, C, BOX_SIZE)).astype('float32') + boxes[:, :, 0] = boxes[:, :, 0] * 10 + boxes[:, :, 1] = boxes[:, :, 1] * 10 + boxes[:, :, 2] = boxes[:, :, 2] * 10 + 10 + boxes[:, :, 3] = boxes[:, :, 3] * 10 + 10 + + det_outs, lod = lod_multiclass_nms( + boxes, scores, background, score_threshold, nms_threshold, + nms_top_k, keep_top_k, box_lod, normalized) + + det_outs = np.array(det_outs) + nmsed_outs = det_outs[:, :-1].astype('float32') if len( + det_outs) else det_outs + self.op_type = 'multiclass_nms3' + self.inputs = { + 'BBoxes': (boxes, box_lod), + 'Scores': (scores, box_lod), + 'RoisNum': np.array(box_lod).astype('int32') + } + self.outputs = { + 'Out': (nmsed_outs, [lod]), + 'NmsRoisNum': np.array(lod).astype('int32') + } + self.attrs = { + 'background_label': 0, + 'nms_threshold': nms_threshold, + 'nms_top_k': nms_top_k, + 'keep_top_k': keep_top_k, + 'score_threshold': score_threshold, + 'nms_eta': 1.0, + 'normalized': normalized, + } + + def test_check_output(self): + self.check_output() + + +class TestMulticlassNMS3LoDNoOutput(TestMulticlassNMS3LoDInput): + def set_argument(self): + # Here set 2.0 to test the case there is no outputs. + # In practical use, 0.0 < score_threshold < 1.0 + self.score_threshold = 2.0 + + if __name__ == '__main__': paddle.enable_static() unittest.main() diff --git a/tools/static_mode_white_list.pyc b/tools/static_mode_white_list.pyc new file mode 100644 index 0000000000000000000000000000000000000000..7d2a45c248ce271c1c4fff310505a172339e5eee GIT binary patch literal 21082 zcmeHPb-XNhbw9sffMCJhH3W&g_d;+8kN`n~B)AN7_dC0HxVt->nce%|4eqX`c#Bq` z)Q}dJ3ee(GfhyEps257V=QlIw%nR})-aa1FsV z1=kW>TW}r0bp_WGTwibl!3_mB65LpD6TwXdHxt}ka0|gL1-BC1T5z)9HiAJ55YYJ_Y#~fxVPXw zg8K^25u7WypWyz22MEp+JW%i;!Gi@45u7i0sNi9OhYKDdxIl2B;E{qy2_7wYjNq|? 
zjd7@}4{xAt(GGpUqExE(a{$NL6>I-kz_Dh*dY9oSyP8(a9c;eA#bD!{<`aE{Pq(nK z10|gpd`p_tHQ8x%3B!?{(5HfR2lp8k*LE0J$gV=-Lr?3?^ZHOfap!giw#LjwH>Iun zbbI*9mLjyuO>0V}HLmlc3&xLRyr%kXDa_c~sTY?@Wxl~Ijn4E#=)rS07y0zMl|7T1 zEi_w8&qE8q(5_FX=e@M=iP0?dw^MAAPYF*0zwW8ttmx-#&e7+=kpK03D~%EzOGiC!Mr~S_<)8rc3bCw_pw(T=TG3?OJHR z26t{hTJLf<<7zuBJ=IxMWU7lNkGr1{CeX_BuF33$D#AAFpl$flijtXRoy<46t>`BP z|M2P8Fob=@}`qa0yQ!2>c4$fd8V1%;WG|5X&Jv6u2cWJH+OB;?wXv87P+6^ ze+(ve!yZmdc9 zP-oOccX)Hw>5B`_<<(N@e|^#JWp+2o_A*nRMw{?<;8X-m7`u39ViM5ushPk1NeJU< zm!uCT8Ff2c+gx8?uk)mb`#YL)w7rKk*t}Wa0^5Lg|9uOEo_|VELDC@Of(!3`;d%GX z9`ew0&dn}7_ks)0JOBRKgU`F*Le0Iz|7h;PpQq~=Y}4X&Z9{i|KYZF5r$4NPE?InR zPVcHjT5if%Bho@6yM3oE_R>3|yA#ewg7;MFH78$I Date: Thu, 29 Oct 2020 04:09:54 +0100 Subject: [PATCH 080/185] Add bf16 transpose2, reshape2, concat ops (#28195) --- .../framework/ir/graph_pattern_detector.cc | 3 +- .../cpu_bfloat16_placement_pass_tester.cc | 15 ++- .../operators/mkldnn/concat_mkldnn_op.cc | 1 + .../operators/mkldnn/transpose_mkldnn_op.cc | 5 + paddle/fluid/operators/reshape_op.cc | 4 +- .../mkldnn/test_concat_bf16_mkldnn_op.py | 110 ++++++++++++++++++ .../unittests/mkldnn/test_reshape_bf16_op.py | 62 ++++++++++ .../mkldnn/test_transpose_bf16_mkldnn_op.py | 66 +++++++++++ 8 files changed, 260 insertions(+), 6 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_transpose_bf16_mkldnn_op.py diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 5ffaf28fe92f1..20da74eca4ef8 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2101,7 +2101,8 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set({"conv2d", "fusion_gru"}); + std::unordered_set( + {"concat", "conv2d", "fusion_gru", "reshape2", "transpose2"}); if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc index b9797a4bfcc00..146e29249b7c6 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc @@ -40,6 +40,10 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("X", {inputs[0], inputs[1]}); } else if (type == "pool2d") { op->SetInput("X", {inputs[0]}); + } else if (type == "transpose2") { + op->SetInput("X", {inputs[0]}); + } else if (type == "reshape2") { + op->SetInput("X", {inputs[0]}); } else { FAIL() << "Unexpected operator type."; } @@ -57,8 +61,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, ProgramDesc BuildProgramDesc() { ProgramDesc prog; - for (auto& v : - std::vector({"a", "b", "c", "f", "g", "h", "k", "l"})) { + for (auto& v : std::vector( + {"a", "b", "c", "f", "g", "h", "k", "l", "m", "n", "o", "p"})) { prog.MutableBlock(0)->Var(v); } @@ -68,6 +72,9 @@ ProgramDesc BuildProgramDesc() { SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}); SetOp(&prog, "conv2d", "conv2", {"h"}, {"k"}); SetOp(&prog, "pool2d", "pool2", 
{"k"}, {"l"}); + SetOp(&prog, "concat", "concat2", {"l", "m"}, {"n"}); + SetOp(&prog, "transpose2", "transpose", {"n"}, {"o"}); + SetOp(&prog, "reshape2", "reshape", {"o"}, {"p"}); return prog; } @@ -115,7 +122,7 @@ void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) { } TEST(Bfloat16PlacementPass, enable_all) { - MainTest({"conv2d", "pool2d", "relu", "concat"}, 6); + MainTest({"conv2d", "pool2d", "relu", "concat"}, 7); } TEST(Bfloat16PlacementPass, enabled_conv_and_pool) { @@ -123,7 +130,7 @@ TEST(Bfloat16PlacementPass, enabled_conv_and_pool) { MainTest({"conv2d", "pool2d"}, 3); } -TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(0); } +TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(5); } } // namespace ir } // namespace framework diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index b2815cbdc65b5..bb475b4e54366 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -221,5 +221,6 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL(concat, MKLDNN, ::paddle::platform::CPUPlace, ops::ConcatMKLDNNOpKernel, + ops::ConcatMKLDNNOpKernel, ops::ConcatMKLDNNOpKernel, ops::ConcatMKLDNNOpKernel); diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index 398bdb01b5c24..28cdd8413ab13 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -142,6 +142,11 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, ops::kTransposeMKLDNNINT8, ops::TransposeMKLDNNOpKernel); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + transpose2, MKLDNN, ::paddle::platform::CPUPlace, BF16, + ops::kTransposeMKLDNNFP32, + ops::TransposeMKLDNNOpKernel); + REGISTER_OP_KERNEL(transpose, MKLDNN, ::paddle::platform::CPUPlace, ops::TransposeMKLDNNOpKernel); diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index aa8e39037062e..7cf85420c579b 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -622,7 +622,9 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int8_t, ops::ReshapeKernel, uint8_t, ops::ReshapeKernel, int, ops::ReshapeKernel, int64_t, ops::ReshapeKernel, - bool, ops::ReshapeKernel); + bool, ops::ReshapeKernel, + paddle::platform::bfloat16, ops::ReshapeKernel); + REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel, double, ops::ReshapeGradKernel, int, ops::ReshapeGradKernel, int64_t, diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py new file mode 100644 index 0000000000000..1179556f915be --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_concat_bf16_mkldnn_op.py @@ -0,0 +1,110 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +import struct + +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +from paddle import enable_static + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestConcatBf16Op(OpTest): + def setUp(self): + enable_static() + self.op_type = "concat" + self.use_mkldnn = True + self.mkldnn_data_type = "bfloat16" + self.init_axis() + self.init_shape() + self.init_test_data() + self.inputs = {'X': [('x0', self.x0), ('x1', self.x1), ('x2', self.x2)]} + self.attrs = { + 'axis': self.axis, + 'use_mkldnn': True, + 'mkldnn_data_type': self.mkldnn_data_type + } + + self.output = np.concatenate( + (self.x0, self.x1, self.x2), axis=self.axis).astype(np.uint16) + self.outputs = {'Out': self.output} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + +# --------------------test concat bf16 in with axis 0-------------------- + + def init_test_data(self): + self.x0 = convert_float_to_uint16( + np.random.random(self.x0_shape).astype(np.float32)) + self.x1 = convert_float_to_uint16( + np.random.random(self.x1_shape).astype(np.float32)) + self.x2 = convert_float_to_uint16( + np.random.random(self.x2_shape).astype(np.float32)) + + def init_axis(self): + self.axis = 0 + + def init_shape(self): + self.x0_shape = [2, 2, 1, 2] + self.x1_shape = [1, 2, 1, 2] + self.x2_shape = [3, 2, 1, 2] + + +# --------------------test concat bf16 in with axis 1-------------------- + + +class TestAxis1Case(TestConcatBf16Op): + def init_axis(self): + self.axis = 1 + + def init_shape(self): + self.x0_shape = [1, 1, 5, 5] + self.x1_shape = [1, 2, 5, 5] + self.x2_shape = [1, 3, 5, 5] + + +# --------------------test concat bf16 in with axis 2-------------------- + + +class TestAxis2Case(TestConcatBf16Op): + def init_axis(self): + self.axis = 2 + + def init_shape(self): + self.x0_shape = [2, 3, 4, 5] + self.x1_shape = [2, 3, 5, 5] + self.x2_shape = [2, 3, 6, 5] + + +# --------------------test concat bf16 in with axis 3-------------------- + + +class TestAxis3Case(TestConcatBf16Op): + def init_axis(self): + self.axis = 3 + + def init_shape(self): + self.x0_shape = [2, 3, 5, 5] + self.x1_shape = [2, 3, 5, 6] + self.x2_shape = [2, 3, 5, 7] + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py new file mode 100644 index 0000000000000..854ddb17fb275 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_reshape_bf16_op.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import struct + +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +from paddle import enable_static + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestReshapeBf16Op(OpTest): + def setUp(self): + enable_static() + self.op_type = "reshape2" + self.use_mkldnn = True + self.mkldnn_data_type = "bfloat16" + self.init_data() + self.init_input_data() + + self.inputs = {'X': self.input_data} + self.attrs = { + 'shape': self.new_shape, + 'use_mkldnn': self.use_mkldnn, + 'mkldnn_data_type': self.mkldnn_data_type + } + self.outputs = { + "Out": self.inputs["X"].reshape(self.infered_shape), + 'XShape': np.random.random(self.ori_shape).astype(np.float32) + } + + def init_data(self): + self.ori_shape = (10, 2, 6) + self.new_shape = (10, 0, 3, -1) + self.infered_shape = (10, 2, 3, -1) + + def init_input_data(self): + self.input_data = convert_float_to_uint16( + np.random.random(self.ori_shape).astype(np.float32)) + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), no_check_set=['XShape']) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_bf16_mkldnn_op.py new file mode 100644 index 0000000000000..de04cecbf4c9b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_transpose_bf16_mkldnn_op.py @@ -0,0 +1,66 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +from paddle import enable_static + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestTransposeOp(OpTest): + def setUp(self): + enable_static() + self.op_type = "transpose2" + self.use_mkldnn = True + self.mkldnn_data_type = "bfloat16" + self.init_test_case() + self.init_test_data() + self.axis = (0, 2, 3, 1) + + self.inputs = {'X': self.input_data} + + self.attrs = { + 'axis': list(self.axis), + 'use_mkldnn': self.use_mkldnn, + 'mkldnn_data_type': self.mkldnn_data_type + } + + self.outputs = { + 'XShape': np.random.random(self.shape).astype(np.uint16), + 'Out': self.inputs['X'].transpose(self.axis) + } + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), no_check_set=['XShape']) + + def init_test_case(self): + self.shape = (2, 3, 4, 5) + + def init_test_data(self): + self.input_data = convert_float_to_uint16( + np.random.random(self.shape).astype(np.float32)) + + +class TestBF16Case(TestTransposeOp): + def init_test_case(self): + self.shape = (2, 4, 6, 8) + + +if __name__ == '__main__': + unittest.main() From 3ccc0a2f5e059772565d1411372148035e337b10 Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Thu, 29 Oct 2020 11:40:35 +0800 Subject: [PATCH 081/185] enable test_parallel_executor_fetch_isolated_var (#28219) * enable test_parallel_executor_fetch_isolated_var, test=develop * add enable_static, test=develop * set test_parallel_executor_fetch_isolated_var RUN_TYPE=DIST, develop=test --- python/paddle/fluid/tests/unittests/CMakeLists.txt | 2 +- .../unittests/test_parallel_executor_fetch_isolated_var.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 4cd9d9e530d87..52950a4d92a71 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -339,7 +339,6 @@ list(REMOVE_ITEM TEST_OPS test_conv3d_transpose_op) # disable this unittest temporarily list(REMOVE_ITEM TEST_OPS test_imperative_data_loader_exception) list(REMOVE_ITEM TEST_OPS test_sampling_id_op) -list(REMOVE_ITEM TEST_OPS test_parallel_executor_fetch_isolated_var) if (APPLE OR WIN32) list(REMOVE_ITEM TEST_OPS test_dataset) @@ -587,6 +586,7 @@ set_tests_properties(test_parallel_executor_crf test_sync_batch_norm_op test_inp test_parallel_executor_seresnext_with_reduce_gpu test_parallel_executor_seresnext_with_fuse_all_reduce_gpu test_parallel_executor_profiler + test_parallel_executor_fetch_isolated_var PROPERTIES LABELS "RUN_TYPE=DIST") if(NOT WIN32 AND NOT APPLE) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py index 13932238705f5..d64aa510f4e1a 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_isolated_var.py @@ -16,6 +16,7 @@ import numpy as np import six import paddle.fluid as fluid +import paddle def enable_parallel_ssa_executor(enabled=True): @@ -57,6 +58,7 @@ def test_main(self): def run_impl(self, use_gpu, dev_cnt, is_training, use_experimental_executor, use_parallel_ssa_executor): + paddle.enable_static() 
enable_parallel_ssa_executor(use_parallel_ssa_executor) if fluid.is_compiled_with_cuda(): From f763cb81a697a364e57f0b6104af4a5d6d96b7d5 Mon Sep 17 00:00:00 2001 From: iducn <45056973+iducn@users.noreply.github.com> Date: Thu, 29 Oct 2020 12:27:09 +0800 Subject: [PATCH 082/185] Modify the shell script according to the specification (#28302) * 01:Modify the shell script according to the specification * 01:Modify the shell script according to the specification --- paddle/.set_port.sh | 6 +- paddle/.set_python_path.sh | 8 +- paddle/fluid/inference/api/demo_ci/clean.sh | 3 +- paddle/fluid/inference/api/demo_ci/run.sh | 175 +++++++++--------- paddle/fluid/inference/check_symbol.sh | 12 +- paddle/fluid/train/demo/clean.sh | 2 +- paddle/fluid/train/demo/run.sh | 12 +- paddle/fluid/train/imdb_demo/run.sh | 2 +- paddle/scripts/paddle_docker_build.sh | 32 ++-- tools/cudaError/start.sh | 4 +- .../dockerfile/build_scripts/install_nccl2.sh | 4 +- tools/gen_alias_mapping.sh | 4 +- .../manylinux1/build_scripts/install_nccl2.sh | 21 ++- 13 files changed, 154 insertions(+), 131 deletions(-) diff --git a/paddle/.set_port.sh b/paddle/.set_port.sh index 617ac79a24889..e71f494aadf2c 100755 --- a/paddle/.set_port.sh +++ b/paddle/.set_port.sh @@ -13,6 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. -DIRNAME=`dirname $0` -source $DIRNAME/.common_test_util.sh -set_port $@ +DIRNAME="$(dirname "$0")" +sh "$DIRNAME"/.common_test_util.sh +set_port "$@" diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh index 8fd58925ee482..8da4565be617b 100755 --- a/paddle/.set_python_path.sh +++ b/paddle/.set_python_path.sh @@ -24,12 +24,14 @@ PYPATH="" set -x while getopts "d:" opt; do - case $opt in + case "$opt" in d) PYPATH=$OPTARG ;; + *) + ;; esac done -shift $(($OPTIND - 1)) +shift $(("$OPTIND" - 1)) export PYTHONPATH=$PYPATH:$PYTHONPATH -$@ +"$@" diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh index 0d9f3d2aa237a..5f603465776f1 100755 --- a/paddle/fluid/inference/api/demo_ci/clean.sh +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -1,4 +1,5 @@ +#!/bin/bash set -x -cd `dirname $0` +cd "$(dirname "$0")" || exit rm -rf build/ data/ set +x diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index 6d283ca56cb65..aee013e8f3652 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -1,29 +1,29 @@ #!/bin/bash set -x -PADDLE_ROOT=$1 -TURN_ON_MKL=$2 # use MKL or Openblas -TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode -DATA_DIR=$4 # dataset -TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include -TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib -MSVC_STATIC_CRT=$7 -inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir +PADDLE_ROOT="$1" +TURN_ON_MKL="$2" # use MKL or Openblas +TEST_GPU_CPU="$3" # test both GPU/CPU mode or only CPU mode +DATA_DIR="$4" # dataset +TENSORRT_INCLUDE_DIR="$5" # TensorRT header file dir, default to /usr/local/TensorRT/include +TENSORRT_LIB_DIR="$6" # TensorRT lib file dir, default to /usr/local/TensorRT/lib +MSVC_STATIC_CRT="$7" +inference_install_dir="${PADDLE_ROOT}"/build/paddle_inference_install_dir -cd `dirname $0` -current_dir=`pwd` -if [ $2 == ON ]; then +cd "$(dirname "$0")" || exit +current_dir=$(pwd) +if [ "$2" == ON ]; then # You can export yourself if move the install 
path - MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib - export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB} + MKL_LIB="${inference_install_dir}"/third_party/install/mklml/lib + export LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"${MKL_LIB}" fi -if [ $3 == ON ]; then +if [ "$3" == ON ]; then use_gpu_list='true false' else use_gpu_list='false' fi USE_TENSORRT=OFF -if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then +if [ -d "$TENSORRT_INCLUDE_DIR" ] && [ -d "$TENSORRT_LIB_DIR" ]; then USE_TENSORRT=ON fi @@ -32,77 +32,79 @@ URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} # download vis_demo data function download() { - dir_name=$1 - mkdir -p $dir_name - cd $dir_name + dir_name="$1" + mkdir -p "$dir_name" + cd "$dir_name" || exit if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then echo "${PREFIX}${dir_name}.tar.gz has been downloaded." else - wget -q ${URL_ROOT}$dir_name.tar.gz - tar xzf *.tar.gz + wget -q "${URL_ROOT}""$dir_name".tar.gz + tar xzf ./*.tar.gz fi - cd .. + cd .. || exit } -mkdir -p $DATA_DIR -cd $DATA_DIR +mkdir -p "$DATA_DIR" +cd "$DATA_DIR" || exit vis_demo_list='se_resnext50 ocr mobilenet' for vis_demo_name in $vis_demo_list; do - download $vis_demo_name + download "$vis_demo_name" done # download word2vec data mkdir -p word2vec -cd word2vec +cd word2vec || exit if [[ -e "word2vec.inference.model.tar.gz" ]]; then echo "word2vec.inference.model.tar.gz has been downloaded." else wget -q http://paddle-inference-dist.bj.bcebos.com/word2vec.inference.model.tar.gz - tar xzf *.tar.gz + tar xzf ./*.tar.gz fi # compile and test the demo -cd $current_dir +cd "$current_dir" || exit mkdir -p build -cd build -rm -rf * +cd build || exit +rm -rf ./* for WITH_STATIC_LIB in ON OFF; do - if [ $(echo `uname` | grep "Win") != "" ]; then + if [ "$(uname | grep Win)" != "" ]; then # -----simple_on_word2vec on windows----- - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ + cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB="${inference_install_dir}" \ + -DWITH_MKL="$TURN_ON_MKL" \ -DDEMO_NAME=simple_on_word2vec \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DWITH_GPU="$TEST_GPU_CPU" \ + -DWITH_STATIC_LIB="$WITH_STATIC_LIB" \ + -DMSVC_STATIC_CRT="$MSVC_STATIC_CRT" msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do Release/simple_on_word2vec.exe \ - --dirname=$DATA_DIR/word2vec/word2vec.inference.model \ - --use_gpu=$use_gpu - if [ $? -ne 0 ]; then + --dirname="$DATA_DIR"/word2vec/word2vec.inference.model \ + --use_gpu="$use_gpu" + EXCODE="$?" + if [ "$EXCODE" -ne 0 ]; then echo "simple_on_word2vec demo runs fail." exit 1 fi done # -----vis_demo on windows----- - rm -rf * - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ + rm -rf ./* + cmake .. 
-G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB="${inference_install_dir}" \ + -DWITH_MKL="$TURN_ON_MKL" \ -DDEMO_NAME=vis_demo \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT + -DWITH_GPU="$TEST_GPU_CPU" \ + -DWITH_STATIC_LIB="$WITH_STATIC_LIB" \ + -DMSVC_STATIC_CRT="$MSVC_STATIC_CRT" msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do Release/vis_demo.exe \ - --modeldir=$DATA_DIR/$vis_demo_name/model \ - --data=$DATA_DIR/$vis_demo_name/data.txt \ - --refer=$DATA_DIR/$vis_demo_name/result.txt \ - --use_gpu=$use_gpu - if [ $? -ne 0 ]; then + --modeldir="$DATA_DIR"/"$vis_demo_name"/model \ + --data="$DATA_DIR"/"$vis_demo_name"/data.txt \ + --refer="$DATA_DIR"/"$vis_demo_name"/result.txt \ + --use_gpu="$use_gpu" + EXCODE="$?" + if [ "$EXCODE" -ne 0 ]; then echo "vis demo $vis_demo_name runs fail." exit 1 fi @@ -110,63 +112,66 @@ for WITH_STATIC_LIB in ON OFF; do done else # -----simple_on_word2vec on linux/mac----- - rm -rf * - cmake .. -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ + rm -rf ./* + cmake .. -DPADDLE_LIB="${inference_install_dir}" \ + -DWITH_MKL="$TURN_ON_MKL" \ -DDEMO_NAME=simple_on_word2vec \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB - make -j$(nproc) - word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model' - if [ -d $word2vec_model ]; then + -DWITH_GPU="$TEST_GPU_CPU" \ + -DWITH_STATIC_LIB="$WITH_STATIC_LIB" + make -j"$(nproc)" + word2vec_model="$DATA_DIR"'/word2vec/word2vec.inference.model' + if [ -d "$word2vec_model" ]; then for use_gpu in $use_gpu_list; do ./simple_on_word2vec \ - --dirname=$DATA_DIR/word2vec/word2vec.inference.model \ - --use_gpu=$use_gpu - if [ $? -ne 0 ]; then + --dirname="$DATA_DIR"/word2vec/word2vec.inference.model \ + --use_gpu="$use_gpu" + EXCODE="$?" + if [ "$EXCODE" -ne 0 ]; then echo "simple_on_word2vec demo runs fail." exit 1 fi done fi # ---------vis_demo on linux/mac--------- - rm -rf * - cmake .. -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ + rm -rf ./* + cmake .. -DPADDLE_LIB="${inference_install_dir}" \ + -DWITH_MKL="$TURN_ON_MKL" \ -DDEMO_NAME=vis_demo \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB - make -j$(nproc) + -DWITH_GPU="$TEST_GPU_CPU" \ + -DWITH_STATIC_LIB="$WITH_STATIC_LIB" + make -j"$(nproc)" for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do ./vis_demo \ - --modeldir=$DATA_DIR/$vis_demo_name/model \ - --data=$DATA_DIR/$vis_demo_name/data.txt \ - --refer=$DATA_DIR/$vis_demo_name/result.txt \ - --use_gpu=$use_gpu - if [ $? -ne 0 ]; then + --modeldir="$DATA_DIR"/"$vis_demo_name"/model \ + --data="$DATA_DIR"/"$vis_demo_name"/data.txt \ + --refer="$DATA_DIR"/"$vis_demo_name"/result.txt \ + --use_gpu="$use_gpu" + EXCODE="$?" + if [ "$EXCODE" -ne 0 ]; then echo "vis demo $vis_demo_name runs fail." exit 1 fi done done # --------tensorrt mobilenet on linux/mac------ - if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then - rm -rf * - cmake .. -DPADDLE_LIB=${inference_install_dir} \ - -DWITH_MKL=$TURN_ON_MKL \ + if [ "$USE_TENSORRT" == ON ] && [ "$TEST_GPU_CPU" == ON ]; then + rm -rf ./* + cmake .. 
-DPADDLE_LIB="${inference_install_dir}" \ + -DWITH_MKL="$TURN_ON_MKL" \ -DDEMO_NAME=trt_mobilenet_demo \ - -DWITH_GPU=$TEST_GPU_CPU \ - -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ - -DUSE_TENSORRT=$USE_TENSORRT \ - -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ - -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR - make -j$(nproc) + -DWITH_GPU="$TEST_GPU_CPU" \ + -DWITH_STATIC_LIB="$WITH_STATIC_LIB" \ + -DUSE_TENSORRT="$USE_TENSORRT" \ + -DTENSORRT_INCLUDE_DIR="$TENSORRT_INCLUDE_DIR" \ + -DTENSORRT_LIB_DIR="$TENSORRT_LIB_DIR" + make -j"$(nproc)" ./trt_mobilenet_demo \ - --modeldir=$DATA_DIR/mobilenet/model \ - --data=$DATA_DIR/mobilenet/data.txt \ - --refer=$DATA_DIR/mobilenet/result.txt - if [ $? -ne 0 ]; then + --modeldir="$DATA_DIR"/mobilenet/model \ + --data="$DATA_DIR"/mobilenet/data.txt \ + --refer="$DATA_DIR"/mobilenet/result.txt + EXCODE="$?" + if [ "$EXCODE" != 0 ]; then echo "trt demo trt_mobilenet_demo runs fail." exit 1 fi diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh index a0f64796576c8..0c66946c4b8a1 100755 --- a/paddle/fluid/inference/check_symbol.sh +++ b/paddle/fluid/inference/check_symbol.sh @@ -1,12 +1,12 @@ #!/bin/sh -lib=$1 -if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi +lib="$1" +if [ "$#" -ne 1 ]; then echo "No input library"; exit 1 ; fi -num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l) -num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep "T " | wc -l) +num_paddle_syms=$(nm -D "${lib}" | grep -c paddle ) +num_google_syms=$(nm -D "${lib}" | grep google | grep -v paddle | grep -c "T " ) -if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi -if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi +if [ "$num_paddle_syms" -le 0 ]; then echo "Have no paddle symbols"; exit 1 ; fi +if [ "$num_google_syms" -ge 1 ]; then echo "Have some google symbols"; exit 1 ; fi exit 0 diff --git a/paddle/fluid/train/demo/clean.sh b/paddle/fluid/train/demo/clean.sh index a2064492c08b8..192bdf8752c15 100755 --- a/paddle/fluid/train/demo/clean.sh +++ b/paddle/fluid/train/demo/clean.sh @@ -15,6 +15,6 @@ # limitations under the License. set -x -cd "$(dirname "$0")" +cd "$(dirname "$0")" || exit rm -rf build/ set +x diff --git a/paddle/fluid/train/demo/run.sh b/paddle/fluid/train/demo/run.sh index 2955e7574daa2..a9c0ed4ac68a2 100755 --- a/paddle/fluid/train/demo/run.sh +++ b/paddle/fluid/train/demo/run.sh @@ -14,14 +14,14 @@ function download() { download # build demo trainer -paddle_install_dir=${PADDLE_ROOT}/build/paddle_install_dir +paddle_install_dir="${PADDLE_ROOT}"/build/paddle_install_dir mkdir -p build -cd build -rm -rf * -cmake .. -DPADDLE_LIB=$paddle_install_dir \ - -DWITH_MKLDNN=$TURN_ON_MKL \ - -DWITH_MKL=$TURN_ON_MKL +cd build || exit +rm -rf ./* +cmake .. -DPADDLE_LIB="$paddle_install_dir" \ + -DWITH_MKLDNN="$TURN_ON_MKL" \ + -DWITH_MKL="$TURN_ON_MKL" make cd .. diff --git a/paddle/fluid/train/imdb_demo/run.sh b/paddle/fluid/train/imdb_demo/run.sh index f71b4bac602a9..8a585c614e53f 100644 --- a/paddle/fluid/train/imdb_demo/run.sh +++ b/paddle/fluid/train/imdb_demo/run.sh @@ -1,3 +1,3 @@ - +#!/bin/bash set -exu build/demo_trainer --flagfile="train.cfg" diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh index d6b639d0da2a5..fdd0d490a6fdb 100755 --- a/paddle/scripts/paddle_docker_build.sh +++ b/paddle/scripts/paddle_docker_build.sh @@ -15,14 +15,14 @@ # limitations under the License. 
function start_build_docker() { - docker pull $IMG + docker pull "$IMG" apt_mirror='s#http://archive.ubuntu.com/ubuntu#mirror://mirrors.ubuntu.com/mirrors.txt#g' DOCKER_ENV=$(cat <\t,,,... -PADDLE_ROOT="$(dirname $(readlink -f ${BASH_SOURCE[0]}))/.." +PADDLE_ROOT="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")/.." -find ${PADDLE_ROOT}/python/ -name '*.py' \ +find "${PADDLE_ROOT}"/python/ -name '*.py' \ | xargs grep -v '^#' \ | grep 'DEFINE_ALIAS' \ | perl -ne ' diff --git a/tools/manylinux1/build_scripts/install_nccl2.sh b/tools/manylinux1/build_scripts/install_nccl2.sh index 0c9bf1409d90d..c2adf6a79de4b 100644 --- a/tools/manylinux1/build_scripts/install_nccl2.sh +++ b/tools/manylinux1/build_scripts/install_nccl2.sh @@ -1,4 +1,19 @@ #!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //") if [ "$VERSION" == "10.0" ]; then DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" @@ -24,10 +39,10 @@ wget -q -O $DIR/$DEB $URL cd $DIR && ar x $DEB && tar xf data.tar.xz DEBS=$(find ./var/ -name "*.deb") for sub_deb in $DEBS; do - echo $sub_deb - ar x $sub_deb && tar xf data.tar.xz + echo "$sub_deb" + ar x "$sub_deb" && tar xf data.tar.xz done mv -f usr/include/nccl.h /usr/local/include/ mv -f usr/lib/x86_64-linux-gnu/libnccl* /usr/local/lib/ rm /usr/include/nccl.h -rm -rf $DIR +rm -rf "$DIR" From e1fb46739ab082937edb9fa9c910cde18b712045 Mon Sep 17 00:00:00 2001 From: joejiong Date: Thu, 29 Oct 2020 13:59:20 +0800 Subject: [PATCH 083/185] move cinn dockerfile to dockerfile folder (#28281) --- .../Dockerfile.cuda10_ubuntu18_cinn | 152 ++++++++++++++++++ 1 file changed, 152 insertions(+) create mode 100644 tools/dockerfile/Dockerfile.cuda10_ubuntu18_cinn diff --git a/tools/dockerfile/Dockerfile.cuda10_ubuntu18_cinn b/tools/dockerfile/Dockerfile.cuda10_ubuntu18_cinn new file mode 100644 index 0000000000000..964f082b56137 --- /dev/null +++ b/tools/dockerfile/Dockerfile.cuda10_ubuntu18_cinn @@ -0,0 +1,152 @@ +# A image for building paddle binaries +# Use cuda devel base image for both cpu and gpu environment +# When you modify it, please be aware of cudnn-runtime version +FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu18.04 +MAINTAINER PaddlePaddle Authors + +# ENV variables +ARG WITH_GPU +ARG WITH_AVX + +ENV WITH_GPU=${WITH_GPU:-ON} +ENV WITH_AVX=${WITH_AVX:-ON} +ENV DEBIAN_FRONTEND=noninteractive + +ENV HOME /root +# Add bash enhancements +COPY paddle/scripts/docker/root/ /root/ + +RUN apt-get update && \ + apt-get install -y software-properties-common && add-apt-repository ppa:deadsnakes/ppa && \ + apt-get update && \ + apt-get install -y curl wget vim git unzip unrar tar xz-utils bzip2 gzip \ + coreutils ntp language-pack-zh-hans python-qt4 libsm6 libxext6 libxrender-dev + + +# Downgrade gcc&&g++ +WORKDIR /usr/bin + RUN apt-get update --fix-missing + COPY tools/dockerfile/build_scripts /build_scripts + RUN bash 
/build_scripts/install_gcc.sh gcc82 && rm -rf /build_scripts + RUN cp gcc gcc.bak && cp g++ g++.bak && rm gcc && rm g++ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/local/bin/gcc + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/local/bin/g++ + RUN ln -s /usr/local/gcc-8.2/bin/gcc /usr/bin/gcc + RUN ln -s /usr/local/gcc-8.2/bin/g++ /usr/bin/g++ + ENV PATH=/usr/local/gcc-8.2/bin:$PATH + +RUN apt-get update && \ + apt-get install -y python2.7 python2.7-dev \ + python3.5 python3.5-dev \ + python3.6 python3.6-dev \ + python3.7 python3.7-dev \ + python3.8 python3.8-dev && \ + curl https://bootstrap.pypa.io/ez_setup.py -o - | python2.7 && easy_install pip && \ + curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.5 && easy_install pip && \ + curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.6 && easy_install pip && \ + curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.7 && easy_install pip && \ + curl https://bootstrap.pypa.io/ez_setup.py -o - | python3.8 && easy_install pip && \ + rm /usr/bin/python && ln -s /usr/bin/python2.7 /usr/bin/python && \ + rm /usr/bin/python3 && ln -s /usr/bin/python3.5 /usr/bin/python3 && \ + rm /usr/local/bin/pip && ln -s /usr/local/bin/pip2.7 /usr/local/bin/pip && \ + rm /usr/local/bin/pip3 && ln -s /usr/local/bin/pip3.5 /usr/local/bin/pip3 + + +# install cmake +WORKDIR /home +RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz && tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz && rm cmake-3.16.0-Linux-x86_64.tar.gz +ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH + + +# remove them when apt-get support 2.27 and higher version +RUN wget -q https://ftp.gnu.org/gnu/binutils/binutils-2.33.1.tar.gz && \ + tar -xzf binutils-2.33.1.tar.gz && \ + cd binutils-2.33.1 && \ + ./configure && make -j && make install && cd .. && rm -rf binutils-2.33.1 binutils-2.33.1.tar.gz + + +# Install Go and glide +RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. 
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +RUN pip3 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip3.6 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3.6 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip3.7 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3.7 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip3.8 --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip3.8 --no-cache-dir install ipykernel==4.6.0 wheel && \ + pip --no-cache-dir install pre-commit==1.10.4 ipython==5.3.0 && \ + pip --no-cache-dir install ipykernel==4.6.0 wheel + +#For docstring checker +RUN pip3 --no-cache-dir install pylint pytest astroid isort && \ + pip3.6 --no-cache-dir install pylint pytest astroid isort && \ + pip3.7 --no-cache-dir install pylint pytest astroid isort && \ + pip3.8 --no-cache-dir install pylint pytest astroid isort && \ + pip --no-cache-dir install pylint pytest astroid isort + +COPY ./python/requirements.txt /root/ +RUN pip3 --no-cache-dir install -r /root/requirements.txt && \ + pip3.6 --no-cache-dir install -r /root/requirements.txt && \ + pip3.7 --no-cache-dir install -r /root/requirements.txt && \ + pip3.8 --no-cache-dir install -r /root/requirements.txt && \ + pip --no-cache-dir install -r /root/requirements.txt + + +# Older versions of patchelf limited the size of the files being processed and were fixed in this pr. +# https://github.com/NixOS/patchelf/commit/ba2695a8110abbc8cc6baf0eea819922ee5007fa +# So install a newer version here. +RUN wget -q http://mirrors.kernel.org/ubuntu/pool/universe/p/patchelf/patchelf_0.10-2_amd64.deb && \ + dpkg -i patchelf_0.10-2_amd64.deb + +# Configure OpenSSH server. c.f. 
https://docs.docker.com/engine/examples/running_ssh_service
+#RUN mkdir /var/run/sshd && echo 'root:root' | chpasswd && sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config && sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+#CMD source ~/.bashrc
+
+# ccache 3.7.9
+RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
+    tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \
+    ./configure -prefix=/usr/local/ccache-3.7.9 && \
+    make -j8 && make install && \
+    ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
+
+# For CINN environment
+RUN apt update --fix-missing
+RUN apt-get install autoconf autogen
+RUN apt-get install libtool
+RUN apt-get install zlib1g-dev
+RUN apt install libginac-dev -y
+RUN apt install clang cmake -y
+RUN python3 -m pip install numpy
+RUN python3 -m pip install pybind11
+
+
+# Install LLVM
+RUN echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic main" >> /etc/apt/source.list
+RUN echo "deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic main" >> /etc/apt/source.list
+RUN echo "deb http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main" >> /etc/apt/source.list
+RUN echo "deb-src http://apt.llvm.org/bionic/ llvm-toolchain-bionic-10 main" >> /etc/apt/source.list
+RUN ln -s /usr/bin/llvm-config-6.0 /usr/bin/llvm-config
+RUN wget -O - https://apt.llvm.org/llvm-snapshot.gpg.key|apt-key add -
+
+RUN apt update
+RUN apt install libclang-dev llvm-10 llvm-10-dev libclang-10-dev -y
+
+
+EXPOSE 22
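Nothing in this series actually builds the new CINN image, so a quick manual smoke test can be useful. The bash sequence below is purely illustrative and not part of Paddle's CI: the image tag is a placeholder, and the checks simply probe the tools this Dockerfile installs (gcc 8.2 via install_gcc.sh, cmake 3.16, LLVM 10 from apt.llvm.org, and the pip-installed numpy/pybind11).

#!/bin/bash
# Build from the repository root so the COPY directives resolve,
# then verify the CINN toolchain inside the container.
docker build -t paddle-cinn-dev -f tools/dockerfile/Dockerfile.cuda10_ubuntu18_cinn .
docker run --rm paddle-cinn-dev bash -c '
    gcc --version | head -n1        # expect gcc 8.2
    cmake --version | head -n1      # expect cmake 3.16
    llvm-config-10 --version        # expect LLVM 10
    python3 -c "import numpy, pybind11"
'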
From 1c385e26f9e6727c66f971a33f930c59f75a6639 Mon Sep 17 00:00:00 2001
From: wangguanzhong
Date: Thu, 29 Oct 2020 15:05:51 +0800
Subject: [PATCH 084/185] add op_function_generator for box_coder (#28303)

* add op_function_generator for box_coder

* fix format
---
 paddle/fluid/pybind/op_function_generator.cc | 1 +
 1 file changed, 1 insertion(+)

diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc
index cac44173c1772..92006bff2cc16 100644
--- a/paddle/fluid/pybind/op_function_generator.cc
+++ b/paddle/fluid/pybind/op_function_generator.cc
@@ -53,6 +53,7 @@ std::map<std::string, std::vector<std::string>> op_ins_map = {
      {"X", "W", "Label", "PathTable", "PathCode", "Bias"}},
     {"moving_average_abs_max_scale", {"X", "InAccum", "InState"}},
     {"multiclass_nms3", {"BBoxes", "Scores", "RoisNum"}},
+    {"box_coder", {"PriorBox", "PriorBoxVar", "TargetBox"}},
 };
 
 // NOTE(zhiqiu): Like op_ins_map.
From 8cd1c102d9ec0d04071422af999d04f3840a931b Mon Sep 17 00:00:00 2001
From: lidanqing
Date: Thu, 29 Oct 2020 14:32:08 +0100
Subject: [PATCH 085/185] Enable GRU infer model running CAPI (#28313)

* enable infer model running CAPI

* output size should be bigger than 0
---
 .../inference/tests/api/analyzer_lexical_analysis_gru_tester.cc | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
index e4035c8034137..7c5757ce9d4c6 100644
--- a/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_lexical_analysis_gru_tester.cc
@@ -190,7 +190,7 @@ std::vector<double> Lexical_Test(
     // return acc_res;
   } else {
     EXPECT_GT(outputs->size(), 0UL);
-    EXPECT_EQ(outputs[0].size(), 1UL);
+    EXPECT_GT(outputs[0].size(), 0UL);
     LOG(INFO) << "No accuracy result. To get accuracy result provide a model "
                  "with accuracy layers in it and use --with_accuracy_layer "
                  "option.";
From 26ede6e07e051061981498851f50b9c59b8133f3 Mon Sep 17 00:00:00 2001
From: zhulei <563755780@qq.com>
Date: Fri, 30 Oct 2020 17:39:37 +0800
Subject: [PATCH 086/185] Add median api. (#28310)

* Add median api.

* Add median api.

* Add median api.

* Add median api.

* Add median api.
---
 python/paddle/__init__.py                    |  1 +
 .../fluid/tests/unittests/test_median.py     | 88 +++++++++++++++++++
 python/paddle/tensor/__init__.py             |  1 +
 python/paddle/tensor/stat.py                 | 88 ++++++++++++++++++-
 4 files changed, 177 insertions(+), 1 deletion(-)
 create mode 100644 python/paddle/fluid/tests/unittests/test_median.py

diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py
index ae4dda166c733..c8e0d830f4e17 100755
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -248,6 +248,7 @@ from .tensor.stat import var #DEFINE_ALIAS
 # from .fluid.data import data
 from .tensor.stat import numel #DEFINE_ALIAS
+from .tensor.stat import median #DEFINE_ALIAS
 from .device import get_cudnn_version
 from .device import set_device
 from .device import get_device
diff --git a/python/paddle/fluid/tests/unittests/test_median.py b/python/paddle/fluid/tests/unittests/test_median.py
new file mode 100644
index 0000000000000..be2206d0267ef
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_median.py
@@ -0,0 +1,88 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +from paddle.static import Program, program_guard + +DELTA = 1e-6 + + +class TestMedian(unittest.TestCase): + def check_numpy_res(self, np1, np2): + self.assertEqual(np1.shape, np2.shape) + mismatch = np.sum((np1 - np2) * (np1 - np2)) + self.assertAlmostEqual(mismatch, 0, DELTA) + + def static_single_test_median(self, lis_test): + paddle.enable_static() + x, axis, keepdims = lis_test + res_np = np.median(x, axis=axis, keepdims=keepdims) + if not isinstance(res_np, np.ndarray): + res_np = np.array([res_np]) + main_program = Program() + startup_program = Program() + exe = paddle.static.Executor() + with program_guard(main_program, startup_program): + x_in = paddle.fluid.data(shape=x.shape, dtype=x.dtype, name='x') + y = paddle.median(x_in, axis, keepdims) + [res_pd] = exe.run(feed={'x': x}, fetch_list=[y]) + self.check_numpy_res(res_pd, res_np) + paddle.disable_static() + + def dygraph_single_test_median(self, lis_test): + x, axis, keepdims = lis_test + res_np = np.median(x, axis=axis, keepdims=keepdims) + if not isinstance(res_np, np.ndarray): + res_np = np.array([res_np]) + res_pd = paddle.median(paddle.to_tensor(x), axis, keepdims) + self.check_numpy_res(res_pd.numpy(), res_np) + + def test_median_static(self): + h = 3 + w = 4 + l = 2 + x = np.arange(h * w * l).reshape([h, w, l]) + lis_tests = [[x, axis, keepdims] + for axis in [-1, 0, 1, 2, None] + for keepdims in [False, True]] + for lis_test in lis_tests: + self.static_single_test_median(lis_test) + + def test_median_dygraph(self): + paddle.disable_static() + h = 3 + w = 4 + l = 2 + x = np.arange(h * w * l).reshape([h, w, l]) + lis_tests = [[x, axis, keepdims] + for axis in [-1, 0, 1, 2, None] + for keepdims in [False, True]] + for lis_test in lis_tests: + self.dygraph_single_test_median(lis_test) + + def test_median_exception(self): + paddle.disable_static() + x = [1, 2, 3, 4] + self.assertRaises(TypeError, paddle.median, x) + x = paddle.arange(12).reshape([3, 4]) + self.assertRaises(ValueError, paddle.median, x, 1.0) + self.assertRaises(ValueError, paddle.median, x, 2) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index eaade222388fa..43e6c9654c4d8 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -190,6 +190,7 @@ from .stat import std #DEFINE_ALIAS from .stat import var #DEFINE_ALIAS from .stat import numel #DEFINE_ALIAS +from .stat import median #DEFINE_ALIAS # from .tensor import Tensor #DEFINE_ALIAS # from .tensor import LoDTensor #DEFINE_ALIAS # from .tensor import LoDTensorArray #DEFINE_ALIAS diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 24f62bfcd8d46..5647896066d38 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -14,7 +14,7 @@ # TODO: define statistical functions of a tensor -__all__ = ['mean', 'std', 'var', 'numel'] +__all__ = ['mean', 'std', 'var', 'numel', 'median'] import numpy as np from ..fluid.framework import Variable @@ -258,3 +258,89 @@ def numel(x, name=None): dtype=core.VarDesc.VarType.INT64) helper.append_op(type='size', inputs={'Input': x}, outputs={'Out': out}) return out + + +def median(x, axis=None, keepdim=False, name=None): + """ + Compute the median along the specified axis. + + Args: + x (Tensor): The input Tensor, it's data type can be bool, float16, float32, float64, int32, int64. 
+ axis (int, optional): The axis along which to perform median calculations ``axis`` should be int. + ``axis`` should be in range [-D, D), where D is the dimensions of ``x`` . + If ``axis`` is less than 0, it works the same way as :math:`axis + D`. + If ``axis`` is None, median is calculated over all elements of ``x``. Default is None. + keepdim (bool, optional): Whether to reserve the reduced dimension(s) + in the output Tensor. If ``keepdim`` is True, the dimensions of + the output Tensor is the same as ``x`` except in the reduced + dimensions(it is of size 1 in this case). Otherwise, the shape of + the output Tensor is squeezed in ``axis`` . Default is False. + name (str, optional): Name for the operation (optional, default is None). + For more information, please refer to :ref:`api_guide_Name`. + + Returns: + Tensor, results of median along ``axis`` of ``x``. If data type of ``x`` is float64, data type of results will be float64, otherwise data type will be float32. + + Examples: + .. code-block:: python + + import paddle + + x = paddle.arange(12).reshape([3, 4]) + # x is [[0 , 1 , 2 , 3 ], + # [4 , 5 , 6 , 7 ], + # [8 , 9 , 10, 11]] + + y1 = paddle.median(x) + # y1 is [5.5] + + y2 = paddle.median(x, axis=0) + # y2 is [4., 5., 6., 7.] + + y3 = paddle.median(x, axis=1) + # y3 is [1.5, 5.5, 9.5] + + y4 = paddle.median(x, axis=0, keepdim=True) + # y4 is [[4., 5., 6., 7.]] + + """ + if not isinstance(x, Variable): + raise TypeError("In median, the input x should be a Tensor.") + is_flatten = axis is None + dims = len(x.shape) + if is_flatten: + x = paddle.flatten(x) + axis = 0 + else: + if not isinstance(axis, int) or not (axis < dims and axis >= -dims): + raise ValueError( + "In median, axis should be none or an integer in range [-rank(x), rank(x))." 
+ ) + if axis < 0: + axis += dims + sz = x.shape[axis] + kth = sz >> 1 + tensor_topk, idx = paddle.topk(x, kth + 1, axis=axis, largest=False) + dtype = 'float64' if x.dtype == core.VarDesc.VarType.FP64 else 'float32' + if sz & 1 == 0: + out_tensor = paddle.slice( + tensor_topk, axes=[axis], starts=[kth - 1], + ends=[kth]) + paddle.slice( + tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1]) + out_tensor = paddle.cast(out_tensor, dtype=dtype) / 2 + else: + out_tensor = paddle.cast( + paddle.slice( + tensor_topk, axes=[axis], starts=[kth], ends=[kth + 1]), + dtype=dtype) + if not keepdim or is_flatten: + if not is_flatten: + newshape = x.shape[:axis] + x.shape[axis + 1:] + elif not keepdim: + newshape = [1] + else: + newshape = [1] * dims + else: + newshape = out_tensor.shape + out_tensor = out_tensor.reshape(newshape, name=name) + return out_tensor From fb1e0c93c9cc3d6d4422233c493b3c938dcb4979 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Fri, 30 Oct 2020 18:36:43 +0800 Subject: [PATCH 087/185] Make vision datasets return PIL.Image as default (#28264) * return pil image as default according backend --- python/paddle/hapi/callbacks.py | 23 ++++++-- python/paddle/hapi/model.py | 45 ++++++++++++---- python/paddle/tests/test_callbacks.py | 8 ++- python/paddle/tests/test_dataset_cifar.py | 54 ++++++++++++++++--- python/paddle/tests/test_dataset_voc.py | 24 +++++++++ python/paddle/tests/test_datasets.py | 42 ++++++++++++++- python/paddle/vision/datasets/cifar.py | 44 ++++++++++++--- python/paddle/vision/datasets/flowers.py | 27 ++++++++-- python/paddle/vision/datasets/mnist.py | 31 +++++++++-- python/paddle/vision/datasets/voc2012.py | 35 ++++++++++-- python/paddle/vision/transforms/transforms.py | 2 + 11 files changed, 291 insertions(+), 44 deletions(-) diff --git a/python/paddle/hapi/callbacks.py b/python/paddle/hapi/callbacks.py index 4a1751b331d21..8a89ee8517426 100644 --- a/python/paddle/hapi/callbacks.py +++ b/python/paddle/hapi/callbacks.py @@ -296,12 +296,17 @@ class ProgBarLogger(Callback): .. code-block:: python import paddle + import paddle.vision.transforms as T from paddle.static import InputSpec inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')] labels = [InputSpec([None, 1], 'int64', 'label')] - train_dataset = paddle.vision.datasets.MNIST(mode='train') + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) lenet = paddle.vision.LeNet() model = paddle.Model(lenet, @@ -432,12 +437,17 @@ class ModelCheckpoint(Callback): .. code-block:: python import paddle + import paddle.vision.transforms as T from paddle.static import InputSpec inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')] labels = [InputSpec([None, 1], 'int64', 'label')] - train_dataset = paddle.vision.datasets.MNIST(mode='train') + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) lenet = paddle.vision.LeNet() model = paddle.Model(lenet, @@ -484,13 +494,18 @@ class VisualDL(Callback): .. 
code-block:: python import paddle + import paddle.vision.transforms as T from paddle.static import InputSpec inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')] labels = [InputSpec([None, 1], 'int64', 'label')] - train_dataset = paddle.vision.datasets.MNIST(mode='train') - eval_dataset = paddle.vision.datasets.MNIST(mode='test') + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) + eval_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) net = paddle.vision.LeNet() model = paddle.Model(net, inputs, labels) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index aa99d698bc7b2..466b6f2e63ec5 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -837,6 +837,7 @@ class Model(object): import paddle import paddle.nn as nn + import paddle.vision.transforms as T from paddle.static import InputSpec device = paddle.set_device('cpu') # or 'gpu' @@ -858,7 +859,11 @@ class Model(object): paddle.nn.CrossEntropyLoss(), paddle.metric.Accuracy()) - data = paddle.vision.datasets.MNIST(mode='train') + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + data = paddle.vision.datasets.MNIST(mode='train', transform=transform) model.fit(data, epochs=2, batch_size=32, verbose=1) """ @@ -1067,6 +1072,7 @@ def save(self, path, training=True): import paddle import paddle.nn as nn + import paddle.vision.transforms as T from paddle.static import InputSpec class Mnist(nn.Layer): @@ -1093,7 +1099,13 @@ def forward(self, x): optim = paddle.optimizer.SGD(learning_rate=1e-3, parameters=model.parameters()) model.prepare(optim, paddle.nn.CrossEntropyLoss()) - data = paddle.vision.datasets.MNIST(mode='train') + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + data = paddle.vision.datasets.MNIST(mode='train', transform=transform) + model.fit(data, epochs=1, batch_size=32, verbose=0) model.save('checkpoint/test') # save for training model.save('inference_model', False) # save for inference @@ -1353,14 +1365,19 @@ def fit( .. code-block:: python import paddle + import paddle.vision.transforms as T from paddle.static import InputSpec dynamic = True device = paddle.set_device('cpu') # or 'gpu' paddle.disable_static(device) if dynamic else None - - train_dataset = paddle.vision.datasets.MNIST(mode='train') - val_dataset = paddle.vision.datasets.MNIST(mode='test') + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) + val_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) input = InputSpec([None, 1, 28, 28], 'float32', 'image') label = InputSpec([None, 1], 'int64', 'label') @@ -1386,16 +1403,21 @@ def fit( .. 
code-block:: python import paddle + import paddle.vision.transforms as T from paddle.static import InputSpec dynamic = True device = paddle.set_device('cpu') # or 'gpu' paddle.disable_static(device) if dynamic else None - - train_dataset = paddle.vision.datasets.MNIST(mode='train') + + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + train_dataset = paddle.vision.datasets.MNIST(mode='train', transform=transform) train_loader = paddle.io.DataLoader(train_dataset, places=device, batch_size=64) - val_dataset = paddle.vision.datasets.MNIST(mode='test') + val_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) val_loader = paddle.io.DataLoader(val_dataset, places=device, batch_size=64) @@ -1522,10 +1544,15 @@ def evaluate( .. code-block:: python import paddle + import paddle.vision.transforms as T from paddle.static import InputSpec # declarative mode - val_dataset = paddle.vision.datasets.MNIST(mode='test') + transform = T.Compose([ + T.Transpose(), + T.Normalize([127.5], [127.5]) + ]) + val_dataset = paddle.vision.datasets.MNIST(mode='test', transform=transform) input = InputSpec([-1, 1, 28, 28], 'float32', 'image') label = InputSpec([None, 1], 'int64', 'label') diff --git a/python/paddle/tests/test_callbacks.py b/python/paddle/tests/test_callbacks.py index 5c349c5f1d35e..e9664be0bfdd3 100644 --- a/python/paddle/tests/test_callbacks.py +++ b/python/paddle/tests/test_callbacks.py @@ -24,6 +24,7 @@ from paddle.static import InputSpec from paddle.vision.models import LeNet from paddle.hapi.callbacks import config_callbacks +import paddle.vision.transforms as T class TestCallbacks(unittest.TestCase): @@ -112,8 +113,11 @@ def test_visualdl_callback(self): inputs = [InputSpec([-1, 1, 28, 28], 'float32', 'image')] labels = [InputSpec([None, 1], 'int64', 'label')] - train_dataset = paddle.vision.datasets.MNIST(mode='train') - eval_dataset = paddle.vision.datasets.MNIST(mode='test') + transform = T.Compose([T.Transpose(), T.Normalize([127.5], [127.5])]) + train_dataset = paddle.vision.datasets.MNIST( + mode='train', transform=transform) + eval_dataset = paddle.vision.datasets.MNIST( + mode='test', transform=transform) net = paddle.vision.LeNet() model = paddle.Model(net, inputs, labels) diff --git a/python/paddle/tests/test_dataset_cifar.py b/python/paddle/tests/test_dataset_cifar.py index 672de7ae8e94e..e84f73188666a 100644 --- a/python/paddle/tests/test_dataset_cifar.py +++ b/python/paddle/tests/test_dataset_cifar.py @@ -27,10 +27,11 @@ def test_main(self): # long time, randomly check 1 sample idx = np.random.randint(0, 50000) data, label = cifar[idx] + data = np.array(data) self.assertTrue(len(data.shape) == 3) - self.assertTrue(data.shape[0] == 3) + self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) - self.assertTrue(data.shape[2] == 32) + self.assertTrue(data.shape[0] == 32) self.assertTrue(0 <= int(label) <= 9) @@ -43,12 +44,30 @@ def test_main(self): # long time, randomly check 1 sample idx = np.random.randint(0, 10000) data, label = cifar[idx] + data = np.array(data) self.assertTrue(len(data.shape) == 3) - self.assertTrue(data.shape[0] == 3) + self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) - self.assertTrue(data.shape[2] == 32) + self.assertTrue(data.shape[0] == 32) self.assertTrue(0 <= int(label) <= 9) + # test cv2 backend + cifar = Cifar10(mode='test', backend='cv2') + self.assertTrue(len(cifar) == 10000) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = 
np.random.randint(0, 10000) + data, label = cifar[idx] + self.assertTrue(len(data.shape) == 3) + self.assertTrue(data.shape[2] == 3) + self.assertTrue(data.shape[1] == 32) + self.assertTrue(data.shape[0] == 32) + self.assertTrue(0 <= int(label) <= 99) + + with self.assertRaises(ValueError): + cifar = Cifar10(mode='test', backend=1) + class TestCifar100Train(unittest.TestCase): def test_main(self): @@ -59,10 +78,11 @@ def test_main(self): # long time, randomly check 1 sample idx = np.random.randint(0, 50000) data, label = cifar[idx] + data = np.array(data) self.assertTrue(len(data.shape) == 3) - self.assertTrue(data.shape[0] == 3) + self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) - self.assertTrue(data.shape[2] == 32) + self.assertTrue(data.shape[0] == 32) self.assertTrue(0 <= int(label) <= 99) @@ -75,12 +95,30 @@ def test_main(self): # long time, randomly check 1 sample idx = np.random.randint(0, 10000) data, label = cifar[idx] + data = np.array(data) self.assertTrue(len(data.shape) == 3) - self.assertTrue(data.shape[0] == 3) + self.assertTrue(data.shape[2] == 3) self.assertTrue(data.shape[1] == 32) - self.assertTrue(data.shape[2] == 32) + self.assertTrue(data.shape[0] == 32) self.assertTrue(0 <= int(label) <= 99) + # test cv2 backend + cifar = Cifar100(mode='test', backend='cv2') + self.assertTrue(len(cifar) == 10000) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 10000) + data, label = cifar[idx] + self.assertTrue(len(data.shape) == 3) + self.assertTrue(data.shape[2] == 3) + self.assertTrue(data.shape[1] == 32) + self.assertTrue(data.shape[0] == 32) + self.assertTrue(0 <= int(label) <= 99) + + with self.assertRaises(ValueError): + cifar = Cifar100(mode='test', backend=1) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tests/test_dataset_voc.py b/python/paddle/tests/test_dataset_voc.py index d45df419b1283..6ca2a8e184ca3 100644 --- a/python/paddle/tests/test_dataset_voc.py +++ b/python/paddle/tests/test_dataset_voc.py @@ -32,6 +32,9 @@ def test_main(self): # long time, randomly check 1 sample idx = np.random.randint(0, 3) image, label = voc2012[idx] + image = np.array(image) + label = np.array(label) + self.assertTrue(len(image.shape) == 3) self.assertTrue(len(label.shape) == 2) @@ -45,6 +48,9 @@ def test_main(self): # long time, randomly check 1 sample idx = np.random.randint(0, 1) image, label = voc2012[idx] + image = np.array(image) + label = np.array(label) + self.assertTrue(len(image.shape) == 3) self.assertTrue(len(label.shape) == 2) @@ -58,9 +64,27 @@ def test_main(self): # long time, randomly check 1 sample idx = np.random.randint(0, 1) image, label = voc2012[idx] + image = np.array(image) + label = np.array(label) + + self.assertTrue(len(image.shape) == 3) + self.assertTrue(len(label.shape) == 2) + + # test cv2 backend + voc2012 = VOC2012(mode='test', backend='cv2') + self.assertTrue(len(voc2012) == 2) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 1) + image, label = voc2012[idx] + self.assertTrue(len(image.shape) == 3) self.assertTrue(len(label.shape) == 2) + with self.assertRaises(ValueError): + voc2012 = VOC2012(mode='test', backend=1) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/tests/test_datasets.py b/python/paddle/tests/test_datasets.py index 1e0d6dbacf6c4..1dc651f916c42 100644 --- a/python/paddle/tests/test_datasets.py +++ b/python/paddle/tests/test_datasets.py @@ -19,6 +19,7 
@@ import shutil import cv2 +import paddle.vision.transforms as T from paddle.vision.datasets import * from paddle.dataset.common import _check_exists_and_download @@ -89,7 +90,8 @@ def test_errors(self): class TestMNISTTest(unittest.TestCase): def test_main(self): - mnist = MNIST(mode='test') + transform = T.Transpose() + mnist = MNIST(mode='test', transform=transform) self.assertTrue(len(mnist) == 10000) for i in range(len(mnist)): @@ -103,7 +105,8 @@ def test_main(self): class TestMNISTTrain(unittest.TestCase): def test_main(self): - mnist = MNIST(mode='train') + transform = T.Transpose() + mnist = MNIST(mode='train', transform=transform) self.assertTrue(len(mnist) == 60000) for i in range(len(mnist)): @@ -114,6 +117,22 @@ def test_main(self): self.assertTrue(label.shape[0] == 1) self.assertTrue(0 <= int(label) <= 9) + # test cv2 backend + mnist = MNIST(mode='train', transform=transform, backend='cv2') + self.assertTrue(len(mnist) == 60000) + + for i in range(len(mnist)): + image, label = mnist[i] + self.assertTrue(image.shape[0] == 1) + self.assertTrue(image.shape[1] == 28) + self.assertTrue(image.shape[2] == 28) + self.assertTrue(label.shape[0] == 1) + self.assertTrue(0 <= int(label) <= 9) + break + + with self.assertRaises(ValueError): + mnist = MNIST(mode='train', transform=transform, backend=1) + class TestFlowersTrain(unittest.TestCase): def test_main(self): @@ -124,6 +143,7 @@ def test_main(self): # long time, randomly check 1 sample idx = np.random.randint(0, 6149) image, label = flowers[idx] + image = np.array(image) self.assertTrue(len(image.shape) == 3) self.assertTrue(image.shape[2] == 3) self.assertTrue(label.shape[0] == 1) @@ -138,6 +158,7 @@ def test_main(self): # long time, randomly check 1 sample idx = np.random.randint(0, 1020) image, label = flowers[idx] + image = np.array(image) self.assertTrue(len(image.shape) == 3) self.assertTrue(image.shape[2] == 3) self.assertTrue(label.shape[0] == 1) @@ -152,10 +173,27 @@ def test_main(self): # long time, randomly check 1 sample idx = np.random.randint(0, 1020) image, label = flowers[idx] + image = np.array(image) self.assertTrue(len(image.shape) == 3) self.assertTrue(image.shape[2] == 3) self.assertTrue(label.shape[0] == 1) + # test cv2 backend + flowers = Flowers(mode='test', backend='cv2') + self.assertTrue(len(flowers) == 1020) + + # traversal whole dataset may cost a + # long time, randomly check 1 sample + idx = np.random.randint(0, 1020) + image, label = flowers[idx] + + self.assertTrue(len(image.shape) == 3) + self.assertTrue(image.shape[2] == 3) + self.assertTrue(label.shape[0] == 1) + + with self.assertRaises(ValueError): + flowers = Flowers(mode='test', backend=1) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py index c531f3d0e4e3d..671632f871bac 100644 --- a/python/paddle/vision/datasets/cifar.py +++ b/python/paddle/vision/datasets/cifar.py @@ -17,6 +17,7 @@ import tarfile import numpy as np import six +from PIL import Image from six.moves import cPickle as pickle import paddle @@ -51,6 +52,10 @@ class Cifar10(Dataset): transform(callable): transform to perform on image, None for on transform. download(bool): whether to download dataset automatically if :attr:`data_file` is not set. Default True + backend(str, optional): Specifies which type of image to be returned: + PIL.Image or numpy.ndarray. Should be one of {'pil', 'cv2'}. 
+            If this option is not set, will get backend from ``paddle.vision.get_image_backend``,
+            default backend is 'pil'. Default: None.
 
     Returns:
         Dataset: instance of cifar-10 dataset
 
@@ -72,13 +77,14 @@ def __init__(self):
                 nn.Softmax())
 
             def forward(self, image, label):
-                image = paddle.reshape(image, (3, -1))
+                image = paddle.reshape(image, (1, -1))
                 return self.fc(image), label
 
             paddle.disable_static()
 
             normalize = Normalize(mean=[0.5, 0.5, 0.5],
-                                  std=[0.5, 0.5, 0.5])
+                                  std=[0.5, 0.5, 0.5],
+                                  data_format='HWC')
             cifar10 = Cifar10(mode='train', transform=normalize)
 
             for i in range(10):
@@ -96,11 +102,20 @@ def __init__(self,
                  data_file=None,
                  mode='train',
                  transform=None,
-                 download=True):
+                 download=True,
+                 backend=None):
         assert mode.lower() in ['train', 'test', 'train', 'test'], \
             "mode should be 'train10', 'test10', 'train100' or 'test100', but got {}".format(mode)
         self.mode = mode.lower()
 
+        if backend is None:
+            backend = paddle.vision.get_image_backend()
+        if backend not in ['pil', 'cv2']:
+            raise ValueError(
+                "Expected backend are one of ['pil', 'cv2'], but got {}"
+                .format(backend))
+        self.backend = backend
+
         self._init_url_md5_flag()
         self.data_file = data_file
@@ -143,8 +158,16 @@ def _load_data(self):
     def __getitem__(self, idx):
         image, label = self.data[idx]
         image = np.reshape(image, [3, 32, 32])
+        image = image.transpose([1, 2, 0])
+
+        if self.backend == 'pil':
+            image = Image.fromarray(image)
+
         if self.transform is not None:
             image = self.transform(image)
+
+        if self.backend == 'pil':
+            return image, np.array(label).astype('int64')
+
         return image.astype(self.dtype), np.array(label).astype('int64')
 
     def __len__(self):
@@ -163,6 +186,10 @@ class Cifar100(Cifar10):
         transform(callable): transform to perform on image, None for on transform.
         download(bool): whether to download dataset automatically if
             :attr:`data_file` is not set. Default True
+        backend(str, optional): Specifies which type of image to be returned:
+            PIL.Image or numpy.ndarray. Should be one of {'pil', 'cv2'}.
+            If this option is not set, will get backend from ``paddle.vision.get_image_backend``,
+            default backend is 'pil'. Default: None.
 
     Returns:
         Dataset: instance of cifar-100 dataset
 
@@ -184,13 +211,14 @@ def __init__(self):
                 nn.Softmax())
 
             def forward(self, image, label):
-                image = paddle.reshape(image, (3, -1))
+                image = paddle.reshape(image, (1, -1))
                 return self.fc(image), label
 
             paddle.disable_static()
 
             normalize = Normalize(mean=[0.5, 0.5, 0.5],
-                                  std=[0.5, 0.5, 0.5])
+                                  std=[0.5, 0.5, 0.5],
+                                  data_format='HWC')
             cifar100 = Cifar100(mode='train', transform=normalize)
 
             for i in range(10):
@@ -208,8 +236,10 @@ def __init__(self,
                  data_file=None,
                  mode='train',
                  transform=None,
-                 download=True):
-        super(Cifar100, self).__init__(data_file, mode, transform, download)
+                 download=True,
+                 backend=None):
+        super(Cifar100, self).__init__(data_file, mode, transform, download,
+                                       backend)
 
     def _init_url_md5_flag(self):
         self.data_url = CIFAR100_URL
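The `__getitem__` rewrite above is the heart of this commit: CIFAR samples are now transposed to HWC and wrapped in a `PIL.Image` unless the 'cv2' backend is selected. A minimal sketch of what callers see after the patch (it assumes the CIFAR-10 archive is cached locally or can be downloaded):

import numpy as np
from PIL import Image
from paddle.vision.datasets import Cifar10

# Default ('pil') backend: each sample is a 32x32 PIL image.
pil_set = Cifar10(mode='test')
img, label = pil_set[0]
assert isinstance(img, Image.Image) and img.size == (32, 32)

# 'cv2' backend: each sample is a plain HWC numpy array instead.
np_set = Cifar10(mode='test', backend='cv2')
img, label = np_set[0]
assert isinstance(img, np.ndarray) and img.shape == (32, 32, 3)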
+            If this option is not set, will get backend from ``paddle.vision.get_image_backend``,
+            default backend is 'pil'. Default: None.
 
     Examples:
 
@@ -67,7 +71,7 @@ class Flowers(Dataset):
 
             for i in range(len(flowers)):
                 sample = flowers[i]
-                print(sample[0].shape, sample[1])
+                print(sample[0].size, sample[1])
 
     """
 
@@ -77,9 +81,19 @@ def __init__(self,
                  setid_file=None,
                  mode='train',
                  transform=None,
-                 download=True):
+                 download=True,
+                 backend=None):
         assert mode.lower() in ['train', 'valid', 'test'], \
             "mode should be 'train', 'valid' or 'test', but got {}".format(mode)
+
+        if backend is None:
+            backend = paddle.vision.get_image_backend()
+        if backend not in ['pil', 'cv2']:
+            raise ValueError(
+                "Expected backend to be one of ['pil', 'cv2'], but got {}"
+                .format(backend))
+        self.backend = backend
+
         self.flag = MODE_FLAG_MAP[mode.lower()]
 
         self.data_file = data_file
@@ -122,11 +136,18 @@ def __getitem__(self, idx):
         img_name = "jpg/image_%05d.jpg" % index
         img_ele = self.name2mem[img_name]
         image = self.data_tar.extractfile(img_ele).read()
-        image = np.array(Image.open(io.BytesIO(image)))
+
+        if self.backend == 'pil':
+            image = Image.open(io.BytesIO(image))
+        elif self.backend == 'cv2':
+            image = np.array(Image.open(io.BytesIO(image)))
 
         if self.transform is not None:
             image = self.transform(image)
 
+        if self.backend == 'pil':
+            return image, label.astype('int64')
+
         return image.astype(self.dtype), label.astype('int64')
 
     def __len__(self):
diff --git a/python/paddle/vision/datasets/mnist.py b/python/paddle/vision/datasets/mnist.py
index 16c39e56ef0d6..c8bb6b3ca848d 100644
--- a/python/paddle/vision/datasets/mnist.py
+++ b/python/paddle/vision/datasets/mnist.py
@@ -18,6 +18,7 @@
 import gzip
 import struct
 import numpy as np
+from PIL import Image
 
 import paddle
 from paddle.io import Dataset
@@ -48,7 +49,11 @@ class MNIST(Dataset):
         mode(str): 'train' or 'test' mode. Default 'train'.
         download(bool): whether to download dataset automatically if
             :attr:`image_path` :attr:`label_path` is not set. Default True
-
+        backend(str, optional): Specifies which type of image to be returned:
+            PIL.Image or numpy.ndarray. Should be one of {'pil', 'cv2'}.
+            If this option is not set, will get backend from ``paddle.vision.get_image_backend``,
+            default backend is 'pil'. Default: None.
+
     Returns:
         Dataset: MNIST Dataset.
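A minimal usage sketch of the new ``backend`` argument (illustrative only; it
assumes the MNIST files can be downloaded, and the expected shapes follow the
unit tests earlier in this series):

    from paddle.vision.datasets import MNIST
    import paddle.vision.transforms as T

    mnist_pil = MNIST(mode='test')               # default backend is 'pil'
    image, label = mnist_pil[0]                  # PIL.Image, image.size == (28, 28)

    mnist_cv2 = MNIST(mode='test', transform=T.Transpose(), backend='cv2')
    image, label = mnist_cv2[0]                  # numpy.ndarray, shape (1, 28, 28)

With 'pil' the raw sample stays a PIL.Image until a transform converts it;
with 'cv2' the sample comes back as a numpy array directly.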
@@ -62,7 +67,7 @@ class MNIST(Dataset):
 
             for i in range(len(mnist)):
                 sample = mnist[i]
-                print(sample[0].shape, sample[1])
+                print(sample[0].size, sample[1])
 
     """
 
@@ -71,9 +76,19 @@ def __init__(self,
                  label_path=None,
                  mode='train',
                  transform=None,
-                 download=True):
+                 download=True,
+                 backend=None):
         assert mode.lower() in ['train', 'test'], \
             "mode should be 'train' or 'test', but got {}".format(mode)
+
+        if backend is None:
+            backend = paddle.vision.get_image_backend()
+        if backend not in ['pil', 'cv2']:
+            raise ValueError(
+                "Expected backend to be one of ['pil', 'cv2'], but got {}"
+                .format(backend))
+        self.backend = backend
+
         self.mode = mode.lower()
         self.image_path = image_path
         if self.image_path is None:
@@ -145,9 +160,17 @@ def _parse_dataset(self, buffer_size=100):
 
     def __getitem__(self, idx):
         image, label = self.images[idx], self.labels[idx]
-        image = np.reshape(image, [1, 28, 28])
+        image = np.reshape(image, [28, 28])
+
+        if self.backend == 'pil':
+            image = Image.fromarray(image, mode='L')
+
         if self.transform is not None:
             image = self.transform(image)
+
+        if self.backend == 'pil':
+            return image, label.astype('int64')
+
         return image.astype(self.dtype), label.astype('int64')
 
     def __len__(self):
diff --git a/python/paddle/vision/datasets/voc2012.py b/python/paddle/vision/datasets/voc2012.py
index 5fc9d7c38153e..33a3b4e19487d 100644
--- a/python/paddle/vision/datasets/voc2012.py
+++ b/python/paddle/vision/datasets/voc2012.py
@@ -48,6 +48,10 @@ class VOC2012(Dataset):
         mode(str): 'train', 'valid' or 'test' mode. Default 'train'.
         download(bool): whether to download dataset automatically if
             :attr:`data_file` is not set. Default True
+        backend(str, optional): Specifies which type of image to be returned:
+            PIL.Image or numpy.ndarray. Should be one of {'pil', 'cv2'}.
+            If this option is not set, will get backend from ``paddle.vision.get_image_backend``,
+            default backend is 'pil'. Default: None.
Examples: @@ -55,6 +59,7 @@ class VOC2012(Dataset): import paddle from paddle.vision.datasets import VOC2012 + from paddle.vision.transforms import Normalize class SimpleNet(paddle.nn.Layer): def __init__(self): @@ -65,7 +70,10 @@ def forward(self, image, label): paddle.disable_static() - voc2012 = VOC2012(mode='train') + normalize = Normalize(mean=[0.5, 0.5, 0.5], + std=[0.5, 0.5, 0.5], + data_format='HWC') + voc2012 = VOC2012(mode='train', transform=normalize, backend='cv2') for i in range(10): image, label= voc2012[i] @@ -82,9 +90,19 @@ def __init__(self, data_file=None, mode='train', transform=None, - download=True): + download=True, + backend=None): assert mode.lower() in ['train', 'valid', 'test'], \ "mode should be 'train', 'valid' or 'test', but got {}".format(mode) + + if backend is None: + backend = paddle.vision.get_image_backend() + if backend not in ['pil', 'cv2']: + raise ValueError( + "Expected backend are one of ['pil', 'cv2'], but got {}" + .format(backend)) + self.backend = backend + self.flag = MODE_FLAG_MAP[mode.lower()] self.data_file = data_file @@ -126,11 +144,18 @@ def __getitem__(self, idx): label = self.data_tar.extractfile(self.name2mem[label_file]).read() data = Image.open(io.BytesIO(data)) label = Image.open(io.BytesIO(label)) - data = np.array(data) - label = np.array(label) + + if self.backend == 'cv2': + data = np.array(data) + label = np.array(label) + if self.transform is not None: data = self.transform(data) - return data.astype(self.dtype), label.astype(self.dtype) + + if self.backend == 'cv2': + return data.astype(self.dtype), label.astype(self.dtype) + + return data, label def __len__(self): return len(self.data) diff --git a/python/paddle/vision/transforms/transforms.py b/python/paddle/vision/transforms/transforms.py index 06f3f231ef3d2..a24fc888ec679 100644 --- a/python/paddle/vision/transforms/transforms.py +++ b/python/paddle/vision/transforms/transforms.py @@ -686,6 +686,8 @@ def _apply_image(self, img): if F._is_pil_image(img): img = np.asarray(img) + if len(img.shape) == 2: + img = img[..., np.newaxis] return img.transpose(self.order) From 4086f48ea1845f7bd88047a8b3757e00015a9714 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 30 Oct 2020 19:00:51 +0800 Subject: [PATCH 088/185] Check and fix tensor and scalar type promotion (#28299) * check and fix tensor and scalar type promotion * fix else branch error * fix scalar method error * fix test_math_op_path unittest * add future division for unittest * rm useless bin file --- python/paddle/fluid/dygraph/math_op_patch.py | 46 ++- python/paddle/fluid/layers/math_op_patch.py | 43 +- .../tests/unittests/test_math_op_patch.py | 10 +- ...st_tensor_scalar_type_promotion_dynamic.py | 318 +++++++++++++++ ...est_tensor_scalar_type_promotion_static.py | 369 ++++++++++++++++++ 5 files changed, 758 insertions(+), 28 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py create mode 100644 python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py diff --git a/python/paddle/fluid/dygraph/math_op_patch.py b/python/paddle/fluid/dygraph/math_op_patch.py index d1781fdb010e3..203a5e0f86ac5 100644 --- a/python/paddle/fluid/dygraph/math_op_patch.py +++ b/python/paddle/fluid/dygraph/math_op_patch.py @@ -149,28 +149,46 @@ def _binary_creator_(method_name, reverse=False, scalar_method=None): def __impl__(self, other_var): - # tensor and ComplexVariable opetator + # 0. 
check tensor and ComplexVariable operator
         if isinstance(other_var, ComplexVariable):
             # need import paddle in closure
             import paddle
             math_op = getattr(paddle.incubate.complex.tensor, op_type)
             return math_op(self, other_var)
 
-        # FIXME(zjl): elementwise_div between integers cannot be converted to scale,
-        # which may lose accuracy. This is a hot fix for release 1.6.
-        if scalar_method is not None and not (
-                op_type == 'elementwise_div' and
-                self.dtype in _supported_int_dtype_):
-            if isinstance(other_var, float):
-                if self.dtype in _supported_int_dtype_:
-                    assert other_var == int(other_var), \
-                        "float value {} cannot convert to integer".format(other_var)
+        # 1. cases where a scalar is involved
+        # we need to combine the tensor dtype and the scalar dtype, and cast correctly
+        if isinstance(other_var, float):
+            # in all cases (+, -, *, /, **, //, %), we need to cast the tensor to float
+            if self.dtype in _supported_int_dtype_:
+                self = astype(self, 'float32')
+            # here we use `scale` instead of `elementwise` ops to get better performance,
+            # but only +, -, *, / can use this fast path
+            if scalar_method is not None:
                 return scalar_method(self, other_var)
-            elif isinstance(other_var, int):
-                return scalar_method(self, float(other_var))
+        elif isinstance(other_var, int):
+            # in all cases (+, -, *, /, **, //, %), we can cast the scalar to float
+            # because the output dtype depends on the dtype of the input tensor
+            other_var = float(other_var)
+            # division is a special case
+            # NOTE(chenweihang): because we cast the tensor to float32 instead of float64,
+            # the division result is only numerically accurate to about 6 digits
+            # after the decimal point, while the numpy result is of float64 type,
+            # so the result here and the numpy result may differ after the 6th
+            # decimal place. If necessary, we can also use float64 here.
+            # PyTorch's behavior here is consistent with ours.
+            if op_type == 'elementwise_div' and self.dtype in _supported_int_dtype_:
+                self = astype(self, 'float32')
+            # here we use `scale` instead of `elementwise` ops to get better performance,
+            # but only +, -, *, / can use this fast path
+            if scalar_method is not None:
+                return scalar_method(self, other_var)
+        else:
+            # do nothing
+            pass
 
+        # 2. create varbase for scalar
         lhs_dtype = self.dtype
-
         if not isinstance(other_var, core.VarBase):
             if reverse:
                 other_var = create_tensor(
@@ -179,6 +197,7 @@ def __impl__(self, other_var):
                 # add fill_op
                 other_var = create_scalar(value=other_var, dtype=lhs_dtype)
 
+        # 3. unify right var type to left var
         rhs_dtype = other_var.dtype
         if lhs_dtype != rhs_dtype:
             other_var = astype(other_var, lhs_dtype)
@@ -187,6 +206,7 @@ def __impl__(self, other_var):
             self = other_var
             other_var = tmp
 
+        # 4. calculation
         axis = -1
         math_op = getattr(core.ops, op_type)
         return math_op(self, other_var, 'axis', axis)
diff --git a/python/paddle/fluid/layers/math_op_patch.py b/python/paddle/fluid/layers/math_op_patch.py
index 92b58a7e2ee4c..8f5fdf52d95ef 100644
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
@@ -215,21 +215,39 @@ def _binary_creator_(method_name,
                      reverse=False,
                      scalar_method=None):
     def __impl__(self, other_var):
-        # FIXME(zjl): elementwise_div between integers cannot be converted to scale,
-        # which may lose accuracy. This is a hot fix for release 1.6.
-        if scalar_method is not None and not (
-                op_type == 'elementwise_div' and
-                self.dtype in _supported_int_dtype_):
-            if isinstance(other_var, float):
-                if self.dtype in _supported_int_dtype_:
-                    assert other_var == int(other_var), \
-                        "float value {} cannot convert to integer".format(other_var)
+        # 1. cases where a scalar is involved
+        # we need to combine the tensor dtype and the scalar dtype, and cast correctly
+        if isinstance(other_var, float):
+            # in all cases (+, -, *, /, **, //, %), we need to cast the tensor to float
+            if self.dtype in _supported_int_dtype_:
+                self = astype(self, 'float32')
+            # here we use `scale` instead of `elementwise` ops to get better performance,
+            # but only +, -, *, / can use this fast path
+            if scalar_method is not None:
                 return scalar_method(self, other_var)
-            elif isinstance(other_var, int):
-                return scalar_method(self, float(other_var))
+        elif isinstance(other_var, int):
+            # in all cases (+, -, *, /, **, //, %), we can cast the scalar to float
+            # because the output dtype depends on the dtype of the input tensor
+            other_var = float(other_var)
+            # division is a special case
+            # NOTE(chenweihang): because we cast the tensor to float32 instead of float64,
+            # the division result is only numerically accurate to about 6 digits
+            # after the decimal point, while the numpy result is of float64 type,
+            # so the result here and the numpy result may differ after the 6th
+            # decimal place. If necessary, we can also use float64 here.
+            # PyTorch's behavior here is consistent with ours.
+            if op_type == 'elementwise_div' and self.dtype in _supported_int_dtype_:
+                self = astype(self, 'float32')
+            # here we use `scale` instead of `elementwise` ops to get better performance,
+            # but only +, -, *, / can use this fast path
+            if scalar_method is not None:
+                return scalar_method(self, other_var)
+        else:
+            # do nothing
+            pass
 
+        # 2. create variable for scalar
         lhs_dtype = safe_get_dtype(self)
-
         if not isinstance(other_var, Variable):
             if reverse:
                 has_batch_size = False
@@ -251,6 +269,7 @@ def __impl__(self, other_var):
                 other_var = create_scalar(
                     current_block(self), value=other_var, dtype=lhs_dtype)
 
+        # 3. unify right var type to left var
         rhs_dtype = safe_get_dtype(other_var)
         if lhs_dtype != rhs_dtype:
             other_var = astype(other_var, lhs_dtype)
diff --git a/python/paddle/fluid/tests/unittests/test_math_op_patch.py b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
index f6eff22d6ce5f..76e371b216778 100644
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
@@ -12,15 +12,19 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
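+# A short sketch of the promotion rules implemented by the patch above
+# (illustrative only; `t` is assumed to be paddle.ones([2], dtype='int64')):
+#
+#   t + 1    ->  int64 tensor    (an int scalar keeps the tensor dtype)
+#   t + 1.5  ->  float32 tensor  (the tensor is cast to float32 first)
+#   t / 2    ->  float32 tensor  (true division always promotes to float32)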
-from __future__ import print_function +from __future__ import print_function, division import unittest from decorator_helper import prog_scope +import paddle import paddle.fluid as fluid import numpy class TestMathOpPatches(unittest.TestCase): + def setUp(self): + paddle.enable_static() + @prog_scope() def test_add_scalar(self): a = fluid.layers.data(name="a", shape=[1]) @@ -197,8 +201,8 @@ def test_integer_div(self): feed={"a": a_np}, fetch_list=[b]) - b_np_actual = (a_np / 7).astype('int64') - self.assertTrue(numpy.array_equal(b_np, b_np_actual)) + b_np_actual = (a_np / 7).astype('float32') + self.assertTrue(numpy.allclose(b_np, b_np_actual)) @prog_scope() def test_equal(self): diff --git a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py new file mode 100644 index 0000000000000..5f2dfbdd99e16 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_dynamic.py @@ -0,0 +1,318 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division + +import unittest +import numpy as np + +import paddle + +# Support types are ref from `paddle.tensor.math` +# - Related paddle dtypes: +# - int type: int64, (no test here: uint8, int8, int16, int32) +# - float type: float32, (no test here: float64) +# - Python scalar dtypes: +# - int(64) +# - float(64) + + +class TestTensorScalarTypePromotionDynamic(unittest.TestCase): + def check_operation(self, a, b, c, op): + if op == '+': + c_rlt = a + b + elif op == '-': + c_rlt = a - b + elif op == '*': + c_rlt = a * b + elif op == '/': + c_rlt = a / b + elif op == '**': + c_rlt = a**b + elif op == '//': + c_rlt = a // b + elif op == '%': + c_rlt = a % b + else: + raise ValueError("Unsupported operation.") + + self.assertEqual(c_rlt.dtype, c.dtype) + self.assertTrue(np.array_equal(c_rlt.numpy(), c.numpy())) + + def test_tensor_add_scalar(self): + # tensor(int64) + scalar(int) + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1 + c = paddle.full([2, 2, 2], 2, dtype="int64") + self.check_operation(a, b, c, '+') + + # tensor(float32) + scalar(int) + a = paddle.ones([2, 2, 2], dtype='float32') + b = 1 + c = paddle.full([2, 2, 2], 2, dtype="float32") + self.check_operation(a, b, c, '+') + + # tensor(int64) + scalar(float, .0) + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1.0 + c = paddle.full([2, 2, 2], 2, dtype="float32") + self.check_operation(a, b, c, '+') + + # tensor(int64) + scalar(float, .5) + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1.5 + c = paddle.full([2, 2, 2], 2.5, dtype="float32") + self.check_operation(a, b, c, '+') + + # tensor(float32) + scalar(float) + a = paddle.ones([2, 2, 2], dtype='float32') + b = 1.5 + c = paddle.full([2, 2, 2], 2.5, dtype="float32") + self.check_operation(a, b, c, '+') + + def test_tensor_sub_scalar(self): + # tensor(int64) - scalar(int) + a = paddle.ones([2, 2, 2], 
dtype='int64') + b = 1 + c = paddle.zeros([2, 2, 2], dtype="int64") + self.check_operation(a, b, c, '-') + + # tensor(float32) - scalar(int) + a = paddle.ones([2, 2, 2], dtype='float32') + b = 1 + c = paddle.zeros([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '-') + + # tensor(int64) - scalar(float, .0) + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1.0 + c = paddle.zeros([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '-') + + # tensor(int64) - scalar(float, .5) + a = paddle.full([2, 2, 2], 2, dtype='int64') + b = 1.5 + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '-') + + # tensor(float32) - scalar(float) + a = paddle.full([2, 2, 2], 2, dtype='float32') + b = 1.5 + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '-') + + def test_scalar_sub_tensor(self): + # scalar(int) - tensor(int64) + a = 1 + b = paddle.ones([2, 2, 2], dtype='int64') + c = paddle.zeros([2, 2, 2], dtype="int64") + self.check_operation(a, b, c, '-') + + # scalar(int) - tensor(float32) + a = 1 + b = paddle.ones([2, 2, 2], dtype='float32') + c = paddle.zeros([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '-') + + # scalar(float, .0) - tensor(int64) + a = 1.0 + b = paddle.ones([2, 2, 2], dtype='int64') + c = paddle.zeros([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '-') + + # scalar(float, .5) - tensor(int64) + a = 1.5 + b = paddle.full([2, 2, 2], 2, dtype='int64') + c = paddle.full([2, 2, 2], -0.5, dtype="float32") + self.check_operation(a, b, c, '-') + + # scalar(float) - tensor(float32) + a = 1.5 + b = paddle.full([2, 2, 2], 2, dtype='float32') + c = paddle.full([2, 2, 2], -0.5, dtype="float32") + self.check_operation(a, b, c, '-') + + def test_tensor_mul_tensor(self): + # tensor(int64) * scalar(int) + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1 + c = paddle.ones([2, 2, 2], dtype="int64") + self.check_operation(a, b, c, '*') + + # tensor(float32) * scalar(int) + a = paddle.ones([2, 2, 2], dtype='float32') + b = 1 + c = paddle.ones([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '*') + + # tensor(int64) * scalar(float, .0) + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1.0 + c = paddle.ones([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '*') + + # tensor(int64) * scalar(float, .5) + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1.5 + c = paddle.full([2, 2, 2], 1.5, dtype="float32") + self.check_operation(a, b, c, '*') + + # tensor(float32) * scalar(float) + a = paddle.ones([2, 2, 2], dtype='float32') + b = 1.5 + c = paddle.full([2, 2, 2], 1.5, dtype="float32") + self.check_operation(a, b, c, '*') + + def test_tensor_div_scalar(self): + # tensor(int64) / scalar(int) + a = paddle.ones([2, 2, 2], dtype='int64') + b = 2 + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '/') + + # tensor(float32) / scalar(int) + a = paddle.ones([2, 2, 2], dtype='float32') + b = 2 + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '/') + + # tensor(int64) / scalar(float, .0) + a = paddle.ones([2, 2, 2], dtype='int64') + b = 2.0 + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '/') + + # tensor(int64) / scalar(float, .5) + a = paddle.ones([2, 2, 2], dtype='int64') + b = 0.5 + c = paddle.full([2, 2, 2], 2, dtype="float32") + self.check_operation(a, b, c, '/') + + # tensor(float32) / scalar(float) + a = paddle.ones([2, 2, 2], dtype='float32') + b = 0.5 + c = paddle.full([2, 2, 2], 2, 
dtype="float32") + self.check_operation(a, b, c, '/') + + def test_scalar_div_tensor(self): + # scalar(int) / tensor(int64) + a = 1 + b = paddle.full([2, 2, 2], 2, dtype='int64') + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '/') + + # scalar(int) / tensor(float32) + a = 1 + b = paddle.full([2, 2, 2], 0.5, dtype='float32') + c = paddle.full([2, 2, 2], 2, dtype="float32") + self.check_operation(a, b, c, '/') + + # scalar(float) / tensor(int64) + a = 1.0 + b = paddle.full([2, 2, 2], 2, dtype='int64') + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '/') + + # scalar(float) / tensor(float32) + a = 1.0 + b = paddle.full([2, 2, 2], 0.5, dtype='float32') + c = paddle.full([2, 2, 2], 2, dtype="float32") + self.check_operation(a, b, c, '/') + + def test_tensor_pow_scalar(self): + # tensor(int64) ** scalar(int) + a = paddle.full([2, 2, 2], 2, dtype='int64') + b = 3 + c = paddle.full([2, 2, 2], 8, dtype="int64") + self.check_operation(a, b, c, '**') + + # tensor(int64) ** scalar(float) + a = paddle.full([2, 2, 2], 2, dtype='int64') + b = 3.0 + c = paddle.full([2, 2, 2], 8, dtype="float32") + self.check_operation(a, b, c, '**') + + # tensor(float32) ** scalar(int) + a = paddle.full([2, 2, 2], 2, dtype='float32') + b = 3 + c = paddle.full([2, 2, 2], 8, dtype="float32") + self.check_operation(a, b, c, '**') + + # tensor(float32) ** scalar(float) + a = paddle.full([2, 2, 2], 2, dtype='float32') + b = 3.0 + c = paddle.full([2, 2, 2], 8, dtype="float32") + self.check_operation(a, b, c, '**') + + def test_scalar_pow_tensor(self): + # scalar(int) ** tensor(int64) + a = 3 + b = paddle.full([2, 2, 2], 2, dtype='int64') + c = paddle.full([2, 2, 2], 9, dtype="int64") + self.check_operation(a, b, c, '**') + + # scalar(float) ** tensor(int64) + a = 3.0 + b = paddle.full([2, 2, 2], 2, dtype='int64') + c = paddle.full([2, 2, 2], 9, dtype="float32") + self.check_operation(a, b, c, '**') + + # scalar(int) ** tensor(float32) + a = 3 + b = paddle.full([2, 2, 2], 2, dtype='float32') + c = paddle.full([2, 2, 2], 9, dtype="float32") + self.check_operation(a, b, c, '**') + + # tensor(float32) ** scalar(float) + a = 3.0 + b = paddle.full([2, 2, 2], 2, dtype='float32') + c = paddle.full([2, 2, 2], 9, dtype="float32") + self.check_operation(a, b, c, '**') + + ## TODO: floordiv op kernel doesn't support float + def test_tensor_floordiv_scalar(self): + # tensor(int64) // scalar(int) + a = paddle.full([2, 2, 2], 3, dtype='int64') + b = 2 + c = paddle.full([2, 2, 2], 1, dtype="int64") + self.check_operation(a, b, c, '//') + + def test_tensor_mod_scalar(self): + # tensor(int64) % scalar(int) + a = paddle.full([2, 2, 2], 3, dtype='int64') + b = 2 + c = paddle.full([2, 2, 2], 1, dtype="int64") + self.check_operation(a, b, c, '%') + + # tensor(int64) % scalar(float) + a = paddle.full([2, 2, 2], 3, dtype='int64') + b = 2.0 + c = paddle.full([2, 2, 2], 1, dtype="float32") + self.check_operation(a, b, c, '%') + + # tensor(float32) % scalar(int) + a = paddle.full([2, 2, 2], 3, dtype='float32') + b = 2 + c = paddle.full([2, 2, 2], 1, dtype="float32") + self.check_operation(a, b, c, '%') + + # tensor(float32) % scalar(float) + a = paddle.full([2, 2, 2], 3, dtype='float32') + b = 2.0 + c = paddle.full([2, 2, 2], 1, dtype="float32") + self.check_operation(a, b, c, '%') + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py 
b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py new file mode 100644 index 0000000000000..d697666e12ddd --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_tensor_scalar_type_promotion_static.py @@ -0,0 +1,369 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function, division + +import unittest +import numpy as np + +import paddle +from paddle.static import program_guard +from paddle.static import Program + +# Support types are ref from `paddle.tensor.math` +# - Related paddle dtypes: +# - int type: int64, (no test here: uint8, int8, int16, int32) +# - float type: float32, (no test here: float64) +# - Python scalar dtypes: +# - int(64) +# - float(64) + + +class TestTensorScalarTypePromotionStatic(unittest.TestCase): + def setUp(self): + paddle.enable_static() + + def check_operation(self, a, b, c, op): + exe = paddle.static.Executor() + + if op == '+': + c_rlt = a + b + elif op == '-': + c_rlt = a - b + elif op == '*': + c_rlt = a * b + elif op == '/': + c_rlt = a / b + elif op == '**': + c_rlt = a**b + elif op == '//': + c_rlt = a // b + elif op == '%': + c_rlt = a % b + else: + raise ValueError("Unsupported operation.") + + rlt = exe.run(fetch_list=[c_rlt.name, c.name]) + + self.assertEqual(rlt[0].dtype, rlt[1].dtype) + self.assertTrue(np.array_equal(rlt[0], rlt[1])) + + def test_tensor_add_scalar(self): + # tensor(int64) + scalar(int) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1 + c = paddle.full([2, 2, 2], 2, dtype="int64") + self.check_operation(a, b, c, '+') + + # tensor(float32) + scalar(int) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='float32') + b = 1 + c = paddle.full([2, 2, 2], 2, dtype="float32") + self.check_operation(a, b, c, '+') + + # tensor(int64) + scalar(float, .0) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1.0 + c = paddle.full([2, 2, 2], 2, dtype="float32") + self.check_operation(a, b, c, '+') + + # tensor(int64) + scalar(float, .5) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1.5 + c = paddle.full([2, 2, 2], 2.5, dtype="float32") + self.check_operation(a, b, c, '+') + + # tensor(float32) + scalar(float) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='float32') + b = 1.5 + c = paddle.full([2, 2, 2], 2.5, dtype="float32") + self.check_operation(a, b, c, '+') + + def test_tensor_sub_scalar(self): + # tensor(int64) - scalar(int) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1 + c = paddle.zeros([2, 2, 2], dtype="int64") + self.check_operation(a, b, c, '-') + + # tensor(float32) - scalar(int) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='float32') + b = 1 + c = paddle.zeros([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '-') + + # tensor(int64) - scalar(float, .0) + with program_guard(Program()): + a = 
paddle.ones([2, 2, 2], dtype='int64') + b = 1.0 + c = paddle.zeros([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '-') + + # tensor(int64) - scalar(float, .5) + with program_guard(Program()): + a = paddle.full([2, 2, 2], 2, dtype='int64') + b = 1.5 + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '-') + + # tensor(float32) - scalar(float) + with program_guard(Program()): + a = paddle.full([2, 2, 2], 2, dtype='float32') + b = 1.5 + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '-') + + def test_scalar_sub_tensor(self): + # scalar(int) - tensor(int64) + with program_guard(Program()): + a = 1 + b = paddle.ones([2, 2, 2], dtype='int64') + c = paddle.zeros([2, 2, 2], dtype="int64") + self.check_operation(a, b, c, '-') + + # scalar(int) - tensor(float32) + with program_guard(Program()): + a = 1 + b = paddle.ones([2, 2, 2], dtype='float32') + c = paddle.zeros([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '-') + + # scalar(float, .0) - tensor(int64) + with program_guard(Program()): + a = 1.0 + b = paddle.ones([2, 2, 2], dtype='int64') + c = paddle.zeros([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '-') + + # scalar(float, .5) - tensor(int64) + with program_guard(Program()): + a = 1.5 + b = paddle.full([2, 2, 2], 2, dtype='int64') + c = paddle.full([2, 2, 2], -0.5, dtype="float32") + self.check_operation(a, b, c, '-') + + # scalar(float) - tensor(float32) + with program_guard(Program()): + a = 1.5 + b = paddle.full([2, 2, 2], 2, dtype='float32') + c = paddle.full([2, 2, 2], -0.5, dtype="float32") + self.check_operation(a, b, c, '-') + + def test_tensor_mul_tensor(self): + # tensor(int64) * scalar(int) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1 + c = paddle.ones([2, 2, 2], dtype="int64") + self.check_operation(a, b, c, '*') + + # tensor(float32) * scalar(int) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='float32') + b = 1 + c = paddle.ones([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '*') + + # tensor(int64) * scalar(float, .0) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1.0 + c = paddle.ones([2, 2, 2], dtype="float32") + self.check_operation(a, b, c, '*') + + # tensor(int64) * scalar(float, .5) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='int64') + b = 1.5 + c = paddle.full([2, 2, 2], 1.5, dtype="float32") + self.check_operation(a, b, c, '*') + + # tensor(float32) * scalar(float) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='float32') + b = 1.5 + c = paddle.full([2, 2, 2], 1.5, dtype="float32") + self.check_operation(a, b, c, '*') + + def test_tensor_div_scalar(self): + # tensor(int64) / scalar(int) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='int64') + b = 2 + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '/') + + # tensor(float32) / scalar(int) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='float32') + b = 2 + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '/') + + # tensor(int64) / scalar(float, .0) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='int64') + b = 2.0 + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '/') + + # tensor(int64) / scalar(float, .5) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='int64') + b = 0.5 + c = 
paddle.full([2, 2, 2], 2, dtype="float32") + self.check_operation(a, b, c, '/') + + # tensor(float32) / scalar(float) + with program_guard(Program()): + a = paddle.ones([2, 2, 2], dtype='float32') + b = 0.5 + c = paddle.full([2, 2, 2], 2, dtype="float32") + self.check_operation(a, b, c, '/') + + def test_scalar_div_tensor(self): + # scalar(int) / tensor(int64) + with program_guard(Program()): + a = 1 + b = paddle.full([2, 2, 2], 2, dtype='int64') + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '/') + + # scalar(int) / tensor(float32) + with program_guard(Program()): + a = 1 + b = paddle.full([2, 2, 2], 0.5, dtype='float32') + c = paddle.full([2, 2, 2], 2, dtype="float32") + self.check_operation(a, b, c, '/') + + # scalar(float) / tensor(int64) + with program_guard(Program()): + a = 1.0 + b = paddle.full([2, 2, 2], 2, dtype='int64') + c = paddle.full([2, 2, 2], 0.5, dtype="float32") + self.check_operation(a, b, c, '/') + + # scalar(float) / tensor(float32) + with program_guard(Program()): + a = 1.0 + b = paddle.full([2, 2, 2], 0.5, dtype='float32') + c = paddle.full([2, 2, 2], 2, dtype="float32") + self.check_operation(a, b, c, '/') + + def test_tensor_pow_scalar(self): + # tensor(int64) ** scalar(int) + with program_guard(Program()): + a = paddle.full([2, 2, 2], 2, dtype='int64') + b = 3 + c = paddle.full([2, 2, 2], 8, dtype="int64") + self.check_operation(a, b, c, '**') + + # tensor(int64) ** scalar(float) + with program_guard(Program()): + a = paddle.full([2, 2, 2], 2, dtype='int64') + b = 3.0 + c = paddle.full([2, 2, 2], 8, dtype="float32") + self.check_operation(a, b, c, '**') + + # tensor(float32) ** scalar(int) + with program_guard(Program()): + a = paddle.full([2, 2, 2], 2, dtype='float32') + b = 3 + c = paddle.full([2, 2, 2], 8, dtype="float32") + self.check_operation(a, b, c, '**') + + # tensor(float32) ** scalar(float) + with program_guard(Program()): + a = paddle.full([2, 2, 2], 2, dtype='float32') + b = 3.0 + c = paddle.full([2, 2, 2], 8, dtype="float32") + self.check_operation(a, b, c, '**') + + def test_scalar_pow_tensor(self): + # scalar(int) ** tensor(int64) + with program_guard(Program()): + a = 3 + b = paddle.full([2, 2, 2], 2, dtype='int64') + c = paddle.full([2, 2, 2], 9, dtype="int64") + self.check_operation(a, b, c, '**') + + # scalar(float) ** tensor(int64) + with program_guard(Program()): + a = 3.0 + b = paddle.full([2, 2, 2], 2, dtype='int64') + c = paddle.full([2, 2, 2], 9, dtype="float32") + self.check_operation(a, b, c, '**') + + # scalar(int) ** tensor(float32) + with program_guard(Program()): + a = 3 + b = paddle.full([2, 2, 2], 2, dtype='float32') + c = paddle.full([2, 2, 2], 9, dtype="float32") + self.check_operation(a, b, c, '**') + + # tensor(float32) ** scalar(float) + with program_guard(Program()): + a = 3.0 + b = paddle.full([2, 2, 2], 2, dtype='float32') + c = paddle.full([2, 2, 2], 9, dtype="float32") + self.check_operation(a, b, c, '**') + + # ## TODO: floordiv op kernel doesn't support float + def test_tensor_floordiv_scalar(self): + # tensor(int64) // scalar(int) + with program_guard(Program()): + a = paddle.full([2, 2, 2], 3, dtype='int64') + b = 2 + c = paddle.full([2, 2, 2], 1, dtype="int64") + self.check_operation(a, b, c, '//') + + def test_tensor_mod_scalar(self): + # tensor(int64) % scalar(int) + with program_guard(Program()): + a = paddle.full([2, 2, 2], 3, dtype='int64') + b = 2 + c = paddle.full([2, 2, 2], 1, dtype="int64") + self.check_operation(a, b, c, '%') + + # tensor(int64) % scalar(float) + with 
program_guard(Program()): + a = paddle.full([2, 2, 2], 3, dtype='int64') + b = 2.0 + c = paddle.full([2, 2, 2], 1, dtype="float32") + self.check_operation(a, b, c, '%') + + # tensor(float32) % scalar(int) + with program_guard(Program()): + a = paddle.full([2, 2, 2], 3, dtype='float32') + b = 2 + c = paddle.full([2, 2, 2], 1, dtype="float32") + self.check_operation(a, b, c, '%') + + # tensor(float32) % scalar(float) + with program_guard(Program()): + a = paddle.full([2, 2, 2], 3, dtype='float32') + b = 2.0 + c = paddle.full([2, 2, 2], 1, dtype="float32") + self.check_operation(a, b, c, '%') + + +if __name__ == '__main__': + unittest.main() From 18c86fb2fb209c0dfcc785c253457cf0ed4790b8 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Fri, 30 Oct 2020 06:03:11 -0500 Subject: [PATCH 089/185] hide some logs of p2p (#28307) --- paddle/fluid/platform/init.cc | 5 +++-- python/paddle/fluid/__init__.py | 3 +++ 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 2e708e44fd0e4..ba4520b1388e6 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include // for strdup + #include #include #include @@ -105,8 +106,8 @@ void InitP2P(std::vector devices) { PADDLE_ENFORCE_CUDA_SUCCESS( cudaDeviceCanAccessPeer(&can_acess, devices[i], devices[j])); if (can_acess != 1) { - LOG(WARNING) << "Cannot enable P2P access from " << devices[i] - << " to " << devices[j]; + VLOG(2) << "Cannot enable P2P access from " << devices[i] << " to " + << devices[j]; } else { platform::CUDADeviceGuard guard(devices[i]); cudaDeviceEnablePeerAccess(devices[j], 0); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index c8ba7e829e93c..74b56b842cf96 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -152,6 +152,9 @@ def __bootstrap__(): import platform from . import core + # NOTE(zhiqiu): When (1)numpy < 1.19; (2) python < 3.7, + # unittest is always imported in numpy (maybe some versions not). + # so is_test is True and p2p is not inited. 
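+    # With such environments the check below is True even outside unit tests,
+    # so P2P initialization is skipped in those cases as well.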
in_test = 'unittest' in sys.modules try: From d9b5f1261cb470f07cfa726f9e2edc0e63b7e493 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Fri, 30 Oct 2020 19:55:43 +0800 Subject: [PATCH 090/185] update the version of pybind, test=develop (#28284) * update version pybind to v2.4.3, test=develop * update unittests, test=develop --- cmake/external/pybind11.cmake | 3 +-- paddle/fluid/pybind/tensor_py.h | 4 +-- python/paddle/fluid/executor.py | 7 ++++- .../test_feed_data_check_shape_type.py | 10 ++----- .../fluid/tests/unittests/test_var_base.py | 27 +++---------------- 5 files changed, 14 insertions(+), 37 deletions(-) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 3a0b3676db36e..8722b9003b7ef 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -17,7 +17,7 @@ include(ExternalProject) set(PYBIND_PREFIX_DIR ${THIRD_PARTY_PATH}/pybind) set(PYBIND_SOURCE_DIR ${THIRD_PARTY_PATH}/pybind/src/extern_pybind) SET(PYBIND_REPOSITORY ${GIT_URL}/pybind/pybind11.git) -SET(PYBIND_TAG v2.2.4) +SET(PYBIND_TAG v2.4.3) cache_third_party(extern_pybind REPOSITORY ${PYBIND_REPOSITORY} @@ -34,7 +34,6 @@ ExternalProject_Add( "${PYBIND_DOWNLOAD_CMD}" PREFIX ${PYBIND_PREFIX_DIR} SOURCE_DIR ${PYBIND_SOURCE_DIR} - UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index 142ab2bb9d790..012f624f67bbb 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -58,7 +58,7 @@ struct npy_format_descriptor { // https://docs.python.org/3/library/struct.html#format-characters. return "e"; } - static PYBIND11_DESCR name() { return _("float16"); } + static constexpr auto name = _("float16"); }; // Note: Since bfloat16 is not a builtin type in C++ and in numpy, @@ -75,7 +75,7 @@ struct npy_format_descriptor { // https://docs.python.org/3/library/struct.html#format-characters. return "H"; } - static PYBIND11_DESCR name() { return _("bfloat16"); } + static constexpr auto name = _("bfloat16"); }; } // namespace detail diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index be72b4158c317..90851e6d864c2 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -1261,6 +1261,11 @@ def _run_program(self, program, feed, fetch_list, feed_var_name, "Executor requires Program as its Parameter. But you passed in %s" % (type(program))) + if not isinstance(fetch_var_name, str): + raise TypeError( + "The name of fetch variable requires string as its Parameter. 
But you passed in %s" + % (type(fetch_var_name))) + if use_program_cache: cache_key = _get_strong_program_cache_key(program, feed, fetch_list) cached_program = self._get_program_cache(cache_key) @@ -1311,7 +1316,7 @@ def _run_program(self, program, feed, fetch_list, feed_var_name, if not use_program_cache: self._default_executor.run(program.desc, scope, 0, True, True, - fetch_var_name) + [fetch_var_name]) else: self._default_executor.run_prepared_ctx(ctx, scope, False, False, False) diff --git a/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py b/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py index 4d7fc69058ded..3bbc4cc2904b8 100644 --- a/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py +++ b/python/paddle/fluid/tests/unittests/test_feed_data_check_shape_type.py @@ -86,14 +86,8 @@ def test(self): self._test_feed_lod_tensor(use_cuda, use_parallel_executor) # Test exception message when feeding with error - if six.PY2: - in_shape_tuple = (long(-1), long(3), long(4), long(8)) - error_shape_list = [ - long(self.data_batch_size), long(3), long(4), long(5) - ] - else: - in_shape_tuple = (-1, 3, 4, 8) - error_shape_list = [self.data_batch_size, 3, 4, 5] + in_shape_tuple = (-1, 3, 4, 8) + error_shape_list = [self.data_batch_size, 3, 4, 5] with self.assertRaises(ValueError) as shape_mismatch_err: self._test_feed_data_shape_mismatch(use_cuda, diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 42fd2de864d08..41aef68db624d 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -430,18 +430,7 @@ def test_tensor_str(self): paddle.set_printoptions(4, 100, 3) a_str = str(a) - if six.PY2: - expected = '''Tensor(shape=[10L, 20L], dtype=float32, place=CPUPlace, stop_gradient=True, - [[0.2727, 0.5489, 0.8655, ..., 0.2916, 0.8525, 0.9000], - [0.3806, 0.8996, 0.0928, ..., 0.9535, 0.8378, 0.6409], - [0.1484, 0.4038, 0.8294, ..., 0.0148, 0.6520, 0.4250], - ..., - [0.3426, 0.1909, 0.7240, ..., 0.4218, 0.2676, 0.5679], - [0.5561, 0.2081, 0.0676, ..., 0.9778, 0.3302, 0.9559], - [0.2665, 0.8483, 0.5389, ..., 0.4956, 0.6862, 0.9178]])''' - - else: - expected = '''Tensor(shape=[10, 20], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[10, 20], dtype=float32, place=CPUPlace, stop_gradient=True, [[0.2727, 0.5489, 0.8655, ..., 0.2916, 0.8525, 0.9000], [0.3806, 0.8996, 0.0928, ..., 0.9535, 0.8378, 0.6409], [0.1484, 0.4038, 0.8294, ..., 0.0148, 0.6520, 0.4250], @@ -458,12 +447,7 @@ def test_tensor_str2(self): a = paddle.to_tensor([[1.5111111, 1.0], [0, 0]]) a_str = str(a) - if six.PY2: - expected = '''Tensor(shape=[2L, 2L], dtype=float32, place=CPUPlace, stop_gradient=True, - [[1.5111, 1. ], - [0. , 0. ]])''' - else: - expected = '''Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=True, [[1.5111, 1. ], [0. , 0. ]])''' @@ -475,12 +459,7 @@ def test_tensor_str3(self): a = paddle.to_tensor([[-1.5111111, 1.0], [0, -0.5]]) a_str = str(a) - if six.PY2: - expected = '''Tensor(shape=[2L, 2L], dtype=float32, place=CPUPlace, stop_gradient=True, - [[-1.5111, 1. ], - [ 0. , -0.5000]])''' - else: - expected = '''Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=True, + expected = '''Tensor(shape=[2, 2], dtype=float32, place=CPUPlace, stop_gradient=True, [[-1.5111, 1. 
], [ 0. , -0.5000]])''' From 57e4411ab59e3796fb8dfe8406a2a0932037694c Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Mon, 2 Nov 2020 09:52:44 +0800 Subject: [PATCH 091/185] [Dy2stat] Support to modify value of buffer tensor (#28328) * [Dy2stat] Support to modify value of buffer tensor * remove "defaultTest" * fix name confliction --- python/paddle/fluid/dygraph/layers.py | 15 +++++-- .../fluid/tests/unittests/test_base_layer.py | 43 +++++++++++++++++++ 2 files changed, 54 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/dygraph/layers.py b/python/paddle/fluid/dygraph/layers.py index 6fa531c573daa..10786c662072c 100644 --- a/python/paddle/fluid/dygraph/layers.py +++ b/python/paddle/fluid/dygraph/layers.py @@ -1023,13 +1023,20 @@ def _remove_if_exist(*dicts): self._non_persistable_buffer_names_set.add(name) _buffers[name] = value elif _buffers is not None and name in _buffers: - if value is not None: + # Note(Aurelius84): In Dy2stat, the value of the Buffer may be modified in + # decorated function, such as `self.buffer = new_tensor`. So we update its + # value via `assign`. + if type(value) == framework.Variable: + from paddle import assign + assign(value, _buffers[name]) + elif value is not None: raise TypeError( "assignment to buffers '{}' should be of type core.VarBase or None, but got '{}'" .format(name, type(value).__name__)) - # Assigning None will remove the buffer, but if re-assign a new varBase to it, - # it will be remarked as a buffer with same `persistable` attribute. - _buffers[name] = None + else: + # Assigning None will remove the buffer, but if re-assign a new varBase to it, + # it will be remarked as a buffer with same `persistable` attribute. + _buffers[name] = None else: object.__setattr__(self, name, value) diff --git a/python/paddle/fluid/tests/unittests/test_base_layer.py b/python/paddle/fluid/tests/unittests/test_base_layer.py index 875f6211a7fbd..31879dae0dad0 100644 --- a/python/paddle/fluid/tests/unittests/test_base_layer.py +++ b/python/paddle/fluid/tests/unittests/test_base_layer.py @@ -15,9 +15,11 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid from paddle.fluid.dygraph import to_variable from paddle.fluid.framework import ParamBase +from paddle.jit import ProgramTranslator class L1(fluid.Layer): @@ -288,5 +290,46 @@ def assert_var_base_equal(self, var1, var2): self.assertTrue(np.array_equal(var1.numpy(), var2.numpy())) +class BufferNetWithModification(paddle.nn.Layer): + def __init__(self, shape): + super(BufferNetWithModification, self).__init__() + + self.buffer1 = paddle.zeros(shape, 'int32') + self.buffer2 = paddle.zeros(shape, 'int32') + + @paddle.jit.to_static + def forward(self, x): + self.buffer1 += x + self.buffer2 = self.buffer1 + x + + out = self.buffer1 + self.buffer2 + + return out + + +class TestModifiedBuffer(unittest.TestCase): + def setUp(self): + paddle.disable_static() + self.prog_trans = ProgramTranslator() + self.shape = [10, 16] + + def _run(self, to_static=False): + self.prog_trans.enable(to_static) + + x = paddle.ones([1], 'int32') + net = BufferNetWithModification(self.shape) + out = net(x) + + return out, net.buffer1, net.buffer2 + + def test_modified(self): + dy_outs = self._run(False) + st_outs = self._run(True) + + for i in range(len(dy_outs)): + self.assertTrue( + np.array_equal(dy_outs[i].numpy(), st_outs[i].numpy())) + + if __name__ == '__main__': unittest.main() From b96869bc31edf0d8f81e722f1db2699603f95350 Mon Sep 17 00:00:00 2001 From: Guo Sheng Date: Mon, 2 Nov 2020 14:21:49 
+0800 Subject: [PATCH 092/185] Fix lr setting of AdamW when lr is an instance of LRScheduler (#28300) * Fix lr setting of AdamW when lr is an instance of LRScheduler. test=develop * Fix static graph test mode in test_adamw_op.py. test=develop --- .../fluid/tests/unittests/test_adamw_op.py | 18 ++++++++++++++++++ python/paddle/optimizer/adamw.py | 4 ++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_adamw_op.py b/python/paddle/fluid/tests/unittests/test_adamw_op.py index b799508f6b8d5..f5399a3aaab5b 100644 --- a/python/paddle/fluid/tests/unittests/test_adamw_op.py +++ b/python/paddle/fluid/tests/unittests/test_adamw_op.py @@ -47,6 +47,7 @@ def test_adamw_op_coverage(self): assert (adam.__str__() is not None) def test_adamw_op(self): + paddle.enable_static() place = fluid.CPUPlace() shape = [2, 3, 8, 8] exe = fluid.Executor(place) @@ -75,6 +76,7 @@ def test_adamw_op(self): data_np = np.random.random(shape).astype('float32') rets = exe.run(train_prog, feed={"data": data_np}, fetch_list=[loss]) assert rets[0] is not None + paddle.disable_static() def test_adamw_op_invalid_input(self): paddle.disable_static() @@ -89,6 +91,22 @@ def test_adamw_op_invalid_input(self): adam = paddle.optimizer.AdamW( 0.1, epsilon=-1, parameters=linear.parameters()) + def test_adamw_lr_decay(self): + paddle.disable_static() + value = np.arange(26).reshape(2, 13).astype("float32") + a = paddle.to_tensor(value) + linear = paddle.nn.Linear(13, 5) + adam = paddle.optimizer.AdamW( + learning_rate=paddle.optimizer.lr.NoamDecay( + d_model=512, warmup_steps=4000), + parameters=linear.parameters(), + apply_decay_param_fun=lambda name: True, + weight_decay=0.01) + out = linear(a) + out.backward() + adam.step() + adam.clear_gradients() + if __name__ == "__main__": unittest.main() diff --git a/python/paddle/optimizer/adamw.py b/python/paddle/optimizer/adamw.py index eaa0509029459..2cf3881d04676 100644 --- a/python/paddle/optimizer/adamw.py +++ b/python/paddle/optimizer/adamw.py @@ -57,7 +57,7 @@ class AdamW(Adam): The default value is 1e-08. weight_decay (float|Tensor, optional): The weight decay coefficient, it can be float or Tensor. The default value is 0.01. apply_decay_param_fun (function|None, optional): If it is not None, - only tensors that makes apply_decay_param_fun(Tensor)==True + only tensors that makes apply_decay_param_fun(Tensor.name)==True will be updated. It only works when we want to specify tensors. Default: None. 
grad_clip (GradientClipBase, optional): Gradient cliping strategy, it's an instance of @@ -168,7 +168,7 @@ def _scale_parameters(self, params_and_grads): if isinstance(self._learning_rate, float): learning_rate = self._learning_rate else: - self._learning_rate() + learning_rate = self._learning_rate() with param.block.program._optimized_guard( [param, grad]), framework.name_scope('weight decay'): if param.name not in self._params_name: From 5262b0258534c17833f3cf826921af21b4349190 Mon Sep 17 00:00:00 2001 From: wangguanzhong Date: Mon, 2 Nov 2020 14:53:07 +0800 Subject: [PATCH 093/185] add generate_proposals_v2 op (#28214) * add generate_proposals_v2 op --- .../fluid/operators/detection/CMakeLists.txt | 2 + .../fluid/operators/detection/bbox_util.cu.h | 285 ++++++++++++++++ paddle/fluid/operators/detection/bbox_util.h | 142 +++++++- .../detection/generate_proposals_op.cc | 237 +------------ .../detection/generate_proposals_op.cu | 256 +------------- .../detection/generate_proposals_v2_op.cc | 314 ++++++++++++++++++ .../detection/generate_proposals_v2_op.cu | 229 +++++++++++++ paddle/fluid/operators/detection/nms_util.h | 69 ++++ paddle/fluid/pybind/op_function_generator.cc | 1 + .../test_generate_proposals_v2_op.py | 238 +++++++++++++ tools/static_mode_white_list.py | 1 + 11 files changed, 1271 insertions(+), 503 deletions(-) create mode 100644 paddle/fluid/operators/detection/bbox_util.cu.h create mode 100644 paddle/fluid/operators/detection/generate_proposals_v2_op.cc create mode 100644 paddle/fluid/operators/detection/generate_proposals_v2_op.cu create mode 100644 python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index c2b7c27ab4adb..1915323f3c324 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -46,10 +46,12 @@ if(WITH_GPU) set(TMPDEPS memory cub) endif() detection_library(generate_proposals_op SRCS generate_proposals_op.cc generate_proposals_op.cu DEPS ${TMPDEPS}) + detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc generate_proposals_v2_op.cu DEPS ${TMPDEPS}) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc distribute_fpn_proposals_op.cu DEPS ${TMPDEPS}) detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc collect_fpn_proposals_op.cu DEPS ${TMPDEPS}) else() detection_library(generate_proposals_op SRCS generate_proposals_op.cc) + detection_library(generate_proposals_v2_op SRCS generate_proposals_v2_op.cc) detection_library(distribute_fpn_proposals_op SRCS distribute_fpn_proposals_op.cc) detection_library(collect_fpn_proposals_op SRCS collect_fpn_proposals_op.cc) endif() diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h new file mode 100644 index 0000000000000..8840765841d2b --- /dev/null +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -0,0 +1,285 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include +#include +#include "cub/cub.cuh" +#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) + +int const kThreadsPerBlock = sizeof(uint64_t) * 8; + +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + +struct RangeInitFunctor { + int start_; + int delta_; + int *out_; + __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; } +}; + +template +static void SortDescending(const platform::CUDADeviceContext &ctx, + const Tensor &value, Tensor *value_out, + Tensor *index_out) { + int num = static_cast(value.numel()); + Tensor index_in_t; + int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); + platform::ForRange for_range(ctx, num); + for_range(RangeInitFunctor{0, 1, idx_in}); + + int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); + + const T *keys_in = value.data(); + T *keys_out = value_out->mutable_data({num}, ctx.GetPlace()); + + // Determine temporary device storage requirements + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairsDescending( + nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); + // Allocate temporary storage + auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); + + // Run sorting operation + cub::DeviceRadixSort::SortPairsDescending( + d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in, + idx_out, num); +} + +template +struct BoxDecodeAndClipFunctor { + const T *anchor; + const T *deltas; + const T *var; + const int *index; + const T *im_info; + + T *proposals; + + BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var, + const int *index, const T *im_info, T *proposals) + : anchor(anchor), + deltas(deltas), + var(var), + index(index), + im_info(im_info), + proposals(proposals) {} + + T bbox_clip_default{static_cast(kBBoxClipDefault)}; + + __device__ void operator()(size_t i) { + int k = index[i] * 4; + T axmin = anchor[k]; + T aymin = anchor[k + 1]; + T axmax = anchor[k + 2]; + T aymax = anchor[k + 3]; + + T w = axmax - axmin + 1.0; + T h = aymax - aymin + 1.0; + T cx = axmin + 0.5 * w; + T cy = aymin + 0.5 * h; + + T dxmin = deltas[k]; + T dymin = deltas[k + 1]; + T dxmax = deltas[k + 2]; + T dymax = deltas[k + 3]; + + T d_cx, d_cy, d_w, d_h; + if (var) { + d_cx = cx + dxmin * w * var[k]; + d_cy = cy + dymin * h * var[k + 1]; + d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w; + d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h; + } else { + d_cx = cx + dxmin * w; + d_cy = cy + dymin * h; + d_w = exp(Min(dxmax, bbox_clip_default)) * w; + d_h = exp(Min(dymax, bbox_clip_default)) * h; + } + + T oxmin = d_cx - d_w * 0.5; + T oymin = d_cy - d_h * 0.5; + T oxmax = d_cx + d_w * 0.5 - 1.; + T oymax = d_cy + d_h * 0.5 - 1.; + + proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.); + proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.); + proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.); + proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.); + } + + __device__ __forceinline__ T Min(T 
a, T b) const { return a > b ? b : a; } + + __device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; } +}; + +template +static __global__ void FilterBBoxes(const T *bboxes, const T *im_info, + const T min_size, const int num, + int *keep_num, int *keep, + bool is_scale = true) { + T im_h = im_info[0]; + T im_w = im_info[1]; + + int cnt = 0; + __shared__ int keep_index[BlockSize]; + + CUDA_KERNEL_LOOP(i, num) { + keep_index[threadIdx.x] = -1; + __syncthreads(); + + int k = i * 4; + T xmin = bboxes[k]; + T ymin = bboxes[k + 1]; + T xmax = bboxes[k + 2]; + T ymax = bboxes[k + 3]; + + T w = xmax - xmin + 1.0; + T h = ymax - ymin + 1.0; + T cx = xmin + w / 2.; + T cy = ymin + h / 2.; + + if (is_scale) { + w = (xmax - xmin) / im_info[2] + 1.; + h = (ymax - ymin) / im_info[2] + 1.; + } + + if (w >= min_size && h >= min_size && cx <= im_w && cy <= im_h) { + keep_index[threadIdx.x] = i; + } + __syncthreads(); + if (threadIdx.x == 0) { + int size = (num - i) < BlockSize ? num - i : BlockSize; + for (int j = 0; j < size; ++j) { + if (keep_index[j] > -1) { + keep[cnt++] = keep_index[j]; + } + } + } + __syncthreads(); + } + if (threadIdx.x == 0) { + keep_num[0] = cnt; + } +} + +static __device__ float IoU(const float *a, const float *b) { + float left = max(a[0], b[0]), right = min(a[2], b[2]); + float top = max(a[1], b[1]), bottom = min(a[3], b[3]); + float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); + float inter_s = width * height; + float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); + float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); + return inter_s / (s_a + s_b - inter_s); +} + +static __global__ void NMSKernel(const int n_boxes, + const float nms_overlap_thresh, + const float *dev_boxes, uint64_t *dev_mask) { + const int row_start = blockIdx.y; + const int col_start = blockIdx.x; + + const int row_size = + min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock); + const int col_size = + min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock); + + __shared__ float block_boxes[kThreadsPerBlock * 4]; + if (threadIdx.x < col_size) { + block_boxes[threadIdx.x * 4 + 0] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0]; + block_boxes[threadIdx.x * 4 + 1] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1]; + block_boxes[threadIdx.x * 4 + 2] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2]; + block_boxes[threadIdx.x * 4 + 3] = + dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3]; + } + __syncthreads(); + + if (threadIdx.x < row_size) { + const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x; + const float *cur_box = dev_boxes + cur_box_idx * 4; + int i = 0; + uint64_t t = 0; + int start = 0; + if (row_start == col_start) { + start = threadIdx.x + 1; + } + for (i = start; i < col_size; i++) { + if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) { + t |= 1ULL << i; + } + } + const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock); + dev_mask[cur_box_idx * col_blocks + col_start] = t; + } +} + +template +static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, + const Tensor &sorted_indices, const T nms_threshold, + Tensor *keep_out) { + int boxes_num = proposals.dims()[0]; + const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock); + dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock), + DIVUP(boxes_num, kThreadsPerBlock)); + dim3 threads(kThreadsPerBlock); + + const T *boxes = proposals.data(); + auto place = 
BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + framework::Vector mask(boxes_num * col_blocks); + NMSKernel<<>>(boxes_num, nms_threshold, boxes, + mask.CUDAMutableData(BOOST_GET_CONST( + platform::CUDAPlace, ctx.GetPlace()))); + + std::vector remv(col_blocks); + memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); + + std::vector keep_vec; + int num_to_keep = 0; + for (int i = 0; i < boxes_num; i++) { + int nblock = i / kThreadsPerBlock; + int inblock = i % kThreadsPerBlock; + + if (!(remv[nblock] & (1ULL << inblock))) { + ++num_to_keep; + keep_vec.push_back(i); + uint64_t *p = &mask[0] + i * col_blocks; + for (int j = nblock; j < col_blocks; j++) { + remv[j] |= p[j]; + } + } + } + int *keep = keep_out->mutable_data({num_to_keep}, ctx.GetPlace()); + memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(), + sizeof(int) * num_to_keep, ctx.stream()); + ctx.Wait(); +} + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/detection/bbox_util.h b/paddle/fluid/operators/detection/bbox_util.h index 6c9fea1fd4419..b7a23c48fb8c7 100644 --- a/paddle/fluid/operators/detection/bbox_util.h +++ b/paddle/fluid/operators/detection/bbox_util.h @@ -21,6 +21,8 @@ limitations under the License. */ namespace paddle { namespace operators { +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + struct RangeInitFunctor { int start; int delta; @@ -125,17 +127,45 @@ void BboxOverlaps(const framework::Tensor& r_boxes, } } +// Calculate max IoU between each box and ground-truth and +// each row represents one box +template +void MaxIoU(const framework::Tensor& iou, framework::Tensor* max_iou) { + const T* iou_data = iou.data(); + int row = iou.dims()[0]; + int col = iou.dims()[1]; + T* max_iou_data = max_iou->data(); + for (int i = 0; i < row; ++i) { + const T* v = iou_data + i * col; + T max_v = *std::max_element(v, v + col); + max_iou_data[i] = max_v; + } +} + +static void AppendProposals(framework::Tensor* dst, int64_t offset, + const framework::Tensor& src) { + auto* out_data = dst->data(); + auto* to_add_data = src.data(); + size_t size_of_t = framework::SizeOfType(src.type()); + offset *= size_of_t; + std::memcpy( + reinterpret_cast(reinterpret_cast(out_data) + offset), + to_add_data, src.numel() * size_of_t); +} + template void ClipTiledBoxes(const platform::DeviceContext& ctx, const framework::Tensor& im_info, const framework::Tensor& input_boxes, - framework::Tensor* out) { + framework::Tensor* out, bool is_scale = true) { T* out_data = out->mutable_data(ctx.GetPlace()); const T* im_info_data = im_info.data(); const T* input_boxes_data = input_boxes.data(); T zero(0); - T im_w = round(im_info_data[1] / im_info_data[2]); - T im_h = round(im_info_data[0] / im_info_data[2]); + T im_w = + is_scale ? round(im_info_data[1] / im_info_data[2]) : im_info_data[1]; + T im_h = + is_scale ? 
round(im_info_data[0] / im_info_data[2]) : im_info_data[0]; for (int64_t i = 0; i < input_boxes.numel(); ++i) { if (i % 4 == 0) { out_data[i] = std::max(std::min(input_boxes_data[i], im_w - 1), zero); @@ -149,19 +179,101 @@ void ClipTiledBoxes(const platform::DeviceContext& ctx, } } -// Calculate max IoU between each box and ground-truth and -// each row represents one box -template -void MaxIoU(const framework::Tensor& iou, framework::Tensor* max_iou) { - const T* iou_data = iou.data(); - int row = iou.dims()[0]; - int col = iou.dims()[1]; - T* max_iou_data = max_iou->data(); - for (int i = 0; i < row; ++i) { - const T* v = iou_data + i * col; - T max_v = *std::max_element(v, v + col); - max_iou_data[i] = max_v; +// Filter the box with small area +template +void FilterBoxes(const platform::DeviceContext& ctx, + const framework::Tensor* boxes, float min_size, + const framework::Tensor& im_info, bool is_scale, + framework::Tensor* keep) { + const T* im_info_data = im_info.data(); + const T* boxes_data = boxes->data(); + keep->Resize({boxes->dims()[0]}); + min_size = std::max(min_size, 1.0f); + int* keep_data = keep->mutable_data(ctx.GetPlace()); + + int keep_len = 0; + for (int i = 0; i < boxes->dims()[0]; ++i) { + T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; + T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T x_ctr = boxes_data[4 * i] + ws / 2; + T y_ctr = boxes_data[4 * i + 1] + hs / 2; + + if (is_scale) { + ws = (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_info_data[2] + 1; + hs = + (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_info_data[2] + 1; + } + + if (ws >= min_size && hs >= min_size && x_ctr <= im_info_data[1] && + y_ctr <= im_info_data[0]) { + keep_data[keep_len++] = i; + } + } + keep->Resize({keep_len}); +} + +template +static void BoxCoder(const platform::DeviceContext& ctx, + framework::Tensor* all_anchors, + framework::Tensor* bbox_deltas, + framework::Tensor* variances, + framework::Tensor* proposals) { + T* proposals_data = proposals->mutable_data(ctx.GetPlace()); + + int64_t row = all_anchors->dims()[0]; + int64_t len = all_anchors->dims()[1]; + + auto* bbox_deltas_data = bbox_deltas->data(); + auto* anchor_data = all_anchors->data(); + const T* variances_data = nullptr; + if (variances) { + variances_data = variances->data(); + } + + for (int64_t i = 0; i < row; ++i) { + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; + T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; + + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; + + T bbox_center_x = 0, bbox_center_y = 0; + T bbox_width = 0, bbox_height = 0; + + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } else { + bbox_center_x = + bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = 
std::exp(std::min(bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } + + proposals_data[i * len] = bbox_center_x - bbox_width / 2; + proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; } + // return proposals; } } // namespace operators diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index 06e560f86d4e0..2bf5e6c5e04da 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -18,6 +18,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/detection/nms_util.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/math_function.h" @@ -27,18 +29,6 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -static const double kBBoxClipDefault = std::log(1000.0 / 16.0); - -static void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { - auto *out_data = dst->data(); - auto *to_add_data = src.data(); - size_t size_of_t = framework::SizeOfType(src.type()); - offset *= size_of_t; - std::memcpy( - reinterpret_cast(reinterpret_cast(out_data) + offset), - to_add_data, src.numel() * size_of_t); -} - class GenerateProposalsOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; @@ -77,225 +67,6 @@ class GenerateProposalsOp : public framework::OperatorWithKernel { } }; -template -static inline void BoxCoder(const platform::DeviceContext &ctx, - Tensor *all_anchors, Tensor *bbox_deltas, - Tensor *variances, Tensor *proposals) { - T *proposals_data = proposals->mutable_data(ctx.GetPlace()); - - int64_t row = all_anchors->dims()[0]; - int64_t len = all_anchors->dims()[1]; - - auto *bbox_deltas_data = bbox_deltas->data(); - auto *anchor_data = all_anchors->data(); - const T *variances_data = nullptr; - if (variances) { - variances_data = variances->data(); - } - - for (int64_t i = 0; i < row; ++i) { - T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; - T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; - - T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; - T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; - - T bbox_center_x = 0, bbox_center_y = 0; - T bbox_width = 0, bbox_height = 0; - - if (variances) { - bbox_center_x = - variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + - anchor_center_x; - bbox_center_y = variances_data[i * len + 1] * - bbox_deltas_data[i * len + 1] * anchor_height + - anchor_center_y; - bbox_width = std::exp(std::min(variances_data[i * len + 2] * - bbox_deltas_data[i * len + 2], - kBBoxClipDefault)) * - anchor_width; - bbox_height = std::exp(std::min(variances_data[i * len + 3] * - bbox_deltas_data[i * len + 3], - kBBoxClipDefault)) * - anchor_height; - } else { - bbox_center_x = - bbox_deltas_data[i * len] * anchor_width + anchor_center_x; - bbox_center_y = - bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], - kBBoxClipDefault)) * - anchor_width; - bbox_height = 
std::exp(std::min(bbox_deltas_data[i * len + 3], - kBBoxClipDefault)) * - anchor_height; - } - - proposals_data[i * len] = bbox_center_x - bbox_width / 2; - proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; - } - // return proposals; -} - -template -static inline void ClipTiledBoxes(const platform::DeviceContext &ctx, - const Tensor &im_info, Tensor *boxes) { - T *boxes_data = boxes->mutable_data(ctx.GetPlace()); - const T *im_info_data = im_info.data(); - T zero(0); - for (int64_t i = 0; i < boxes->numel(); ++i) { - if (i % 4 == 0) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); - } else if (i % 4 == 1) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); - } else if (i % 4 == 2) { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); - } else { - boxes_data[i] = - std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); - } - } -} - -template -static inline void FilterBoxes(const platform::DeviceContext &ctx, - Tensor *boxes, float min_size, - const Tensor &im_info, Tensor *keep) { - const T *im_info_data = im_info.data(); - T *boxes_data = boxes->mutable_data(ctx.GetPlace()); - T im_scale = im_info_data[2]; - keep->Resize({boxes->dims()[0]}); - min_size = std::max(min_size, 1.0f); - int *keep_data = keep->mutable_data(ctx.GetPlace()); - - int keep_len = 0; - for (int i = 0; i < boxes->dims()[0]; ++i) { - T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; - T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; - T ws_origin_scale = - (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; - T hs_origin_scale = - (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; - T x_ctr = boxes_data[4 * i] + ws / 2; - T y_ctr = boxes_data[4 * i + 1] + hs / 2; - if (ws_origin_scale >= min_size && hs_origin_scale >= min_size && - x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { - keep_data[keep_len++] = i; - } - } - keep->Resize({keep_len}); -} - -template -static inline std::vector> GetSortedScoreIndex( - const std::vector &scores) { - std::vector> sorted_indices; - sorted_indices.reserve(scores.size()); - for (size_t i = 0; i < scores.size(); ++i) { - sorted_indices.emplace_back(scores[i], i); - } - // Sort the score pair according to the scores in descending order - std::stable_sort(sorted_indices.begin(), sorted_indices.end(), - [](const std::pair &a, const std::pair &b) { - return a.first < b.first; - }); - return sorted_indices; -} - -template -static inline T BBoxArea(const T *box, bool normalized) { - if (box[2] < box[0] || box[3] < box[1]) { - // If coordinate values are is invalid - // (e.g. xmax < xmin or ymax < ymin), return 0. - return static_cast(0.); - } else { - const T w = box[2] - box[0]; - const T h = box[3] - box[1]; - if (normalized) { - return w * h; - } else { - // If coordinate values are not within range [0, 1]. 
- return (w + 1) * (h + 1); - } - } -} - -template -static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) { - if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || - box2[3] < box1[1]) { - return static_cast(0.); - } else { - const T inter_xmin = std::max(box1[0], box2[0]); - const T inter_ymin = std::max(box1[1], box2[1]); - const T inter_xmax = std::min(box1[2], box2[2]); - const T inter_ymax = std::min(box1[3], box2[3]); - const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1); - const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1); - const T inter_area = inter_w * inter_h; - const T bbox1_area = BBoxArea(box1, normalized); - const T bbox2_area = BBoxArea(box2, normalized); - return inter_area / (bbox1_area + bbox2_area - inter_area); - } -} - -template -static inline Tensor VectorToTensor(const std::vector &selected_indices, - int selected_num) { - Tensor keep_nms; - keep_nms.Resize({selected_num}); - auto *keep_data = keep_nms.mutable_data(platform::CPUPlace()); - for (int i = 0; i < selected_num; ++i) { - keep_data[i] = selected_indices[i]; - } - return keep_nms; -} - -template -static inline Tensor NMS(const platform::DeviceContext &ctx, Tensor *bbox, - Tensor *scores, T nms_threshold, float eta) { - int64_t num_boxes = bbox->dims()[0]; - // 4: [xmin ymin xmax ymax] - int64_t box_size = bbox->dims()[1]; - - std::vector scores_data(num_boxes); - std::copy_n(scores->data(), num_boxes, scores_data.begin()); - std::vector> sorted_indices = - GetSortedScoreIndex(scores_data); - - std::vector selected_indices; - int selected_num = 0; - T adaptive_threshold = nms_threshold; - const T *bbox_data = bbox->data(); - while (sorted_indices.size() != 0) { - int idx = sorted_indices.back().second; - bool flag = true; - for (int kept_idx : selected_indices) { - if (flag) { - T overlap = JaccardOverlap(bbox_data + idx * box_size, - bbox_data + kept_idx * box_size, false); - flag = (overlap <= adaptive_threshold); - } else { - break; - } - } - if (flag) { - selected_indices.push_back(idx); - ++selected_num; - } - sorted_indices.erase(sorted_indices.end() - 1); - if (flag && eta < 1 && adaptive_threshold > 0.5) { - adaptive_threshold *= eta; - } - } - return VectorToTensor(selected_indices, selected_num); -} - template class GenerateProposalsKernel : public framework::OpKernel { public: @@ -434,10 +205,10 @@ class GenerateProposalsKernel : public framework::OpKernel { proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); - ClipTiledBoxes(ctx, im_info_slice, &proposals); + ClipTiledBoxes(ctx, im_info_slice, proposals, &proposals, false); Tensor keep; - FilterBoxes(ctx, &proposals, min_size, im_info_slice, &keep); + FilterBoxes(ctx, &proposals, min_size, im_info_slice, true, &keep); // Handle the case when there is no keep index left if (keep.numel() == 0) { math::SetConstant set_zero; diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 71323ea966a6c..8359fbab519b3 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -16,13 +16,11 @@ limitations under the License. 
*/ #include #include #include -#include "cub/cub.cuh" #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" -#include "paddle/fluid/operators/gather.cu.h" +#include "paddle/fluid/operators/detection/bbox_util.cu.h" #include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/platform/for_range.h" namespace paddle { namespace operators { @@ -31,258 +29,6 @@ using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; namespace { - -#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0)) - -int const kThreadsPerBlock = sizeof(uint64_t) * 8; - -static const double kBBoxClipDefault = std::log(1000.0 / 16.0); - -struct RangeInitFunctor { - int start_; - int delta_; - int *out_; - __device__ void operator()(size_t i) { out_[i] = start_ + i * delta_; } -}; - -template -static void SortDescending(const platform::CUDADeviceContext &ctx, - const Tensor &value, Tensor *value_out, - Tensor *index_out) { - int num = static_cast(value.numel()); - Tensor index_in_t; - int *idx_in = index_in_t.mutable_data({num}, ctx.GetPlace()); - platform::ForRange for_range(ctx, num); - for_range(RangeInitFunctor{0, 1, idx_in}); - - int *idx_out = index_out->mutable_data({num}, ctx.GetPlace()); - - const T *keys_in = value.data(); - T *keys_out = value_out->mutable_data({num}, ctx.GetPlace()); - - // Determine temporary device storage requirements - size_t temp_storage_bytes = 0; - cub::DeviceRadixSort::SortPairsDescending( - nullptr, temp_storage_bytes, keys_in, keys_out, idx_in, idx_out, num); - // Allocate temporary storage - auto place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - auto d_temp_storage = memory::Alloc(place, temp_storage_bytes); - - // Run sorting operation - cub::DeviceRadixSort::SortPairsDescending( - d_temp_storage->ptr(), temp_storage_bytes, keys_in, keys_out, idx_in, - idx_out, num); -} - -template -struct BoxDecodeAndClipFunctor { - const T *anchor; - const T *deltas; - const T *var; - const int *index; - const T *im_info; - - T *proposals; - - BoxDecodeAndClipFunctor(const T *anchor, const T *deltas, const T *var, - const int *index, const T *im_info, T *proposals) - : anchor(anchor), - deltas(deltas), - var(var), - index(index), - im_info(im_info), - proposals(proposals) {} - - T bbox_clip_default{static_cast(kBBoxClipDefault)}; - - __device__ void operator()(size_t i) { - int k = index[i] * 4; - T axmin = anchor[k]; - T aymin = anchor[k + 1]; - T axmax = anchor[k + 2]; - T aymax = anchor[k + 3]; - - T w = axmax - axmin + 1.0; - T h = aymax - aymin + 1.0; - T cx = axmin + 0.5 * w; - T cy = aymin + 0.5 * h; - - T dxmin = deltas[k]; - T dymin = deltas[k + 1]; - T dxmax = deltas[k + 2]; - T dymax = deltas[k + 3]; - - T d_cx, d_cy, d_w, d_h; - if (var) { - d_cx = cx + dxmin * w * var[k]; - d_cy = cy + dymin * h * var[k + 1]; - d_w = exp(Min(dxmax * var[k + 2], bbox_clip_default)) * w; - d_h = exp(Min(dymax * var[k + 3], bbox_clip_default)) * h; - } else { - d_cx = cx + dxmin * w; - d_cy = cy + dymin * h; - d_w = exp(Min(dxmax, bbox_clip_default)) * w; - d_h = exp(Min(dymax, bbox_clip_default)) * h; - } - - T oxmin = d_cx - d_w * 0.5; - T oymin = d_cy - d_h * 0.5; - T oxmax = d_cx + d_w * 0.5 - 1.; - T oymax = d_cy + d_h * 0.5 - 1.; - - proposals[i * 4] = Max(Min(oxmin, im_info[1] - 1.), 0.); - proposals[i * 4 + 1] = Max(Min(oymin, im_info[0] - 1.), 0.); - proposals[i * 4 + 2] = Max(Min(oxmax, im_info[1] - 1.), 0.); - proposals[i * 4 + 3] = Max(Min(oymax, im_info[0] - 1.), 0.); - 
} - - __device__ __forceinline__ T Min(T a, T b) const { return a > b ? b : a; } - - __device__ __forceinline__ T Max(T a, T b) const { return a > b ? a : b; } -}; - -template -static __global__ void FilterBBoxes(const T *bboxes, const T *im_info, - const T min_size, const int num, - int *keep_num, int *keep) { - T im_h = im_info[0]; - T im_w = im_info[1]; - T im_scale = im_info[2]; - - int cnt = 0; - __shared__ int keep_index[BlockSize]; - - CUDA_KERNEL_LOOP(i, num) { - keep_index[threadIdx.x] = -1; - __syncthreads(); - - int k = i * 4; - T xmin = bboxes[k]; - T ymin = bboxes[k + 1]; - T xmax = bboxes[k + 2]; - T ymax = bboxes[k + 3]; - - T w = xmax - xmin + 1.0; - T h = ymax - ymin + 1.0; - T cx = xmin + w / 2.; - T cy = ymin + h / 2.; - - T w_s = (xmax - xmin) / im_scale + 1.; - T h_s = (ymax - ymin) / im_scale + 1.; - - if (w_s >= min_size && h_s >= min_size && cx <= im_w && cy <= im_h) { - keep_index[threadIdx.x] = i; - } - __syncthreads(); - if (threadIdx.x == 0) { - int size = (num - i) < BlockSize ? num - i : BlockSize; - for (int j = 0; j < size; ++j) { - if (keep_index[j] > -1) { - keep[cnt++] = keep_index[j]; - } - } - } - __syncthreads(); - } - if (threadIdx.x == 0) { - keep_num[0] = cnt; - } -} - -static __device__ inline float IoU(const float *a, const float *b) { - float left = max(a[0], b[0]), right = min(a[2], b[2]); - float top = max(a[1], b[1]), bottom = min(a[3], b[3]); - float width = max(right - left + 1, 0.f), height = max(bottom - top + 1, 0.f); - float inter_s = width * height; - float s_a = (a[2] - a[0] + 1) * (a[3] - a[1] + 1); - float s_b = (b[2] - b[0] + 1) * (b[3] - b[1] + 1); - return inter_s / (s_a + s_b - inter_s); -} - -static __global__ void NMSKernel(const int n_boxes, - const float nms_overlap_thresh, - const float *dev_boxes, uint64_t *dev_mask) { - const int row_start = blockIdx.y; - const int col_start = blockIdx.x; - - const int row_size = - min(n_boxes - row_start * kThreadsPerBlock, kThreadsPerBlock); - const int col_size = - min(n_boxes - col_start * kThreadsPerBlock, kThreadsPerBlock); - - __shared__ float block_boxes[kThreadsPerBlock * 4]; - if (threadIdx.x < col_size) { - block_boxes[threadIdx.x * 4 + 0] = - dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 0]; - block_boxes[threadIdx.x * 4 + 1] = - dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 1]; - block_boxes[threadIdx.x * 4 + 2] = - dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 2]; - block_boxes[threadIdx.x * 4 + 3] = - dev_boxes[(kThreadsPerBlock * col_start + threadIdx.x) * 4 + 3]; - } - __syncthreads(); - - if (threadIdx.x < row_size) { - const int cur_box_idx = kThreadsPerBlock * row_start + threadIdx.x; - const float *cur_box = dev_boxes + cur_box_idx * 4; - int i = 0; - uint64_t t = 0; - int start = 0; - if (row_start == col_start) { - start = threadIdx.x + 1; - } - for (i = start; i < col_size; i++) { - if (IoU(cur_box, block_boxes + i * 4) > nms_overlap_thresh) { - t |= 1ULL << i; - } - } - const int col_blocks = DIVUP(n_boxes, kThreadsPerBlock); - dev_mask[cur_box_idx * col_blocks + col_start] = t; - } -} - -template -static void NMS(const platform::CUDADeviceContext &ctx, const Tensor &proposals, - const Tensor &sorted_indices, const T nms_threshold, - Tensor *keep_out) { - int boxes_num = proposals.dims()[0]; - const int col_blocks = DIVUP(boxes_num, kThreadsPerBlock); - dim3 blocks(DIVUP(boxes_num, kThreadsPerBlock), - DIVUP(boxes_num, kThreadsPerBlock)); - dim3 threads(kThreadsPerBlock); - - const T *boxes = proposals.data(); - auto 
place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); - framework::Vector mask(boxes_num * col_blocks); - NMSKernel<<>>(boxes_num, nms_threshold, boxes, - mask.CUDAMutableData(BOOST_GET_CONST( - platform::CUDAPlace, ctx.GetPlace()))); - - std::vector remv(col_blocks); - memset(&remv[0], 0, sizeof(uint64_t) * col_blocks); - - std::vector keep_vec; - int num_to_keep = 0; - for (int i = 0; i < boxes_num; i++) { - int nblock = i / kThreadsPerBlock; - int inblock = i % kThreadsPerBlock; - - if (!(remv[nblock] & (1ULL << inblock))) { - ++num_to_keep; - keep_vec.push_back(i); - uint64_t *p = &mask[0] + i * col_blocks; - for (int j = nblock; j < col_blocks; j++) { - remv[j] |= p[j]; - } - } - } - int *keep = keep_out->mutable_data({num_to_keep}, ctx.GetPlace()); - memory::Copy(place, keep, platform::CPUPlace(), keep_vec.data(), - sizeof(int) * num_to_keep, ctx.stream()); - ctx.Wait(); -} - template static std::pair ProposalForOneImage( const platform::CUDADeviceContext &ctx, const Tensor &im_info, diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc new file mode 100644 index 0000000000000..7c2fd599fa6a2 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -0,0 +1,314 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/operators/detection/bbox_util.h" +#include "paddle/fluid/operators/detection/nms_util.h" +#include "paddle/fluid/operators/gather.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +class GenerateProposalsV2Op : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE_EQ( + ctx->HasInput("Scores"), true, + platform::errors::NotFound("Input(Scores) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("BboxDeltas"), true, + platform::errors::NotFound("Input(BboxDeltas) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("ImShape"), true, + platform::errors::NotFound("Input(ImShape) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Anchors"), true, + platform::errors::NotFound("Input(Anchors) shouldn't be null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Variances"), true, + platform::errors::NotFound("Input(Variances) shouldn't be null.")); + + ctx->SetOutputDim("RpnRois", {-1, 4}); + ctx->SetOutputDim("RpnRoiProbs", {-1, 1}); + if (!ctx->IsRuntime()) { + ctx->SetLoDLevel("RpnRois", std::max(ctx->GetLoDLevel("Scores"), 1)); + ctx->SetLoDLevel("RpnRoiProbs", std::max(ctx->GetLoDLevel("Scores"), 1)); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Anchors"), + ctx.device_context()); + } +}; + +template +class GenerateProposalsV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_shape = context.Input("ImShape"); + auto anchors = GET_DATA_SAFELY(context.Input("Anchors"), "Input", + "Anchors", "GenerateProposals"); + auto variances = GET_DATA_SAFELY(context.Input("Variances"), + "Input", "Variances", "GenerateProposals"); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + + auto &dev_ctx = + context.template device_context(); + + auto &scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto &bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + 
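+    // Both bbox_deltas and scores go from NCHW to NHWC here, so the A
+    // per-anchor values at each spatial location become contiguous. This is
+    // what lets the per-image slices below be reshaped to (H * W * A, 4) and
+    // (H * W * A, 1) in anchor-major order.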
trans(dev_ctx, *scores, &scores_swap, axis); + + framework::LoD lod; + lod.resize(1); + auto &lod0 = lod[0]; + lod0.push_back(0); + anchors.Resize({anchors.numel() / 4, 4}); + variances.Resize({variances.numel() / 4, 4}); + std::vector tmp_num; + + int64_t num_proposals = 0; + for (int64_t i = 0; i < num; ++i) { + Tensor im_shape_slice = im_shape->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair tensor_pair = + ProposalForOneImage(dev_ctx, im_shape_slice, anchors, variances, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + Tensor &proposals = tensor_pair.first; + Tensor &scores = tensor_pair.second; + + AppendProposals(rpn_rois, 4 * num_proposals, proposals); + AppendProposals(rpn_roi_probs, num_proposals, scores); + num_proposals += proposals.dims()[0]; + lod0.push_back(num_proposals); + tmp_num.push_back(proposals.dims()[0]); + } + if (context.HasOutput("RpnRoisNum")) { + auto *rpn_rois_num = context.Output("RpnRoisNum"); + rpn_rois_num->mutable_data({num}, context.GetPlace()); + int *num_data = rpn_rois_num->data(); + for (int i = 0; i < num; i++) { + num_data[i] = tmp_num[i]; + } + rpn_rois_num->Resize({num}); + } + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } + + std::pair ProposalForOneImage( + const platform::CPUDeviceContext &ctx, const Tensor &im_shape_slice, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas_slice, // [M, 4] + const Tensor &scores_slice, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) const { + auto *scores_data = scores_slice.data(); + + // Sort index + Tensor index_t; + index_t.Resize({scores_slice.numel()}); + int *index = index_t.mutable_data(ctx.GetPlace()); + for (int i = 0; i < scores_slice.numel(); ++i) { + index[i] = i; + } + auto compare = [scores_data](const int64_t &i, const int64_t &j) { + return scores_data[i] > scores_data[j]; + }; + + if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { + std::sort(index, index + scores_slice.numel(), compare); + } else { + std::nth_element(index, index + pre_nms_top_n, + index + scores_slice.numel(), compare); + index_t.Resize({pre_nms_top_n}); + } + + Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + scores_sel.mutable_data({index_t.numel(), 1}, ctx.GetPlace()); + bbox_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + anchor_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + var_sel.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + + CPUGather(ctx, scores_slice, index_t, &scores_sel); + CPUGather(ctx, bbox_deltas_slice, index_t, &bbox_sel); + CPUGather(ctx, anchors, index_t, &anchor_sel); + CPUGather(ctx, variances, index_t, &var_sel); + + Tensor proposals; + proposals.mutable_data({index_t.numel(), 4}, ctx.GetPlace()); + BoxCoder(ctx, &anchor_sel, &bbox_sel, &var_sel, &proposals); + + ClipTiledBoxes(ctx, im_shape_slice, proposals, &proposals, false); + + Tensor keep; + FilterBoxes(ctx, &proposals, min_size, im_shape_slice, false, &keep); + // Handle the case when there is no keep index left + if (keep.numel() == 0) { + math::SetConstant set_zero; + bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); + set_zero(ctx, &bbox_sel, static_cast(0)); + 
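+      // No box survived the min-size filter: fall back to a single all-zero
+      // proposal/score pair so downstream shapes remain valid.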
+      Tensor scores_filter;
+      scores_filter.mutable_data<T>({1, 1}, ctx.GetPlace());
+      set_zero(ctx, &scores_filter, static_cast<T>(0));
+      return std::make_pair(bbox_sel, scores_filter);
+    }
+
+    Tensor scores_filter;
+    bbox_sel.mutable_data<T>({keep.numel(), 4}, ctx.GetPlace());
+    scores_filter.mutable_data<T>({keep.numel(), 1}, ctx.GetPlace());
+    CPUGather<T>(ctx, proposals, keep, &bbox_sel);
+    CPUGather<T>(ctx, scores_sel, keep, &scores_filter);
+    if (nms_thresh <= 0) {
+      return std::make_pair(bbox_sel, scores_filter);
+    }
+
+    Tensor keep_nms = NMS<T>(ctx, &bbox_sel, &scores_filter, nms_thresh, eta);
+
+    if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) {
+      keep_nms.Resize({post_nms_top_n});
+    }
+
+    proposals.mutable_data<T>({keep_nms.numel(), 4}, ctx.GetPlace());
+    scores_sel.mutable_data<T>({keep_nms.numel(), 1}, ctx.GetPlace());
+    CPUGather<T>(ctx, bbox_sel, keep_nms, &proposals);
+    CPUGather<T>(ctx, scores_filter, keep_nms, &scores_sel);
+
+    return std::make_pair(proposals, scores_sel);
+  }
+};
+
+class GenerateProposalsV2OpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Scores",
+             "(Tensor) The scores from conv are in shape (N, A, H, W), "
+             "where N is the batch size, A is the number of anchors, and "
+             "H and W are the height and width of the feature map.");
+    AddInput("BboxDeltas",
+             "(Tensor) The bounding box deltas from conv are in "
+             "shape (N, 4*A, H, W).");
+    AddInput("ImShape",
+             "(Tensor) Image shape in shape (N, 2), "
+             "in format (height, width).");
+    AddInput("Anchors",
+             "(Tensor) Bounding box anchors from anchor_generator_op "
+             "are in shape (A, H, W, 4).");
+    AddInput("Variances",
+             "(Tensor) Bounding box variances with the same shape as "
+             "`Anchors`.");
+
+    AddOutput("RpnRois",
+              "(LoDTensor) Output proposals with shape (rois_num, 4).");
+    AddOutput("RpnRoiProbs",
+              "(LoDTensor) Scores of proposals with shape (rois_num, 1).");
+    AddOutput("RpnRoisNum", "(Tensor) The number of RPN RoIs in each image.")
+        .AsDispensable();
+    AddAttr<int>("pre_nms_topN",
+                 "Number of top scoring RPN proposals to keep before "
+                 "applying NMS.");
+    AddAttr<int>("post_nms_topN",
+                 "Number of top scoring RPN proposals to keep after "
+                 "applying NMS.");
+    AddAttr<float>("nms_thresh", "NMS threshold used on RPN proposals.");
+    AddAttr<float>("min_size",
+                   "Proposal height and width both need to be greater "
+                   "than this min_size.");
+    AddAttr<float>("eta", "The parameter for adaptive NMS.");
+    AddComment(R"DOC(
+This operator is the second version of the generate_proposals op, used to
+generate bounding box proposals for Faster R-CNN. The proposals are generated
+for a list of images based on the image scores 'Scores', the bounding box
+regression results 'BboxDeltas', and the predefined bounding box shapes
+'Anchors'. Greedy non-maximum suppression is applied to produce the final
+bounding boxes.
+
+The difference from the first version is that the image scale is no longer
+needed, so the input requires im_shape instead of im_info. This change aims
+to unify the inputs of all kinds of object detection models, such as YOLO-v3
+and Faster R-CNN. As a result, min_size now refers to a size on the input
+image rather than on the original image, which is slightly different from
+before but does not affect the result.
+ +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + generate_proposals_v2, ops::GenerateProposalsV2Op, + ops::GenerateProposalsV2OpMaker, + paddle::framework::EmptyGradOpMaker, + paddle::framework::EmptyGradOpMaker); +REGISTER_OP_CPU_KERNEL(generate_proposals_v2, + ops::GenerateProposalsV2Kernel, + ops::GenerateProposalsV2Kernel); diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu new file mode 100644 index 0000000000000..70020cdc64ef5 --- /dev/null +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -0,0 +1,229 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include +#include +#include "paddle/fluid/framework/mixed_vector.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/memory.h" +#include "paddle/fluid/operators/detection/bbox_util.cu.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; + +namespace { +template +static std::pair ProposalForOneImage( + const platform::CUDADeviceContext &ctx, const Tensor &im_shape, + const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas, // [M, 4] + const Tensor &scores, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) { + // 1. pre nms + Tensor scores_sort, index_sort; + SortDescending(ctx, scores, &scores_sort, &index_sort); + int num = scores.numel(); + int pre_nms_num = (pre_nms_top_n <= 0 || pre_nms_top_n > num) ? scores.numel() + : pre_nms_top_n; + scores_sort.Resize({pre_nms_num, 1}); + index_sort.Resize({pre_nms_num, 1}); + + // 2. box decode and clipping + Tensor proposals; + proposals.mutable_data({pre_nms_num, 4}, ctx.GetPlace()); + + { + platform::ForRange for_range(ctx, pre_nms_num); + for_range(BoxDecodeAndClipFunctor{ + anchors.data(), bbox_deltas.data(), variances.data(), + index_sort.data(), im_shape.data(), proposals.data()}); + } + + // 3. 
filter + Tensor keep_index, keep_num_t; + keep_index.mutable_data({pre_nms_num}, ctx.GetPlace()); + keep_num_t.mutable_data({1}, ctx.GetPlace()); + min_size = std::max(min_size, 1.0f); + auto stream = ctx.stream(); + FilterBBoxes<<<1, 512, 0, stream>>>( + proposals.data(), im_shape.data(), min_size, pre_nms_num, + keep_num_t.data(), keep_index.data(), false); + int keep_num; + const auto gpu_place = BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()); + memory::Copy(platform::CPUPlace(), &keep_num, gpu_place, + keep_num_t.data(), sizeof(int), ctx.stream()); + ctx.Wait(); + keep_index.Resize({keep_num}); + + Tensor scores_filter, proposals_filter; + // Handle the case when there is no keep index left + if (keep_num == 0) { + math::SetConstant set_zero; + proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); + scores_filter.mutable_data({1, 1}, ctx.GetPlace()); + set_zero(ctx, &proposals_filter, static_cast(0)); + set_zero(ctx, &scores_filter, static_cast(0)); + return std::make_pair(proposals_filter, scores_filter); + } + proposals_filter.mutable_data({keep_num, 4}, ctx.GetPlace()); + scores_filter.mutable_data({keep_num, 1}, ctx.GetPlace()); + GPUGather(ctx, proposals, keep_index, &proposals_filter); + GPUGather(ctx, scores_sort, keep_index, &scores_filter); + + if (nms_thresh <= 0) { + return std::make_pair(proposals_filter, scores_filter); + } + + // 4. nms + Tensor keep_nms; + NMS(ctx, proposals_filter, keep_index, nms_thresh, &keep_nms); + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + Tensor scores_nms, proposals_nms; + proposals_nms.mutable_data({keep_nms.numel(), 4}, ctx.GetPlace()); + scores_nms.mutable_data({keep_nms.numel(), 1}, ctx.GetPlace()); + GPUGather(ctx, proposals_filter, keep_nms, &proposals_nms); + GPUGather(ctx, scores_filter, keep_nms, &scores_nms); + + return std::make_pair(proposals_nms, scores_nms); +} +} // namespace + +template +class CUDAGenerateProposalsV2Kernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + auto *scores = context.Input("Scores"); + auto *bbox_deltas = context.Input("BboxDeltas"); + auto *im_shape = context.Input("ImShape"); + auto anchors = GET_DATA_SAFELY(context.Input("Anchors"), "Input", + "Anchors", "GenerateProposals"); + auto variances = GET_DATA_SAFELY(context.Input("Variances"), + "Input", "Variances", "GenerateProposals"); + + auto *rpn_rois = context.Output("RpnRois"); + auto *rpn_roi_probs = context.Output("RpnRoiProbs"); + + int pre_nms_top_n = context.Attr("pre_nms_topN"); + int post_nms_top_n = context.Attr("post_nms_topN"); + float nms_thresh = context.Attr("nms_thresh"); + float min_size = context.Attr("min_size"); + float eta = context.Attr("eta"); + PADDLE_ENFORCE_GE(eta, 1., + platform::errors::InvalidArgument( + "Not support adaptive NMS. The attribute 'eta' " + "should not less than 1. 
But received eta=[%d]", + eta)); + + auto &dev_ctx = context.template device_context(); + + auto scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}, + dev_ctx.GetPlace()); + scores_swap.mutable_data({num, h_score, w_score, c_score}, + dev_ctx.GetPlace()); + + math::Transpose trans; + std::vector axis = {0, 2, 3, 1}; + trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); + trans(dev_ctx, *scores, &scores_swap, axis); + + anchors.Resize({anchors.numel() / 4, 4}); + variances.Resize({variances.numel() / 4, 4}); + + rpn_rois->mutable_data({bbox_deltas->numel() / 4, 4}, + context.GetPlace()); + rpn_roi_probs->mutable_data({scores->numel(), 1}, context.GetPlace()); + + T *rpn_rois_data = rpn_rois->data(); + T *rpn_roi_probs_data = rpn_roi_probs->data(); + + auto place = BOOST_GET_CONST(platform::CUDAPlace, dev_ctx.GetPlace()); + auto cpu_place = platform::CPUPlace(); + + int64_t num_proposals = 0; + std::vector offset(1, 0); + std::vector tmp_num; + + for (int64_t i = 0; i < num; ++i) { + Tensor im_shape_slice = im_shape->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair box_score_pair = + ProposalForOneImage(dev_ctx, im_shape_slice, anchors, variances, + bbox_deltas_slice, scores_slice, pre_nms_top_n, + post_nms_top_n, nms_thresh, min_size, eta); + + Tensor &proposals = box_score_pair.first; + Tensor &scores = box_score_pair.second; + + memory::Copy(place, rpn_rois_data + num_proposals * 4, place, + proposals.data(), sizeof(T) * proposals.numel(), + dev_ctx.stream()); + memory::Copy(place, rpn_roi_probs_data + num_proposals, place, + scores.data(), sizeof(T) * scores.numel(), + dev_ctx.stream()); + dev_ctx.Wait(); + num_proposals += proposals.dims()[0]; + offset.emplace_back(num_proposals); + tmp_num.push_back(proposals.dims()[0]); + } + if (context.HasOutput("RpnRoisNum")) { + auto *rpn_rois_num = context.Output("RpnRoisNum"); + rpn_rois_num->mutable_data({num}, context.GetPlace()); + int *num_data = rpn_rois_num->data(); + memory::Copy(place, num_data, cpu_place, &tmp_num[0], sizeof(int) * num, + dev_ctx.stream()); + rpn_rois_num->Resize({num}); + } + framework::LoD lod; + lod.emplace_back(offset); + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(generate_proposals_v2, + ops::CUDAGenerateProposalsV2Kernel< + paddle::platform::CUDADeviceContext, float>); diff --git a/paddle/fluid/operators/detection/nms_util.h b/paddle/fluid/operators/detection/nms_util.h index 067bfce51949c..febdee8263553 100644 --- a/paddle/fluid/operators/detection/nms_util.h +++ b/paddle/fluid/operators/detection/nms_util.h @@ -99,5 +99,74 @@ T PolyIoU(const T* box1, const T* box2, const size_t box_size, } } +template +static inline std::vector> GetSortedScoreIndex( + const std::vector& scores) { + std::vector> sorted_indices; + 
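+  // Each element pairs a score with its original index. Note that the
+  // comparator used below sorts in ascending order; NMS pops candidates from
+  // the back of this vector, so the highest-scoring box is still handled
+  // first.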
sorted_indices.reserve(scores.size()); + for (size_t i = 0; i < scores.size(); ++i) { + sorted_indices.emplace_back(scores[i], i); + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices.begin(), sorted_indices.end(), + [](const std::pair& a, const std::pair& b) { + return a.first < b.first; + }); + return sorted_indices; +} + +template +static inline framework::Tensor VectorToTensor( + const std::vector& selected_indices, int selected_num) { + framework::Tensor keep_nms; + keep_nms.Resize({selected_num}); + auto* keep_data = keep_nms.mutable_data(platform::CPUPlace()); + for (int i = 0; i < selected_num; ++i) { + keep_data[i] = selected_indices[i]; + } + return keep_nms; +} + +template +framework::Tensor NMS(const platform::DeviceContext& ctx, + framework::Tensor* bbox, framework::Tensor* scores, + T nms_threshold, float eta) { + int64_t num_boxes = bbox->dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox->dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores->data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices = + GetSortedScoreIndex(scores_data); + + std::vector selected_indices; + int selected_num = 0; + T adaptive_threshold = nms_threshold; + const T* bbox_data = bbox->data(); + while (sorted_indices.size() != 0) { + int idx = sorted_indices.back().second; + bool flag = true; + for (int kept_idx : selected_indices) { + if (flag) { + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, false); + flag = (overlap <= adaptive_threshold); + } else { + break; + } + } + if (flag) { + selected_indices.push_back(idx); + ++selected_num; + } + sorted_indices.erase(sorted_indices.end() - 1); + if (flag && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + return VectorToTensor(selected_indices, selected_num); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/pybind/op_function_generator.cc b/paddle/fluid/pybind/op_function_generator.cc index 92006bff2cc16..10914cf0ab7ba 100644 --- a/paddle/fluid/pybind/op_function_generator.cc +++ b/paddle/fluid/pybind/op_function_generator.cc @@ -81,6 +81,7 @@ std::map> op_outs_map = { {"MultiFpnRois", "RestoreIndex", "MultiLevelRoIsNum"}}, {"moving_average_abs_max_scale", {"OutScale", "OutAccum", "OutState"}}, {"multiclass_nms3", {"Out", "NmsRoisNum"}}, + {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, }; // NOTE(zhiqiu): Commonly, the outputs in auto-generated OP function are diff --git a/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py new file mode 100644 index 0000000000000..26c443008db50 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_generate_proposals_v2_op.py @@ -0,0 +1,238 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
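+#
+# A rough sketch of the per-image flow that the reference implementation
+# below follows, assuming `box_coder`, `clip_tiled_boxes`, `filter_boxes`
+# and `nms` behave as defined in this file and in test_generate_proposals_op:
+#
+#   proposals = box_coder(all_anchors, bbox_deltas, variances)
+#   proposals = clip_tiled_boxes(proposals, im_shape)
+#   keep = filter_boxes(proposals, min_size, im_shape)
+#   keep = nms(boxes=proposals[keep, :], scores=scores[keep, :],
+#              nms_threshold=nms_thresh, eta=eta)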
+ +from __future__ import print_function + +import unittest +import numpy as np +import sys +import math +import paddle +import paddle.fluid as fluid +from op_test import OpTest +from test_multiclass_nms_op import nms +from test_anchor_generator_op import anchor_generator_in_python +import copy +from test_generate_proposals_op import clip_tiled_boxes, box_coder, nms + + +def generate_proposals_v2_in_python(scores, bbox_deltas, im_shape, anchors, + variances, pre_nms_topN, post_nms_topN, + nms_thresh, min_size, eta): + all_anchors = anchors.reshape(-1, 4) + rois = np.empty((0, 5), dtype=np.float32) + roi_probs = np.empty((0, 1), dtype=np.float32) + + rpn_rois = [] + rpn_roi_probs = [] + rois_num = [] + num_images = scores.shape[0] + for img_idx in range(num_images): + img_i_boxes, img_i_probs = proposal_for_one_image( + im_shape[img_idx, :], all_anchors, variances, + bbox_deltas[img_idx, :, :, :], scores[img_idx, :, :, :], + pre_nms_topN, post_nms_topN, nms_thresh, min_size, eta) + rois_num.append(img_i_probs.shape[0]) + rpn_rois.append(img_i_boxes) + rpn_roi_probs.append(img_i_probs) + + return rpn_rois, rpn_roi_probs, rois_num + + +def proposal_for_one_image(im_shape, all_anchors, variances, bbox_deltas, + scores, pre_nms_topN, post_nms_topN, nms_thresh, + min_size, eta): + # Transpose and reshape predicted bbox transformations to get them + # into the same order as the anchors: + # - bbox deltas will be (4 * A, H, W) format from conv output + # - transpose to (H, W, 4 * A) + # - reshape to (H * W * A, 4) where rows are ordered by (H, W, A) + # in slowest to fastest order to match the enumerated anchors + bbox_deltas = bbox_deltas.transpose((1, 2, 0)).reshape(-1, 4) + all_anchors = all_anchors.reshape(-1, 4) + variances = variances.reshape(-1, 4) + # Same story for the scores: + # - scores are (A, H, W) format from conv output + # - transpose to (H, W, A) + # - reshape to (H * W * A, 1) where rows are ordered by (H, W, A) + # to match the order of anchors and bbox_deltas + scores = scores.transpose((1, 2, 0)).reshape(-1, 1) + + # sort all (proposal, score) pairs by score from highest to lowest + # take top pre_nms_topN (e.g. 6000) + if pre_nms_topN <= 0 or pre_nms_topN >= len(scores): + order = np.argsort(-scores.squeeze()) + else: + # Avoid sorting possibly large arrays; + # First partition to get top K unsorted + # and then sort just those + inds = np.argpartition(-scores.squeeze(), pre_nms_topN)[:pre_nms_topN] + order = np.argsort(-scores[inds].squeeze()) + order = inds[order] + scores = scores[order, :] + bbox_deltas = bbox_deltas[order, :] + all_anchors = all_anchors[order, :] + proposals = box_coder(all_anchors, bbox_deltas, variances) + # clip proposals to image (may result in proposals with zero area + # that will be removed in the next step) + proposals = clip_tiled_boxes(proposals, im_shape) + # remove predicted boxes with height or width < min_size + keep = filter_boxes(proposals, min_size, im_shape) + if len(keep) == 0: + proposals = np.zeros((1, 4)).astype('float32') + scores = np.zeros((1, 1)).astype('float32') + return proposals, scores + proposals = proposals[keep, :] + scores = scores[keep, :] + + # apply loose nms (e.g. threshold = 0.7) + # take post_nms_topN (e.g. 
1000) + # return the top proposals + if nms_thresh > 0: + keep = nms(boxes=proposals, + scores=scores, + nms_threshold=nms_thresh, + eta=eta) + if post_nms_topN > 0 and post_nms_topN < len(keep): + keep = keep[:post_nms_topN] + proposals = proposals[keep, :] + scores = scores[keep, :] + + return proposals, scores + + +def filter_boxes(boxes, min_size, im_shape): + """Only keep boxes with both sides >= min_size and center within the image. + """ + # Scale min_size to match image scale + min_size = max(min_size, 1.0) + ws = boxes[:, 2] - boxes[:, 0] + 1 + hs = boxes[:, 3] - boxes[:, 1] + 1 + x_ctr = boxes[:, 0] + ws / 2. + y_ctr = boxes[:, 1] + hs / 2. + keep = np.where((ws >= min_size) & (hs >= min_size) & (x_ctr < im_shape[1]) + & (y_ctr < im_shape[0]))[0] + return keep + + +class TestGenerateProposalsV2Op(OpTest): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = { + 'Scores': self.scores, + 'BboxDeltas': self.bbox_deltas, + 'ImShape': self.im_shape.astype(np.float32), + 'Anchors': self.anchors, + 'Variances': self.variances + } + + self.attrs = { + 'pre_nms_topN': self.pre_nms_topN, + 'post_nms_topN': self.post_nms_topN, + 'nms_thresh': self.nms_thresh, + 'min_size': self.min_size, + 'eta': self.eta + } + + self.outputs = { + 'RpnRois': (self.rpn_rois[0], [self.rois_num]), + 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]), + } + + def test_check_output(self): + self.check_output() + + def setUp(self): + self.op_type = "generate_proposals_v2" + self.set_data() + + def init_test_params(self): + self.pre_nms_topN = 12000 # train 12000, test 2000 + self.post_nms_topN = 5000 # train 6000, test 1000 + self.nms_thresh = 0.7 + self.min_size = 3.0 + self.eta = 1. + + def init_test_input(self): + batch_size = 1 + input_channels = 20 + layer_h = 16 + layer_w = 16 + input_feat = np.random.random( + (batch_size, input_channels, layer_h, layer_w)).astype('float32') + self.anchors, self.variances = anchor_generator_in_python( + input_feat=input_feat, + anchor_sizes=[16., 32.], + aspect_ratios=[0.5, 1.0], + variances=[1.0, 1.0, 1.0, 1.0], + stride=[16.0, 16.0], + offset=0.5) + self.im_shape = np.array([[64, 64]]).astype('float32') + num_anchors = self.anchors.shape[2] + self.scores = np.random.random( + (batch_size, num_anchors, layer_h, layer_w)).astype('float32') + self.bbox_deltas = np.random.random( + (batch_size, num_anchors * 4, layer_h, layer_w)).astype('float32') + + def init_test_output(self): + self.rpn_rois, self.rpn_roi_probs, self.rois_num = generate_proposals_v2_in_python( + self.scores, self.bbox_deltas, self.im_shape, self.anchors, + self.variances, self.pre_nms_topN, self.post_nms_topN, + self.nms_thresh, self.min_size, self.eta) + + +class TestGenerateProposalsV2OutLodOp(TestGenerateProposalsV2Op): + def set_data(self): + self.init_test_params() + self.init_test_input() + self.init_test_output() + self.inputs = { + 'Scores': self.scores, + 'BboxDeltas': self.bbox_deltas, + 'ImShape': self.im_shape.astype(np.float32), + 'Anchors': self.anchors, + 'Variances': self.variances + } + + self.attrs = { + 'pre_nms_topN': self.pre_nms_topN, + 'post_nms_topN': self.post_nms_topN, + 'nms_thresh': self.nms_thresh, + 'min_size': self.min_size, + 'eta': self.eta, + 'return_rois_num': True + } + + self.outputs = { + 'RpnRois': (self.rpn_rois[0], [self.rois_num]), + 'RpnRoiProbs': (self.rpn_roi_probs[0], [self.rois_num]), + 'RpnRoisNum': (np.asarray( + self.rois_num, dtype=np.int32)) + } + + +class 
TestGenerateProposalsV2OpNoBoxLeft(TestGenerateProposalsV2Op): + def init_test_params(self): + self.pre_nms_topN = 12000 # train 12000, test 2000 + self.post_nms_topN = 5000 # train 6000, test 1000 + self.nms_thresh = 0.7 + self.min_size = 1000.0 + self.eta = 1. + + +if __name__ == '__main__': + paddle.enable_static() + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index defa4f13495d2..be11663719441 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -673,4 +673,5 @@ 'test_sgd_op_xpu', 'test_shape_op_xpu', 'test_slice_op_xpu', + 'test_generate_proposals_v2_op', ] From acc11c2a62919048b54846de640c35374f9c2234 Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Mon, 2 Nov 2020 16:43:54 +0800 Subject: [PATCH 094/185] Retry CUDA Initialization to Fix Random Failure, test=develop (#28323) This PR is follow up of #28213. On that PR we tried to decrease GPU usage, however the CI still randomly failed. So I added retry logic for the initialization of nccl and cusolver. If the initialization failed, we can retry to avoid the random failure. --- paddle/fluid/platform/device_context.h | 9 ++++----- paddle/fluid/platform/enforce.h | 19 +++++++++++++++++++ paddle/fluid/platform/nccl_helper.h | 6 +++--- ...test_parallel_executor_test_while_train.py | 2 +- 4 files changed, 27 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index e1438a1eefa62..e8b1d587121dc 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -214,8 +214,8 @@ class CUDAContext { << "Please recompile or reinstall Paddle with compatible CUDNN " "version."; } - PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_RETRY_CUDA_SUCCESS(dynload::cudnnCreate(&cudnn_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( dynload::cudnnSetStream(cudnn_handle_, RawStream())); } else { cudnn_handle_ = nullptr; @@ -223,9 +223,8 @@ class CUDAContext { } void InitCuSolverContext() { - PADDLE_ENFORCE_CUDA_SUCCESS( - dynload::cusolverDnCreate(&cusolver_dn_handle_)); - PADDLE_ENFORCE_CUDA_SUCCESS( + PADDLE_RETRY_CUDA_SUCCESS(dynload::cusolverDnCreate(&cusolver_dn_handle_)); + PADDLE_RETRY_CUDA_SUCCESS( dynload::cusolverDnSetStream(cusolver_dn_handle_, RawStream())); } diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 6a27249817027..fc57d3a4d08ac 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -904,6 +904,25 @@ DEFINE_CUDA_STATUS_TYPE(ncclResult_t, ncclSuccess); } \ } while (0) +#define PADDLE_RETRY_CUDA_SUCCESS(COND) \ + do { \ + auto __cond__ = (COND); \ + int retry_count = 1; \ + using __CUDA_STATUS_TYPE__ = decltype(__cond__); \ + constexpr auto __success_type__ = \ + ::paddle::platform::details::CudaStatusType< \ + __CUDA_STATUS_TYPE__>::kSuccess; \ + while (UNLIKELY(__cond__ != __success_type__) && retry_count < 5) { \ + __cond__ = (COND); \ + ++retry_count; \ + } \ + if (UNLIKELY(__cond__ != __success_type__)) { \ + auto __summary__ = ::paddle::platform::errors::External( \ + ::paddle::platform::build_nvidia_error_msg(__cond__)); \ + __THROW_ERROR_INTERNAL__(__summary__); \ + } \ + } while (0) + #undef DEFINE_CUDA_STATUS_TYPE #endif // PADDLE_WITH_CUDA diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index 22550de5b3fad..c2f4d6ff2fffb 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ 
b/paddle/fluid/platform/nccl_helper.h @@ -114,7 +114,7 @@ struct NCCLContextMap { // if num_trainers == 1, should create a new nccl id for local comms. if (num_trainers == 1 && nccl_id == nullptr) { std::lock_guard guard(NCCLGroupGuard::NCCLMutex()); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( + PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitAll( comms.get(), static_cast(order_.size()), order_.data())); } else { PADDLE_ENFORCE_NOT_NULL(nccl_id, platform::errors::InvalidArgument( @@ -132,8 +132,8 @@ struct NCCLContextMap { } VLOG(1) << "init nccl rank:" << rank << ", nranks:" << nranks << ", gpu_id:" << gpu_id << ", dev_id:" << order_[i]; - PADDLE_ENFORCE_CUDA_SUCCESS(cudaSetDevice(gpu_id)); - PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( + PADDLE_RETRY_CUDA_SUCCESS(cudaSetDevice(gpu_id)); + PADDLE_RETRY_CUDA_SUCCESS(platform::dynload::ncclCommInitRank( comms.get() + i, nranks, *nccl_id, rank)); } } diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py index 76d93259a647e..fd47dc37e7694 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py @@ -36,7 +36,7 @@ def check_network_convergence(self, use_cuda, build_strategy=None): opt = fluid.optimizer.SGD(learning_rate=0.001) opt.minimize(loss) - batch_size = 16 + batch_size = 32 image = np.random.normal(size=(batch_size, 784)).astype('float32') label = np.random.randint(0, 10, (batch_size, 1), dtype="int64") From 0f4b6247c889bc38d4ea1302f831ae7f1dc8b980 Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Mon, 2 Nov 2020 16:58:03 +0800 Subject: [PATCH 095/185] refine the gpu config for performance optimization (#28291) --- paddle/fluid/platform/gpu_launch_config.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/platform/gpu_launch_config.h b/paddle/fluid/platform/gpu_launch_config.h index d2d57995b728e..3953abe142d20 100755 --- a/paddle/fluid/platform/gpu_launch_config.h +++ b/paddle/fluid/platform/gpu_launch_config.h @@ -53,10 +53,8 @@ inline GpuLaunchConfig GetGpuLaunchConfig1D( // Need get from device const int thread_per_block = std::min(1024, context.GetMaxThreadsPerBlock()); - // Suppose block count small than factor * sm, factor is a experiments value. - int factor = 4; const int block_count = - std::min(DivUp(physical_thread_count, thread_per_block), factor * sm); + std::min(DivUp(physical_thread_count, thread_per_block), sm); GpuLaunchConfig config; config.theory_thread_count.x = theory_thread_count; From 9a600df373e48928c0e812c8b0f285562c3585f7 Mon Sep 17 00:00:00 2001 From: Guo Sheng Date: Tue, 3 Nov 2020 09:31:02 +0800 Subject: [PATCH 096/185] Add rnn_op (#28197) * Add rnn_op. test=develop * Fix rnn_op grad maker's drop_empty_grad. 
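An aside on the bounded-retry pattern that PATCH 094 introduces just above with `PADDLE_RETRY_CUDA_SUCCESS`, before the RNN diffs that follow: the macro re-issues a failing CUDA/NCCL call up to five total attempts and only raises if the last attempt still fails. The Python below mirrors that control flow only; the class and function names are illustrative, not Paddle APIs.

```python
import random

class CudaError(RuntimeError):
    """Stand-in for a CUDA status that is still failing after all retries."""

def retry_cuda_success(call, max_attempts=5):
    # Mirror of PADDLE_RETRY_CUDA_SUCCESS: evaluate once, then keep
    # re-evaluating while the status is unsuccessful and attempts remain.
    status = call()
    attempts = 1
    while status is not None and attempts < max_attempts:
        status = call()
        attempts += 1
    if status is not None:
        raise CudaError("call failed after %d attempts: %s" % (attempts, status))

def flaky_init():
    # Succeeds (None) most of the time; occasionally reports an error status,
    # like the random nccl/cusolver init failures this commit describes.
    return None if random.random() < 0.8 else "CUDA_ERROR_NOT_INITIALIZED"

try:
    retry_cuda_success(flaky_init)
    print("initialized")
except CudaError as e:
    print(e)
```

Note the design choice visible in the diff: only *initialization* calls (nccl comm and cusolver/cudnn handle creation), where failures tend to be transient resource pressure, get the retry wrapper; ordinary compute launches keep the fail-fast `PADDLE_ENFORCE_CUDA_SUCCESS`.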
test=develop --- paddle/fluid/operators/rnn_op.cc | 255 +++++++ paddle/fluid/operators/rnn_op.cu.cc | 630 ++++++++++++++++++ paddle/fluid/platform/cudnn_helper.h | 6 + .../tests/unittests/rnn/test_rnn_nets.py | 124 ++-- python/paddle/nn/layer/rnn.py | 26 +- 5 files changed, 970 insertions(+), 71 deletions(-) create mode 100644 paddle/fluid/operators/rnn_op.cc create mode 100644 paddle/fluid/operators/rnn_op.cu.cc diff --git a/paddle/fluid/operators/rnn_op.cc b/paddle/fluid/operators/rnn_op.cc new file mode 100644 index 0000000000000..dfdd32e10b9a9 --- /dev/null +++ b/paddle/fluid/operators/rnn_op.cc @@ -0,0 +1,255 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace operators { + +class RNNOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "RNN"); + OP_INOUT_CHECK(ctx->HasInputs("PreState"), "Input", "PreState", "RNN"); + + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "RNN"); + OP_INOUT_CHECK(ctx->HasOutputs("State"), "Output", "State", "RNN"); + + auto in_dims = ctx->GetInputDim("Input"); + auto pre_state_dims = ctx->GetInputsDim("PreState"); + + PADDLE_ENFORCE_EQ(in_dims.size(), 3, + platform::errors::InvalidArgument( + "The rank of Input in RNN must be 3. But " + "received Input's rank is %d.", + in_dims.size())); + + if (ctx->HasInput("SequenceLength")) { + auto seq_dims = ctx->GetInputDim("SequenceLength"); + PADDLE_ENFORCE_EQ( + in_dims[1], seq_dims[0], + platform::errors::InvalidArgument( + "The size of SequenceLength has to equal the batch_size. But " + "received batch_size is %d and the size of SequenceLength is %d.", + in_dims[1], seq_dims[0])); + } + + PADDLE_ENFORCE_EQ(pre_state_dims[0].size(), 3, + platform::errors::InvalidArgument( + "The rank of PreState in RNN must be 3. But " + "the received rank is %d.", + pre_state_dims[0].size())); + size_t i = 0; + for (; i < pre_state_dims.size(); ++i) { + PADDLE_ENFORCE_EQ( + in_dims[1], pre_state_dims[i][1], + platform::errors::InvalidArgument( + "The second dimension size (representing for batch size) of " + "Input and PreState should be equal. But received %d and %d.", + in_dims[1], pre_state_dims[i][1])); + PADDLE_ENFORCE_EQ( + pre_state_dims[0], pre_state_dims[i], + platform::errors::InvalidArgument( + "The dims of all tensors in PreState should be same. But " + "received PreState[0] is %s and PreState[%d] is %s.", + pre_state_dims[0], i, pre_state_dims[i])); + } + auto mode = ctx->Attrs().Get("mode"); + size_t num_state = mode == "LSTM" ? 
2 : 1; + PADDLE_ENFORCE_EQ( + i, num_state, + platform::errors::InvalidArgument( + "The number of tensors in PreState of %s should be %d, " + "but received %d.", + mode, 2, i)); + + auto out_dims = in_dims; + auto hidden_size = ctx->Attrs().Get("hidden_size"); + bool is_bidirec = ctx->Attrs().Get("is_bidirec"); + out_dims[2] = is_bidirec ? hidden_size * 2 : hidden_size; + ctx->SetOutputDim("Out", out_dims); + ctx->SetOutputsDim("State", pre_state_dims); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Input"), + ctx.device_context()); + } +}; + +class RNNOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput( + "Input", + "(Tensor) RNN input tensor, which support variable-time length input " + "sequence." + "The shape of the Tensor MUST be ( seq_len * batch_size * input_size)" + "seq_len is the total time step in this mini-batch (CAN be change in " + "different batch)" + "batch_size is the instance number of this batch" + "input_size is the hidden size of the input." + "input_size and the hidden_size in the next may not be same"); + AddInput("PreState", + "(Tensor) the initial hidden state of the LSTM" + "input. This is a tensor with shape (num_layers x batch_size x " + "hidden_size)" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)") + .AsDuplicable(); + AddInput("WeightList", + "(vector), stores weight and bias data when the weight " + "use the list format. ") + .AsDuplicable(); + AddInput("SequenceLength", + "(Tensor) When the input data is padding, " + "set this parameter. This parameter represents " + "the variable sequence lengths in a batch. " + "The size of the vector has to equal the batch_size.") + .AsDispensable(); + AddOutput("DropoutState", + "Store the global drop state when training, needed by cudnn rnn.") + .AsDispensable(); + // maybe need add intermediate outputs for cpu kernel + AddOutput("Reserve", + "(Tensor, a temporary output Tensor to store the reserve_data " + "of cudnn kernel.") + .AsIntermediate(); + AddOutput("Out", + "(Tensor) the hidden state of LSTM operator. " + "The shape is ( seq_len x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be ( seq_len x " + "batch_size x hidden_size * 2) "); + AddOutput("State", + "(Tensor) the hidden state of the last step. 
" + "The shape is ( num_layers x batch_size x hidden_size) if " + "is_bidirec is False" + "and When is_bidirec is True, the shape will be (num_layers*2 x " + "batch_size x hidden_size)") + .AsDuplicable(); + AddAttr( + "dropout_prob", + "dropout prob of the dropout op" + "the dropout ONLY work between rnn layers, not between time steps" + "There is no dropout work on the Out tensor") + .SetDefault(0.0); + AddAttr("is_bidirec", "whether it is bidirectional rnn") + .SetDefault(false); + AddAttr("input_size", "input size ot the Input Tensor").SetDefault(10); + AddAttr("hidden_size", "hidden size of rnn").SetDefault(100); + AddAttr("num_layers", "the total layer number").SetDefault(1); + AddAttr( + "mode", + "(string) rnn types, including: LSTM, GRU, RNN_RELU, RNN_TANH."); + AddAttr("is_test", "True if in test phase.").SetDefault(false); + AddAttr("seed", "seed to used if fix_seed is True").SetDefault(0); + AddComment(R"DOC( +)DOC"); + } +}; + +class RNNGradOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Input"), "Input", "Input", "RNN"); + OP_INOUT_CHECK(ctx->HasInputs("PreState"), "Input", "PreState", "RNN"); + OP_INOUT_CHECK(ctx->HasInput("Out"), "Input", "Out", "RNN"); + // OP_INOUT_CHECK(ctx->HasInputs("State"), "Input", "State", "RNN"); + + auto SetOutGradDim = [&ctx](const std::string& name) { + auto g_name = framework::GradVarName(name); + if (ctx->HasOutput(g_name)) { + ctx->SetOutputDim(g_name, ctx->GetInputDim(name)); + } + }; + + SetOutGradDim("Input"); + if (ctx->HasOutputs(framework::GradVarName("WeightList"))) { + ctx->SetOutputsDim(framework::GradVarName("WeightList"), + ctx->GetInputsDim("WeightList")); + } + if (ctx->HasOutputs(framework::GradVarName("PreState"))) { + ctx->SetOutputsDim(framework::GradVarName("PreState"), + ctx->GetInputsDim("PreState")); + } + } + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.device_context()); + } +}; + +template +class RNNGradOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("rnn_grad"); + op->SetInput("Input", this->Input("Input")); + op->SetInput("PreState", this->Input("PreState")); + op->SetInput("WeightList", this->Input("WeightList")); + if (this->HasInput("SequenceLength")) { + op->SetInput("SequenceLength", this->Input("SequenceLength")); + } + op->SetInput("DropoutState", this->Output("DropoutState")); + op->SetInput("Reserve", this->Output("Reserve")); + op->SetInput("Out", this->Output("Out")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput(framework::GradVarName("State"), this->OutputGrad("State")); + + op->SetOutput(framework::GradVarName("WeightList"), + this->InputGrad("WeightList", false)); + + op->SetOutput(framework::GradVarName("Input"), this->InputGrad("Input")); + op->SetOutput(framework::GradVarName("PreState"), + this->InputGrad("PreState", false)); + op->SetAttrMap(this->Attrs()); + } +}; + +template +class NotImpleKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unimplemented( + "CPU is not 
support for this kernel now. Will be add in the future")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(rnn, ops::RNNOp, ops::RNNOpMaker, + ops::RNNGradOpMaker, + ops::RNNGradOpMaker); +REGISTER_OPERATOR(rnn_grad, ops::RNNGradOp); + +REGISTER_OP_CPU_KERNEL(rnn, ops::NotImpleKernel); +REGISTER_OP_CPU_KERNEL(rnn_grad, ops::NotImpleKernel); diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc new file mode 100644 index 0000000000000..568db79722324 --- /dev/null +++ b/paddle/fluid/operators/rnn_op.cu.cc @@ -0,0 +1,630 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/generator.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/utils.h" +#include "paddle/fluid/platform/cudnn_helper.h" +#include "paddle/fluid/platform/dynload/cudnn.h" + +namespace paddle { +namespace platform { +class CUDADeviceContext; +struct CUDAPlace; +} // namespace platform +} // namespace paddle + +namespace paddle { +namespace operators { + +using LoDTensor = framework::LoDTensor; +using Tensor = framework::Tensor; + +class RNNDescriptors { + public: + RNNDescriptors(int seq_length, int batch_size, int input_size, + int hidden_size, int num_layers, float dropout_prob, int seed, + int weight_numel, cudnnRNNMode_t mode, bool is_bidirec, + bool is_test) + : seq_length_(seq_length), + batch_size_(batch_size), + input_size_(input_size), + hidden_size_(hidden_size), + num_layers_(num_layers), + dropout_prob_(dropout_prob), + seed_(seed), + weight_numel_(weight_numel), + mode_(mode), + is_bidirec_(is_bidirec), + is_test_(is_test) {} + + template + void Create(const cudnnHandle_t &handle, const platform::Place &place, + const std::vector &sequence_length, size_t *workspace_size, + size_t *reserve_size, framework::Tensor *dropout_state) { + int numDirections = is_bidirec_ ? 
2 : 1; + cudnnDataType_t cudnn_type = platform::CudnnDataType::type; + + // ------------------- cudnn x, y descriptors --------------------- + std::vector dims_x = {batch_size_, input_size_, 1}; + std::vector strides_x = {input_size_, 1, 1}; + std::vector dims_y = {batch_size_, hidden_size_ * numDirections, 1}; + std::vector strides_y = {hidden_size_ * numDirections, 1, 1}; + for (int i = 0; i < seq_length_; ++i) { + x_descs_.emplace_back(x_desc_.descriptor(dims_x, strides_x)); + y_descs_.emplace_back(y_desc_.descriptor(dims_y, strides_y)); + } + +#if CUDNN_VERSION >= 7201 + if (!sequence_length.empty()) { + x_seq_desc_.descriptor(seq_length_, batch_size_, input_size_, true, + sequence_length); + y_seq_desc_.descriptor(seq_length_, batch_size_, + hidden_size_ * numDirections, true, + sequence_length); + } +#endif + + // ------------------- cudnn hx, hy, cx, cy descriptors---------- + std::vector dims_hx = {num_layers_ * numDirections, batch_size_, + hidden_size_}; + std::vector strides_hx = {hidden_size_ * batch_size_, hidden_size_, 1}; + init_h_desc_.descriptor(dims_hx, strides_hx); + init_c_desc_.descriptor(dims_hx, strides_hx); + last_h_desc_.descriptor(dims_hx, strides_hx); + last_c_desc_.descriptor(dims_hx, strides_hx); + + // ------------------- cudnn dropout descriptors --------------------- + size_t state_size; + if (!is_test_ && !dropout_state->IsInitialized()) { + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnDropoutGetStatesSize(handle, &state_size)); + dropout_state->mutable_data({static_cast(state_size)}, + place); + } + dropout_desc_.descriptor(handle, place, dropout_state->IsInitialized(), + dropout_prob_, is_test_ ? nullptr : dropout_state, + seed_, state_size); + +// ------------------- cudnn rnn descriptors --------------------- +#if CUDNN_VERSION >= 6000 + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor_v6( + handle, rnn_desc_.desc(), hidden_size_, num_layers_, + dropout_desc_.desc(), CUDNN_LINEAR_INPUT, + is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, mode_, + CUDNN_RNN_ALGO_STANDARD, cudnn_type)); +#else + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_.desc(), hidden_size_, num_layers_, dropout_desc_.desc(), + CUDNN_LINEAR_INPUT, + is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, mode_, + cudnn_type)); +#endif + +#if CUDNN_VERSION >= 7201 + if (!sequence_length.empty()) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnSetRNNPaddingMode( + rnn_desc_.desc(), CUDNN_RNN_PADDED_IO_ENABLED)); + } +#endif + + // ------------------- cudnn weights_size --------------------- + size_t weights_size_; + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNParamsSize( + handle, rnn_desc_.desc(), x_descs_[0], &weights_size_, cudnn_type)); + PADDLE_ENFORCE_EQ( + weights_size_, sizeof(T) * weight_numel_, + platform::errors::InvalidArgument( + "The cudnn rnn and setting weight size should be same.")); + // ------------------- cudnn weight descriptors --------------------- + platform::DataLayout layout = platform::DataLayout::kNCHW; + int dim_tmp = weights_size_ / sizeof(T); + std::vector dim_w = {dim_tmp, 1, 1}; + weight_desc_.descriptor(layout, dim_w); + // ------------------- cudnn workspace, reserve size --------------------- + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnGetRNNWorkspaceSize( + handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), + workspace_size)); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnGetRNNTrainingReserveSize( + handle, rnn_desc_.desc(), seq_length_, x_descs_.data(), + reserve_size)); + } + cudnnTensorDescriptor_t *x_descs() { return x_descs_.data(); } + cudnnTensorDescriptor_t *y_descs() { return y_descs_.data(); } +#if CUDNN_VERSION >= 7201 + cudnnRNNDataDescriptor_t x_seq_desc() { return x_seq_desc_.desc(); } + cudnnRNNDataDescriptor_t y_seq_desc() { return y_seq_desc_.desc(); } +#endif + cudnnTensorDescriptor_t init_h_desc() { return init_h_desc_.desc(); } + cudnnTensorDescriptor_t init_c_desc() { return init_c_desc_.desc(); } + cudnnTensorDescriptor_t last_h_desc() { return last_h_desc_.desc(); } + cudnnTensorDescriptor_t last_c_desc() { return last_c_desc_.desc(); } + cudnnRNNDescriptor_t rnn_desc() { return rnn_desc_.desc(); } + cudnnDropoutDescriptor_t dropout_desc() { return dropout_desc_.desc(); } + cudnnFilterDescriptor_t weight_desc() { return weight_desc_.desc(); } + + private: + int seq_length_; + int batch_size_; + int input_size_; + int hidden_size_; + int num_layers_; + float dropout_prob_; + int seed_; + int weight_numel_; + cudnnRNNMode_t mode_; + bool is_bidirec_; + bool is_test_; + std::vector x_descs_; + std::vector y_descs_; + + platform::ScopedTensorDescriptor x_desc_; + platform::ScopedTensorDescriptor y_desc_; +#if CUDNN_VERSION >= 7201 + platform::ScopedRNNTensorDescriptor x_seq_desc_; + platform::ScopedRNNTensorDescriptor y_seq_desc_; +#endif + platform::ScopedTensorDescriptor init_h_desc_; + platform::ScopedTensorDescriptor init_c_desc_; + platform::ScopedTensorDescriptor last_h_desc_; + platform::ScopedTensorDescriptor last_c_desc_; + platform::ScopedDropoutDescriptor dropout_desc_; + platform::ScopedFilterDescriptor weight_desc_; + platform::ScopedRNNDescriptor rnn_desc_; +}; + +template +bool is_continuous(const Type &weight_list) { + bool continuous = true; + for (size_t i = 0; i < weight_list.size() - 1; ++i) { + auto *in_data = weight_list[i]->template data(); + auto *in_after_data = weight_list[i + 1]->template data(); + auto in_size = weight_list[i]->numel(); + bool temp = in_data + in_size == in_after_data; + continuous = continuous && temp; + } + return continuous; +} + +template +void weight_to_tensor(const platform::Place &place, cudaStream_t stream, + const std::vector &weight_list, + Tensor *weight) { + auto weight_data = 
weight->data(); + int weight_offset = 0; + for (size_t i = 0; i < weight_list.size(); ++i) { + const T *in_data = weight_list[i]->data(); + auto in_size = weight_list[i]->numel(); + + memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, weight->place()), + weight_data + weight_offset, + BOOST_GET_CONST(platform::CUDAPlace, weight_list[i]->place()), + in_data, in_size * sizeof(T), stream); + weight_offset += in_size; + } +} + +template +void weight_to_tensor_list(const platform::Place &place, cudaStream_t stream, + std::vector *weight_grad, + const std::vector &weight_input, + const Tensor *weight) { + int weight_offset = 0; + auto *weight_data = weight->data(); + for (size_t i = 0; i < weight_input.size(); ++i) { + auto in_size = weight_input[i]->numel(); + T *weight_grad_data = (*weight_grad)[i]->mutable_data(place); + const T *src = weight_data + weight_offset; + + memory::Copy( + BOOST_GET_CONST(platform::CUDAPlace, (*weight_grad)[i]->place()), + weight_grad_data, BOOST_GET_CONST(platform::CUDAPlace, weight->place()), + src, in_size * sizeof(T), stream); + weight_offset += in_size; + } +} + +template +class RNNCudnnKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + const Tensor *x = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + + Tensor *out = ctx.Output("Out"); + auto state = ctx.MultiOutput("State"); + Tensor *reserve = ctx.Output("Reserve"); + Tensor *state_out = ctx.Output("DropoutState"); + + float dropout_prob = ctx.Attr("dropout_prob"); + bool is_bidirec = ctx.Attr("is_bidirec"); + int hidden_size = ctx.Attr("hidden_size"); + int num_layers = ctx.Attr("num_layers"); + auto mode = ctx.Attr("mode"); + cudnnRNNMode_t rnn_mode = CUDNN_LSTM; + if (mode == "LSTM") + rnn_mode = CUDNN_LSTM; + else if (mode == "GRU") + rnn_mode = CUDNN_GRU; + else if (mode == "RNN_RELU") + rnn_mode = CUDNN_RNN_RELU; + else if (mode == "RNN_TANH") + rnn_mode = CUDNN_RNN_TANH; + else + PADDLE_THROW(platform::errors::InvalidArgument( + "rnn_mode should be LSTM, GRU, RNN_RELU or RNN_TANH, but received: " + "%s.", + mode)); + + bool is_test = ctx.Attr("is_test"); + int seed = ctx.Attr("seed"); + if (!is_test) { + int device_id = + BOOST_GET_CONST(platform::CUDAPlace, ctx.GetPlace()).GetDeviceId(); + auto gen_cuda = framework::GetDefaultCUDAGenerator(device_id); + if (gen_cuda->GetIsInitPy() && seed == 0) { + // If perform `manual_seed` in python and inner seed is not specified + // (equals 0), use global generator generated seed. 
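Stepping back from the seed handling for a moment: the `is_continuous` helper defined above decides whether the per-layer weight tensors already occupy one flat allocation, since cudnn wants a single packed weight buffer; only when they don't does the kernel fall back to the copy in `weight_to_tensor`. A numpy analogy of the same pointer-adjacency test (illustrative only, not Paddle API):

```python
import numpy as np

def is_contiguous_run(arrays):
    # Mirrors is_continuous in rnn_op.cu.cc: the tensors are "flat" when each
    # one ends exactly where the next one begins in memory.
    for a, b in zip(arrays, arrays[1:]):
        if a.ctypes.data + a.nbytes != b.ctypes.data:
            return False
    return True

flat = np.zeros(12, dtype=np.float32)
views = [flat[0:4], flat[4:8], flat[8:12]]        # slices of one buffer
separate = [np.zeros(4, dtype=np.float32) for _ in range(3)]

print(is_contiguous_run(views))      # True: the views share one allocation
print(is_contiguous_run(separate))   # almost certainly False: independent buffers
```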
+ seed = static_cast(gen_cuda->Random64()); + } else if (seed == 0) { + // use random generated seed + std::random_device rd; + seed = rd(); + } // else use `ctx.Attr("seed")` specified seed + } + + const T *x_data = x->data(); + const T *init_h_data = pre_state[0]->data(); + const T *init_c_data = nullptr; + T *out_data = out->mutable_data(ctx.GetPlace()); + T *last_h_data = state[0]->mutable_data(ctx.GetPlace()); + T *last_c_data = nullptr; + if (rnn_mode == CUDNN_LSTM) { + init_c_data = pre_state[1]->data(); + last_c_data = state[1]->mutable_data(ctx.GetPlace()); + } + + bool has_seq_length = ctx.HasInput("SequenceLength"); + std::vector SequenceLength; + if (has_seq_length) { + auto *sequence_length = ctx.Input("SequenceLength"); + SequenceLength = operators::GetDataFromTensor(sequence_length); + } + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + + int seq_length = x->dims()[0]; + int batch_size = x->dims()[1]; + int input_size = x->dims()[2]; + + size_t workspace_size; + size_t reserve_size; + Tensor weight_whole; + T *w_data = nullptr; + auto place = ctx.GetPlace(); + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + auto weight_list = ctx.MultiInput("WeightList"); + auto weight_numel = std::accumulate( + weight_list.begin(), weight_list.end(), 0, + [](int64_t num, const Tensor *t) { return num + t->numel(); }); + bool continuous = + is_continuous>(weight_list); + if (!continuous) { + LOG_FIRST_N(WARNING, 2) + << "If the memory space of the Input WeightList is not continuous, " + "less efficient calculation will be called. Please call " + "flatten_parameters() to make the input memory continuous."; + weight_whole.mutable_data({weight_numel}, place); + weight_to_tensor(place, stream, weight_list, &weight_whole); + w_data = weight_whole.data(); + if (is_test) { // maybe also reset small weights' ptr for training + int offset = 0; + for (size_t i = 0; i < weight_list.size(); ++i) { + size_t len = weight_list[i]->numel(); + auto dim = weight_list[i]->dims(); + const_cast(weight_list[i]) + ->ShareDataWith( + weight_whole.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } + } + } else { + w_data = const_cast(weight_list[0]->data()); + } + + RNNDescriptors rnn(seq_length, batch_size, input_size, hidden_size, + num_layers, dropout_prob, seed, weight_numel, rnn_mode, + is_bidirec, is_test); + rnn.Create(handle, ctx.GetPlace(), SequenceLength, &workspace_size, + &reserve_size, state_out); + + framework::Tensor workspace_data_; + workspace_data_.mutable_data( + {static_cast(workspace_size)}, ctx.GetPlace()); + + auto *reserve_data = reserve->mutable_data( + {static_cast(reserve_size)}, ctx.GetPlace()); + + if (is_test) { + RNNInferece(has_seq_length, handle, seq_length, &rnn, x_data, init_h_data, + init_c_data, w_data, out_data, last_h_data, last_c_data, + &workspace_data_, workspace_size); + } else { + if (!has_seq_length) { + // for train + // This interface is used when the input/output is unpadded. 
+ PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardTraining( + handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), x_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.weight_desc(), w_data, rnn.y_descs(), out_data, + rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, + workspace_data_.data(), workspace_size, reserve_data, + reserve_size)); + } else { +#if CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::cudnnRNNForwardTrainingEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), x_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.weight_desc(), w_data, rnn.y_seq_desc(), out_data, + rnn.last_h_desc(), last_h_data, rnn.last_c_desc(), last_c_data, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, workspace_data_.data(), workspace_size, + reserve_data, reserve_size)); +#else + PADDLE_THROW(platform::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardTrainingEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } + } + } + + void RNNInferece(const bool &has_seq_length, const cudnnHandle_t &handle, + const int &seq_length, RNNDescriptors *rnn, const T *x_data, + const T *init_h_data, const T *init_c_data, const T *w_data, + T *out_data, T *last_h_data, T *last_c_data, + framework::Tensor *workspace_data, + const size_t &workspace_size) const { + if (!has_seq_length) { + // for inference + // This interface is used when the input/output is unpadded. + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInference( + handle, rnn->rnn_desc(), seq_length, rnn->x_descs(), x_data, + rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, + rnn->weight_desc(), w_data, rnn->y_descs(), out_data, + rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data, + workspace_data->data(), workspace_size)); + } else { +#if CUDNN_VERSION >= 7201 + // for inference + // This interface is used when the input/output is padded. 
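Right here the kernel switches to the padded (`*Ex`) cudnn entry points, which differ from the plain ones only in accepting per-sample valid lengths via `SequenceLength`. A small numpy sketch of the seq-major padded layout the op works with; shapes follow the `[seq_len, batch, input_size]` convention from the OpMaker text above, and all concrete numbers are illustrative:

```python
import numpy as np

seq_len, batch, input_size = 6, 3, 4
lengths = np.array([6, 3, 5], dtype=np.int32)     # valid steps per sample

x = np.random.randn(seq_len, batch, input_size).astype(np.float32)
for b, n in enumerate(lengths):
    x[n:, b, :] = 0.0                # zero the padded tail of each sequence

# True for real steps, False for padding: shape [seq_len, batch]
mask = np.arange(seq_len)[:, None] < lengths[None, :]
print(mask.astype(np.int32))
```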
+ PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNForwardInferenceEx( + handle, rnn->rnn_desc(), rnn->x_seq_desc(), x_data, + rnn->init_h_desc(), init_h_data, rnn->init_c_desc(), init_c_data, + rnn->weight_desc(), w_data, rnn->y_seq_desc(), out_data, + rnn->last_h_desc(), last_h_data, rnn->last_c_desc(), last_c_data, + nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, nullptr, + nullptr, workspace_data->data(), workspace_size)); +#else + // CUDNN VERSION has to >=7.2.1 + PADDLE_THROW(platform::errors::Unavailable( + "The padded input is supported by " + "cudnnRNNForwardInferenceEx, but it only works when " + "the version of cudnn is larger than 7.2.1")); +#endif + } + } +}; + +template +class RNNGradCudnnKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + auto *input = ctx.Input("Input"); + auto pre_state = ctx.MultiInput("PreState"); + auto weight_list = ctx.MultiInput("WeightList"); + auto *state_out = ctx.Input("DropoutState"); + auto *reserve = ctx.Input("Reserve"); + auto *out = ctx.Input("Out"); + // auto state = ctx.MultiInput("State"); + + auto *out_grad = ctx.Input(framework::GradVarName("Out")); + auto state_grad = ctx.MultiInput(framework::GradVarName("State")); + + auto *in_grad = ctx.Output(framework::GradVarName("Input")); + auto pre_state_grad = + ctx.MultiOutput(framework::GradVarName("PreState")); + auto weight_grad_list = + ctx.MultiOutput(framework::GradVarName("WeightList")); + + float dropout_prob = ctx.Attr("dropout_prob"); + bool is_bidirec = ctx.Attr("is_bidirec"); + int hidden_size = ctx.Attr("hidden_size"); + int num_layers = ctx.Attr("num_layers"); + auto mode = ctx.Attr("mode"); + cudnnRNNMode_t rnn_mode = CUDNN_LSTM; + if (mode == "LSTM") + rnn_mode = CUDNN_LSTM; + else if (mode == "GRU") + rnn_mode = CUDNN_GRU; + else if (mode == "RNN_RELU") + rnn_mode = CUDNN_RNN_RELU; + else if (mode == "RNN_TANH") + rnn_mode = CUDNN_RNN_TANH; + else + PADDLE_THROW(platform::errors::InvalidArgument( + "rnn_mode should be LSTM, GRU, RNN_RELU or RNN_TANH, but received: " + "%s.", + mode)); + bool is_test = ctx.Attr("is_test"); + int seed = ctx.Attr("seed"); + + auto &dev_ctx = ctx.template device_context(); + auto handle = dev_ctx.cudnn_handle(); + + auto place = ctx.GetPlace(); + auto weight_numel = std::accumulate( + weight_list.begin(), weight_list.end(), 0, + [](int64_t num, const Tensor *t) { return num + t->numel(); }); + bool continuous = + is_continuous>(weight_list); + + auto stream = reinterpret_cast( + ctx.device_context()) + .stream(); + Tensor weight_whole; + T *weight_data = nullptr; + + if (!continuous) { + weight_whole.mutable_data({weight_numel}, place); + weight_to_tensor(place, stream, weight_list, &weight_whole); + weight_data = weight_whole.data(); + } else { + weight_data = const_cast(weight_list[0]->data()); + } + + Tensor weight_grad; + math::SetConstant zero; + weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); + zero(dev_ctx, &weight_grad, static_cast(0.0)); + T *weight_grad_data = weight_grad.data(); + + int offset = 0; + for (size_t i = 0; i < weight_grad_list.size(); ++i) { + size_t len = weight_grad_list[i]->numel(); + auto dim = weight_grad_list[i]->dims(); + weight_grad_list[i] + ->ShareDataWith(weight_grad.Slice(static_cast(offset), + static_cast(offset + len))) + .Resize(dim); + offset += len; + } + + auto *init_h_data = pre_state[0]->data(); + // auto *last_h_data = state[0]->data(); + auto *last_h_grad_data = state_grad[0]->data(); + const T *init_c_data 
= nullptr; + // const T *last_c_data = nullptr; + const T *last_c_grad_data = nullptr; + T *init_h_grad_data = + pre_state_grad.size() != 0 && pre_state_grad[0] + ? pre_state_grad[0]->mutable_data(ctx.GetPlace()) + : nullptr; + T *init_c_grad_data = nullptr; + if (rnn_mode == CUDNN_LSTM) { + init_c_data = pre_state[1]->data(); + // last_c_data = state[1]->data(); + last_c_grad_data = state_grad[1]->data(); + init_c_grad_data = + pre_state_grad.size() != 0 && pre_state_grad[1] + ? pre_state_grad[1]->mutable_data(ctx.GetPlace()) + : nullptr; + } + auto *out_data = out->data(); + auto *out_grad_data = out_grad->data(); + // maybe need check exist + auto *in_grad_data = in_grad->mutable_data(ctx.GetPlace()); + + bool has_seq_length = ctx.HasInput("SequenceLength"); + std::vector SequenceLength; + if (has_seq_length) { + auto *sequence_length = ctx.Input("SequenceLength"); + SequenceLength = operators::GetDataFromTensor(sequence_length); + } + + auto input_dims = input->dims(); + int seq_length = input_dims[0]; + int batch_size = input_dims[1]; + int input_size = input_dims[2]; + + size_t workspace_size; + size_t reserve_size; + + RNNDescriptors rnn(seq_length, batch_size, input_size, hidden_size, + num_layers, dropout_prob, seed, weight_numel, rnn_mode, + is_bidirec, is_test); + + rnn.Create(handle, ctx.GetPlace(), SequenceLength, &workspace_size, + &reserve_size, const_cast(state_out)); + + framework::Tensor workspace_data_; + workspace_data_.mutable_data( + {static_cast(workspace_size)}, ctx.GetPlace()); + const uint8_t *reserve_data = reserve->data(); + + if (!has_seq_length) { + // This interface is used when the input/output is unpadded. + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardData( + handle, rnn.rnn_desc(), seq_length, rnn.y_descs(), out_data, + rnn.y_descs(), out_grad_data, rnn.last_h_desc(), last_h_grad_data, + rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.x_descs(), in_grad_data, rnn.init_h_desc(), init_h_grad_data, + rnn.init_c_desc(), init_c_grad_data, workspace_data_.data(), + workspace_size, const_cast(reserve_data), reserve_size)); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeights( + handle, rnn.rnn_desc(), seq_length, rnn.x_descs(), input->data(), + rnn.init_h_desc(), init_h_data, rnn.y_descs(), out->data(), + workspace_data_.data(), workspace_size, rnn.weight_desc(), + weight_grad_data, const_cast(reserve_data), reserve_size)); + } else { +#if CUDNN_VERSION >= 7201 + // for train + // This interface is used when the input/output is padded. 
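Before the padded backward call below, note how this grad kernel handles weight gradients earlier in the function: it zero-fills one flat `weight_grad` buffer and then re-points each parameter's gradient tensor at a slice of it (`ShareDataWith` + `Slice`), so cudnn can accumulate into a single packed buffer while Python still sees per-parameter tensors. A numpy analogy of views over one accumulator (illustrative sizes):

```python
import numpy as np

param_sizes = [8, 4, 8, 4]                           # per-parameter numels (made up)
flat_grad = np.zeros(sum(param_sizes), np.float32)   # one zeroed accumulator

# Hand out non-owning views over the flat buffer, like ShareDataWith + Slice.
grads, offset = [], 0
for n in param_sizes:
    grads.append(flat_grad[offset:offset + n])
    offset += n

grads[1] += 1.0          # writing through a per-parameter view ...
print(flat_grad[8:12])   # ... shows up in the flat buffer: [1. 1. 1. 1.]
```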
+ PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardDataEx( + handle, rnn.rnn_desc(), rnn.y_seq_desc(), out_data, rnn.y_seq_desc(), + out_grad_data, nullptr, nullptr, rnn.last_h_desc(), last_h_grad_data, + rnn.last_c_desc(), last_c_grad_data, rnn.weight_desc(), weight_data, + rnn.init_h_desc(), init_h_data, rnn.init_c_desc(), init_c_data, + rnn.x_seq_desc(), in_grad_data, rnn.init_h_desc(), init_h_grad_data, + rnn.init_c_desc(), init_c_grad_data, nullptr, nullptr, + workspace_data_.data(), workspace_size, + const_cast(reserve_data), reserve_size)); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::cudnnRNNBackwardWeightsEx( + handle, rnn.rnn_desc(), rnn.x_seq_desc(), input->data(), + rnn.init_h_desc(), init_h_data, rnn.y_seq_desc(), out->data(), + workspace_data_.data(), workspace_size, rnn.weight_desc(), + weight_grad_data, const_cast(reserve_data), reserve_size)); +#else + PADDLE_THROW(platform::errors::Unavailable( + "The padded input of rnn is supported by cudnnRNNBackwardDataEx, " + "cudnnRNNBackwardWeightsEx, but it only works when the version " + "of cudnn is larger than 7.2.1")); +#endif + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(rnn, ops::RNNCudnnKernel, + ops::RNNCudnnKernel); +REGISTER_OP_CUDA_KERNEL(rnn_grad, ops::RNNGradCudnnKernel, + ops::RNNGradCudnnKernel); diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index e983e36895353..e591852cc9580 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -361,6 +361,12 @@ class ScopedDropoutDescriptor { float dropout_prob_, framework::Tensor* dropout_state_, int seed, size_t state_size) { + if (dropout_state_ == nullptr) { // for no dropout or test + PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor( + desc_, handle, 0 /* dropout */, nullptr, 0 /* state_size */, + 0 /* seed */)); + return desc_; + } auto* dropout_state_data = dropout_state_->data(); if (!initialized) { PADDLE_ENFORCE_CUDA_SUCCESS(dynload::cudnnSetDropoutDescriptor( diff --git a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py index 2eec265b5d27a..87bdee8a91d21 100644 --- a/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py +++ b/python/paddle/fluid/tests/unittests/rnn/test_rnn_nets.py @@ -93,10 +93,14 @@ def test_with_input_lengths(self): np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + def test_predict(self): + predict_test_util(self.place, "SimpleRNN") + def runTest(self): self.test_with_initial_state() self.test_with_zero_state() self.test_with_input_lengths() + self.test_predict() class TestGRU(unittest.TestCase): @@ -175,10 +179,14 @@ def test_with_input_lengths(self): np.testing.assert_allclose(y1, y2.numpy(), atol=1e-8, rtol=1e-5) np.testing.assert_allclose(h1, h2.numpy(), atol=1e-8, rtol=1e-5) + def test_predict(self): + predict_test_util(self.place, "GRU") + def runTest(self): self.test_with_initial_state() self.test_with_zero_state() self.test_with_input_lengths() + self.test_predict() class TestLSTM(unittest.TestCase): @@ -258,61 +266,7 @@ def test_with_input_lengths(self): np.testing.assert_allclose(c1, c2.numpy(), atol=1e-8, rtol=1e-5) def test_predict(self): - place = paddle.set_device(self.place) - paddle.seed(123) - np.random.seed(123) - - class Net(paddle.nn.Layer): - def __init__(self): - super(Net, 
self).__init__() - self.rnn1 = paddle.nn.LSTM( - 16, 32, 2, direction="bidirectional", dropout=0.1) - - def forward(self, input): - return self.rnn1(input) - - x = paddle.randn((4, 10, 16)) - x.stop_gradient = False - seq_len = paddle.to_tensor(np.array([10, 6, 8, 5])) - mask = sequence_mask(seq_len, maxlen=10, dtype=x.dtype) - mask = paddle.unsqueeze(mask, [2]) - rnn = Net() - y, (h, c) = rnn(x) - y = y * mask - loss = paddle.mean(y) - loss.backward() - optimizer = paddle.optimizer.Adam( - learning_rate=0.1, parameters=rnn.parameters()) - optimizer.step() - rnn.eval() - y, (h, c) = rnn(x) - # `jit.to_static` would include a train_program, eval mode might cause - # some errors currently, such as dropout grad op gets `is_test == True`. - rnn.train() - - rnn = paddle.jit.to_static( - rnn, - [paddle.static.InputSpec( - shape=[None, None, 16], dtype=x.dtype)]) - paddle.jit.save(rnn, "./inference/lstm_infer") - - paddle.enable_static() - - new_scope = paddle.static.Scope() - with paddle.static.scope_guard(new_scope): - exe = paddle.static.Executor(place) - [inference_program, feed_target_names, - fetch_targets] = paddle.static.load_inference_model( - dirname="./inference", - executor=exe, - model_filename="lstm_infer.pdmodel", - params_filename="lstm_infer.pdiparams") - results = exe.run(inference_program, - feed={feed_target_names[0]: x.numpy()}, - fetch_list=fetch_targets) - np.testing.assert_equal( - y.numpy(), results[0]) # eval results equal predict results - paddle.disable_static() + predict_test_util(self.place, "LSTM") def runTest(self): self.test_with_initial_state() @@ -321,6 +275,66 @@ def runTest(self): self.test_predict() +def predict_test_util(place, mode): + place = paddle.set_device(place) + paddle.seed(123) + np.random.seed(123) + + class Net(paddle.nn.Layer): + def __init__(self): + super(Net, self).__init__() + self.rnn = getattr(paddle.nn, mode)(16, + 32, + 2, + direction="bidirectional", + dropout=0.1) + + def forward(self, input): + return self.rnn(input) + + x = paddle.randn((4, 10, 16)) + x.stop_gradient = False + seq_len = paddle.to_tensor(np.array([10, 6, 8, 5])) + mask = sequence_mask(seq_len, maxlen=10, dtype=x.dtype) + mask = paddle.unsqueeze(mask, [2]) + rnn = Net() + y, _ = rnn(x) + y = y * mask + loss = paddle.mean(y) + loss.backward() + optimizer = paddle.optimizer.Adam( + learning_rate=0.1, parameters=rnn.parameters()) + optimizer.step() + rnn.eval() + y, _ = rnn(x) + # `jit.to_static` would include a train_program, eval mode might cause + # some errors currently, such as dropout grad op gets `is_test == True`. 
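As an aside before the save/load round-trip continues below: `predict_test_util` masks the padded tail of each sequence with `sequence_mask` before averaging the loss. For reference, a self-contained numpy equivalent of the mask it builds (illustrative, not the Paddle implementation):

```python
import numpy as np

def sequence_mask_np(lengths, maxlen):
    # Row i holds lengths[i] ones followed by zeros -- the same [batch, maxlen]
    # 0/1 pattern the test gets from paddle's sequence_mask.
    lengths = np.asarray(lengths)
    return (np.arange(maxlen)[None, :] < lengths[:, None]).astype(np.float32)

mask = sequence_mask_np([10, 6, 8, 5], maxlen=10)   # shape [4, 10]
print(mask.sum(axis=1))                             # [10. 6. 8. 5.]
mask3 = mask[:, :, None]   # unsqueeze to [4, 10, 1] so it broadcasts over features
```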
+ rnn.train() + + rnn = paddle.jit.to_static( + rnn, [paddle.static.InputSpec( + shape=[None, None, 16], dtype=x.dtype)]) + paddle.jit.save(rnn, "./inference/%s_infer" % mode) + + paddle.enable_static() + + new_scope = paddle.static.Scope() + with paddle.static.scope_guard(new_scope): + exe = paddle.static.Executor(place) + [inference_program, feed_target_names, + fetch_targets] = paddle.static.load_inference_model( + dirname="./inference", + executor=exe, + model_filename="%s_infer.pdmodel" % mode, + params_filename="%s_infer.pdiparams" % mode) + results = exe.run(inference_program, + feed={feed_target_names[0]: x.numpy()}, + fetch_list=fetch_targets) + np.testing.assert_equal( + y.numpy(), results[0]) # eval results equal predict results + paddle.disable_static() + + def load_tests(loader, tests, pattern): suite = unittest.TestSuite() devices = ["cpu", "gpu"] if paddle.fluid.is_compiled_with_cuda() \ diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index 33904524862d4..ee989f27ebf72 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -990,7 +990,6 @@ def __init__(self, self.could_use_cudnn &= direction != "backward" self.could_use_cudnn &= len(self.parameters()) == num_layers * 4 * ( 2 if direction == "bidirectional" else 1) - self.could_use_cudnn &= mode == "LSTM" # currently only support LSTM # Expose params as RNN's attribute, which can make it compatible when # replacing small ops composed rnn with cpp rnn kernel. @@ -1062,22 +1061,18 @@ def flatten_parameters(self): def _cudnn_impl(self, inputs, initial_states, sequence_length): if not self.time_major: inputs = paddle.tensor.transpose(inputs, [1, 0, 2]) - # unify LSTM/GRU/SimpleRNN later, currently only support LSTM - # TODO(guosheng): use `core.ops.cudnn_lstm` in dygraph mode if support - # specify output, since `dropout_state` should be a persistable tensor - # rather than a temporary on. 
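The rnn.py hunk here retires the LSTM-only `cudnn_lstm` path in favor of the generic `rnn` op, whose `State` output is a list with one tensor per state component: two for LSTM (hidden and cell), one for GRU and SimpleRNN, as the `tuple(state) if len(state) > 1 else state[0]` return below makes explicit. A usage sketch of the resulting Python API, with shapes as described by the OpMaker text above (batch-major input, i.e. the default `time_major=False`, assumed):

```python
import paddle

x = paddle.randn([4, 10, 16])        # [batch, time, input_size]

lstm = paddle.nn.LSTM(16, 32, 2)
y, (h, c) = lstm(x)                  # LSTM: state has two components

gru = paddle.nn.GRU(16, 32, 2)
y, h = gru(x)                        # GRU: a single state tensor
# h: [num_layers * num_directions, batch, hidden_size] per the OpMaker text
```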
out = self._helper.create_variable_for_type_inference(inputs.dtype) - last_h = self._helper.create_variable_for_type_inference(inputs.dtype) - last_c = self._helper.create_variable_for_type_inference(inputs.dtype) + state = [ + self._helper.create_variable_for_type_inference(inputs.dtype) + for i in range(self.state_components) + ] reserve = self._helper.create_variable_for_type_inference( dtype=fluid.core.VarDesc.VarType.UINT8, stop_gradient=True) inputs = { 'Input': inputs, - # 'W': self._flat_weight, # would be unused_var 'WeightList': self._all_weights, - 'InitH': initial_states[0], - 'InitC': initial_states[1], + 'PreState': initial_states, 'SequenceLength': sequence_length } attrs = { @@ -1086,23 +1081,22 @@ def _cudnn_impl(self, inputs, initial_states, sequence_length): 'input_size': self.input_size, 'hidden_size': self.hidden_size, 'num_layers': self.num_layers, + 'mode': self.mode, 'is_test': not self.training } outputs = { 'Out': out, - 'LastH': last_h, - 'LastC': last_c, + 'State': state, 'Reserve': reserve, - 'StateOut': self._dropout_state, + 'DropoutState': self._dropout_state, } self._helper.append_op( - type="cudnn_lstm", inputs=inputs, outputs=outputs, attrs=attrs) + type="rnn", inputs=inputs, outputs=outputs, attrs=attrs) out = paddle.tensor.transpose(out, [1, 0, 2]) if not self.time_major else out - states = (last_h, last_c) - return out, states + return out, tuple(state) if len(state) > 1 else state[0] def forward(self, inputs, initial_states=None, sequence_length=None): batch_index = 1 if self.time_major else 0 From f41104efa3b4f6422f80b321df45f540f2d893b6 Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Tue, 3 Nov 2020 10:09:41 +0800 Subject: [PATCH 097/185] fix compile out of memory temporary (#28346) --- cmake/external/pybind11.cmake | 1 + paddle/scripts/paddle_build.bat | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 8722b9003b7ef..05cc77f23baaa 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -34,6 +34,7 @@ ExternalProject_Add( "${PYBIND_DOWNLOAD_CMD}" PREFIX ${PYBIND_PREFIX_DIR} SOURCE_DIR ${PYBIND_SOURCE_DIR} + UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 6725abefa8c2b..296deed1c8e6e 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -248,7 +248,8 @@ echo Build third_party successfully! 
set build_times=1 :build_paddle echo Build Paddle the %build_times% time: -msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln +::msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln +msbuild /m:%PARALLEL_PROJECT_COUNT% /p:Configuration=Release /verbosity:minimal paddle.sln if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 if %build_times% GTR 1 ( From 6115c14fca7c7cc4dd4f25f45aff5dc8e88c0752 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Mon, 2 Nov 2020 20:47:03 -0600 Subject: [PATCH 098/185] Pool2d cuda kernel supports fp16 (#28316) * pool2d cuda kernel supports fp16 * fix compile issue of template * add ut --- paddle/fluid/operators/math/pooling.cu | 121 ++++++++++++------ paddle/fluid/operators/math/pooling.h | 3 +- paddle/fluid/operators/pool_op.cc | 7 +- paddle/fluid/operators/pool_op.cu.cc | 16 ++- paddle/fluid/operators/pool_op.h | 3 +- .../fluid/tests/unittests/test_pool2d_op.py | 42 ++++++ .../fluid/tests/unittests/test_pool3d_op.py | 26 ++++ 7 files changed, 166 insertions(+), 52 deletions(-) diff --git a/paddle/fluid/operators/math/pooling.cu b/paddle/fluid/operators/math/pooling.cu index 22164131468a4..b64dbb771a339 100644 --- a/paddle/fluid/operators/math/pooling.cu +++ b/paddle/fluid/operators/math/pooling.cu @@ -14,6 +14,7 @@ limitations under the License. */ #include #include + #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/cuda_primitives.h" @@ -126,7 +127,7 @@ __global__ void KernelPool2DGrad( phend = min(h_offset / stride_height + 1, output_height); pwend = min(w_offset / stride_width + 1, output_width); } - T gradient = 0; + T gradient = static_cast(0.0); T input = input_data[index]; int output_stride; @@ -264,12 +265,12 @@ void Pool2dDirectCUDAFunctor::operator()( } /* -* Tensors are in NCHW or NHWC format. -* Ksize, strides are two elements. These two elements represent height -* and width, respectively. -* Paddings are four elements. These four elements represent height_up, -* height_down, width_left and width_right, respectively. -*/ + * Tensors are in NCHW or NHWC format. + * Ksize, strides are two elements. These two elements represent height + * and width, respectively. + * Paddings are four elements. These four elements represent height_up, + * height_down, width_left and width_right, respectively. + */ template class Pool2dFunctor { public: @@ -351,12 +352,12 @@ class Pool2dFunctor { } }; /* -* Tensors are in NCHW or NHWC format. -* Ksize, strides are two elements. These two elements represent height -* and width, respectively. -* Paddings are four elements. These four elements represent height_up, -* height_down, width_left and width_right, respectively. -*/ + * Tensors are in NCHW or NHWC format. + * Ksize, strides are two elements. These two elements represent height + * and width, respectively. + * Paddings are four elements. These four elements represent height_up, + * height_down, width_left and width_right, respectively. + */ template class Pool2dGradFunctor { public: @@ -448,12 +449,12 @@ class Pool2dGradFunctor { }; /* -* Tensors are in NCHW or NHWC format. -* Ksize, strides are two elements. These two elements represent height -* and width, respectively. -* Paddings are four elements. 
These four elements represent height_up, -* height_down, width_left and width_right, respectively. -*/ + * Tensors are in NCHW or NHWC format. + * Ksize, strides are two elements. These two elements represent height + * and width, respectively. + * Paddings are four elements. These four elements represent height_up, + * height_down, width_left and width_right, respectively. + */ template class MaxPool2dGradFunctor { public: @@ -549,6 +550,8 @@ template class Pool2dDirectCUDAFunctor, template class MaxPool2dGradFunctor; template class MaxPool2dGradFunctor; +template class MaxPool2dGradFunctor; template class Pool2dFunctor, float>; @@ -571,6 +574,23 @@ template class Pool2dGradFunctor, double>; +template class Pool2dFunctor< + platform::CUDADeviceContext, + paddle::operators::math::MaxPool, + paddle::platform::float16>; +template class Pool2dFunctor< + platform::CUDADeviceContext, + paddle::operators::math::AvgPool, + paddle::platform::float16>; +template class Pool2dGradFunctor< + platform::CUDADeviceContext, + paddle::operators::math::MaxPoolGrad, + paddle::platform::float16>; +template class Pool2dGradFunctor< + platform::CUDADeviceContext, + paddle::operators::math::AvgPoolGrad, + paddle::platform::float16>; + template __global__ void KernelPool3D( const int nthreads, const T* input_data, const int channels, @@ -712,7 +732,7 @@ __global__ void KernelPool3DGrad( pwend = min((w_offset) / stride_width + 1, output_width); } - T gradient = 0; + T gradient = static_cast(0.0); T input = input_data[index]; int output_stride; @@ -848,13 +868,13 @@ __global__ void KernelMaxPool3DGrad( } /* -* Tensors are in NCDHW or NDHWC format. -* Ksize, strides, paddings are three elements. These three elements represent -* depth, height and width, respectively. -* Paddings are six elements. These six elements represent depth_forth, -* depth_back, -* height_up, height_down, width_left and width_right, respectively. -*/ + * Tensors are in NCDHW or NDHWC format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + * Paddings are six elements. These six elements represent depth_forth, + * depth_back, + * height_up, height_down, width_left and width_right, respectively. + */ template class Pool3dFunctor { public: @@ -952,13 +972,13 @@ class Pool3dFunctor { }; /* -* Tensors are in NCDHW or NDHWC format. -* Ksize, strides, paddings are three elements. These three elements represent -* depth, height and width, respectively. -* Paddings are six elements. These six elements represent depth_forth, -* depth_back, -* height_up, height_down, width_left and width_right, respectively. -*/ + * Tensors are in NCDHW or NDHWC format. + * Ksize, strides, paddings are three elements. These three elements represent + * depth, height and width, respectively. + * Paddings are six elements. These six elements represent depth_forth, + * depth_back, + * height_up, height_down, width_left and width_right, respectively. + */ template class Pool3dGradFunctor { public: @@ -1064,13 +1084,13 @@ class Pool3dGradFunctor { }; /* -* tensors are in NCDHW or NDHWC format. -* Ksize, strides, paddings are three elements. These three elements represent -* depth, height and width, respectively. -* Paddings are six elements. These six elements represent depth_forth, -* depth_back, -* height_up, height_down, width_left and width_right, respectively. -*/ + * tensors are in NCDHW or NDHWC format. + * Ksize, strides, paddings are three elements. 
These three elements represent + * depth, height and width, respectively. + * Paddings are six elements. These six elements represent depth_forth, + * depth_back, + * height_up, height_down, width_left and width_right, respectively. + */ template class MaxPool3dGradFunctor { public: @@ -1174,6 +1194,8 @@ class MaxPool3dGradFunctor { template class MaxPool3dGradFunctor; template class MaxPool3dGradFunctor; +template class MaxPool3dGradFunctor; template class Pool3dFunctor, float>; @@ -1196,6 +1218,23 @@ template class Pool3dGradFunctor, double>; +template class Pool3dFunctor< + platform::CUDADeviceContext, + paddle::operators::math::MaxPool, + paddle::platform::float16>; +template class Pool3dFunctor< + platform::CUDADeviceContext, + paddle::operators::math::AvgPool, + paddle::platform::float16>; +template class Pool3dGradFunctor< + platform::CUDADeviceContext, + paddle::operators::math::MaxPoolGrad, + paddle::platform::float16>; +template class Pool3dGradFunctor< + platform::CUDADeviceContext, + paddle::operators::math::AvgPoolGrad, + paddle::platform::float16>; + template __global__ void KernelMaxPool2dWithIdx( const int nthreads, const T1* input_data, const int channels, diff --git a/paddle/fluid/operators/math/pooling.h b/paddle/fluid/operators/math/pooling.h index 572295f138d59..5a6ae224789a2 100644 --- a/paddle/fluid/operators/math/pooling.h +++ b/paddle/fluid/operators/math/pooling.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -56,7 +57,7 @@ class MaxPoolGrad { public: DEVICE inline void compute(const T& x, const T& y, const T& dy, T scale, T* dx) { - *dx += dy * (x == y); + *dx += dy * static_cast(x == y); } }; diff --git a/paddle/fluid/operators/pool_op.cc b/paddle/fluid/operators/pool_op.cc index ba468b7960557..5b0980a98513b 100644 --- a/paddle/fluid/operators/pool_op.cc +++ b/paddle/fluid/operators/pool_op.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/pool_op.h" + #include #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cudnn_helper.h" @@ -219,11 +220,7 @@ framework::OpKernelType PoolOpGrad::GetExpectedKernelType( #endif auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X"); - if (input_data_type == framework::proto::VarType::FP16) { - PADDLE_ENFORCE_EQ(library_, framework::LibraryType::kCUDNN, - platform::errors::InvalidArgument( - "Float16 can only be used when CUDNN is used")); - } + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout_, library_); } diff --git a/paddle/fluid/operators/pool_op.cu.cc b/paddle/fluid/operators/pool_op.cu.cc index 37bc14e2cbb34..6b1e9f93033aa 100644 --- a/paddle/fluid/operators/pool_op.cu.cc +++ b/paddle/fluid/operators/pool_op.cu.cc @@ -18,16 +18,24 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( pool2d, ops::PoolKernel, - ops::PoolKernel); + ops::PoolKernel, + ops::PoolKernel); REGISTER_OP_CUDA_KERNEL( pool2d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); + ops::PoolGradKernel, + ops::PoolGradKernel); REGISTER_OP_CUDA_KERNEL( pool3d, ops::PoolKernel, - ops::PoolKernel); + ops::PoolKernel, + ops::PoolKernel); REGISTER_OP_CUDA_KERNEL( pool3d_grad, ops::PoolGradKernel, - ops::PoolGradKernel); + ops::PoolGradKernel, + ops::PoolGradKernel); diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 677c724069cf4..71bef11b67225 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -17,6 +17,7 @@ limitations under the License. */ #include #include #include + #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/math_function.h" @@ -257,7 +258,7 @@ class PoolGradKernel : public framework::OpKernel { if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); paddle::operators::math::SetConstant set_constant; - set_constant(dev_ctx, in_x_grad, 0.0); + set_constant(dev_ctx, in_x_grad, static_cast(0.0)); switch (ksize.size()) { case 2: { diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index 8553fa8b99a92..e6d41902a7c6d 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -475,6 +475,41 @@ def test_check_grad(self): globals()[cls_name] = TestCUDNNFp16Case +def create_test_fp16_class(parent, check_grad=True): + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestFp16Case(parent): + def init_kernel_type(self): + self.use_cudnn = False + self.dtype = np.float16 + + def test_check_output(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place( + place, + atol=1e-3, + check_dygraph=(self.use_mkldnn == False)) + + def test_check_grad(self): + # TODO(wangzhongpu): support mkldnn op in dygraph mode + place = core.CUDAPlace(0) + if core.is_float16_supported( + place) and self.pool_type != "max" and check_grad: + self.check_grad_with_place( + place, + set(['X']), + 'Out', + max_relative_error=0.07, + check_dygraph=(self.use_mkldnn == False)) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op") + TestFp16Case.__name__ = cls_name + globals()[cls_name] = TestFp16Case + + create_test_cudnn_fp16_class(TestPool2D_Op) create_test_cudnn_fp16_class(TestCase1, 
check_grad=False) create_test_cudnn_fp16_class(TestCase2) @@ -482,6 +517,13 @@ def test_check_grad(self): create_test_cudnn_fp16_class(TestCase4) create_test_cudnn_fp16_class(TestCase5) +create_test_fp16_class(TestPool2D_Op) +create_test_fp16_class(TestCase1, check_grad=False) +create_test_fp16_class(TestCase2) +create_test_fp16_class(TestCase3) +create_test_fp16_class(TestCase4) +create_test_fp16_class(TestCase5) + #--------------------test pool2d use ceil mode-------------------- diff --git a/python/paddle/fluid/tests/unittests/test_pool3d_op.py b/python/paddle/fluid/tests/unittests/test_pool3d_op.py index fade1691210a4..2d20cfc4cfc9b 100644 --- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py @@ -405,6 +405,25 @@ def test_check_output(self): globals()[cls_name] = TestCUDNNFp16Case +def create_test_fp16_class(parent): + @unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") + class TestFp16Case(parent): + def init_kernel_type(self): + self.use_cudnn = False + self.dtype = np.float16 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + place = core.CUDAPlace(0) + if core.is_float16_supported(place): + self.check_output_with_place(place, atol=1e-2) + + cls_name = "{0}_{1}".format(parent.__name__, "Fp16Op") + TestFp16Case.__name__ = cls_name + globals()[cls_name] = TestFp16Case + + create_test_cudnn_fp16_class(TestPool3D_Op) create_test_cudnn_fp16_class(TestCase1) create_test_cudnn_fp16_class(TestCase2) @@ -412,6 +431,13 @@ def test_check_output(self): create_test_cudnn_fp16_class(TestCase4) create_test_cudnn_fp16_class(TestCase5) +create_test_fp16_class(TestPool3D_Op) +create_test_fp16_class(TestCase1) +create_test_fp16_class(TestCase2) +create_test_fp16_class(TestCase3) +create_test_fp16_class(TestCase4) +create_test_fp16_class(TestCase5) + # ---- test ceil mode ------ def create_test_cudnn_use_ceil_class(parent): From 17db031a058dc8ff2b5f791140e5698d3c368dbb Mon Sep 17 00:00:00 2001 From: Double_V Date: Tue, 3 Nov 2020 13:10:19 +0800 Subject: [PATCH 099/185] fix pool bug, test=develop (#28359) --- python/paddle/nn/functional/pooling.py | 6 +++--- python/paddle/nn/layer/pooling.py | 2 ++ 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/python/paddle/nn/functional/pooling.py b/python/paddle/nn/functional/pooling.py index 73e3cb31221f1..40166f4d36e94 100755 --- a/python/paddle/nn/functional/pooling.py +++ b/python/paddle/nn/functional/pooling.py @@ -230,7 +230,7 @@ def avg_pool1d(x, x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', False, 'strides', stride, 'paddings', padding, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', not exclusive, 'data_format', + 'use_mkldnn', False, 'exclusive', exclusive, 'data_format', data_format) return squeeze(output, [2]) @@ -338,7 +338,7 @@ def avg_pool2d(x, x, 'pooling_type', 'avg', 'ksize', kernel_size, 'global_pooling', False, 'padding_algorithm', padding_algorithm, 'strides', stride, 'paddings', padding, 'use_cudnn', True, 'ceil_mode', ceil_mode, - 'use_mkldnn', False, 'exclusive', not exclusive, 'data_format', + 'use_mkldnn', False, 'exclusive', exclusive, 'data_format', data_format) if divisor_override is None: return output @@ -452,7 +452,7 @@ def avg_pool3d(x, x, 'pooling_type', 'avg', 'ksize', kernel_size, 'strides', stride, 'paddings', padding, 'global_pooling', False, 'padding_algorithm', padding_algorithm, 'use_cudnn', True, 
'ceil_mode', ceil_mode,
- 'use_mkldnn', False, 'exclusive', not exclusive, 'data_format',
+ 'use_mkldnn', False, 'exclusive', exclusive, 'data_format',
 data_format)
 if divisor_override is None:
 return output
diff --git a/python/paddle/nn/layer/pooling.py b/python/paddle/nn/layer/pooling.py
index 0b0a4909f8550..07cd0f61aa716 100755
--- a/python/paddle/nn/layer/pooling.py
+++ b/python/paddle/nn/layer/pooling.py
@@ -503,6 +503,7 @@ def forward(self, x):
 stride=self.stride,
 padding=self.padding,
 return_mask=self.return_mask,
+ ceil_mode=self.ceil_mode,
 data_format=self.data_format,
 name=self.name)
@@ -594,6 +595,7 @@ def forward(self, x):
 stride=self.stride,
 padding=self.padding,
 return_mask=self.return_mask,
+ ceil_mode=self.ceil_mode,
 data_format=self.data_format,
 name=self.name)

From 953302d9eb65786741063dd9b5fbc9b46f64a44f Mon Sep 17 00:00:00 2001
From: chen zhiyu
Date: Tue, 3 Nov 2020 14:17:15 +0800
Subject: [PATCH 100/185] add musl docker build script (#28027)

* add musl docker build script

* rm space
test=document_fix

* fix some docs and types errors
test=document_fix
---
 paddle/scripts/musl_build/Dockerfile      |  15 ++++
 paddle/scripts/musl_build/README.md       |  90 +++++++++++++++++++
 paddle/scripts/musl_build/build_docker.sh |  50 +++++++++++
 paddle/scripts/musl_build/build_inside.sh |  64 ++++++++++++++
 paddle/scripts/musl_build/build_paddle.sh | 101 ++++++++++++++++++++++
 paddle/scripts/musl_build/config.sh       |  24 +++++
 6 files changed, 344 insertions(+)
 create mode 100644 paddle/scripts/musl_build/Dockerfile
 create mode 100644 paddle/scripts/musl_build/README.md
 create mode 100755 paddle/scripts/musl_build/build_docker.sh
 create mode 100755 paddle/scripts/musl_build/build_inside.sh
 create mode 100755 paddle/scripts/musl_build/build_paddle.sh
 create mode 100755 paddle/scripts/musl_build/config.sh

diff --git a/paddle/scripts/musl_build/Dockerfile b/paddle/scripts/musl_build/Dockerfile
new file mode 100644
index 0000000000000..649f39b08932b
--- /dev/null
+++ b/paddle/scripts/musl_build/Dockerfile
@@ -0,0 +1,15 @@
+FROM python:3.7-alpine3.10
+
+WORKDIR /root
+
+RUN apk update
+
+RUN apk add --no-cache \
+    g++ gfortran make cmake patchelf git \
+    linux-headers \
+    freetype-dev libjpeg-turbo-dev zlib-dev
+
+RUN apk add --no-cache --force-overwrite \
+    lapack-dev openblas-dev
+
+ENTRYPOINT [ "/bin/sh" ]
diff --git a/paddle/scripts/musl_build/README.md b/paddle/scripts/musl_build/README.md
new file mode 100644
index 0000000000000..99aabfbabb793
--- /dev/null
+++ b/paddle/scripts/musl_build/README.md
@@ -0,0 +1,90 @@
+Paddle for Linux-musl Usage Guide
+===========================================
+
+# introduction
+Paddle can be built for linux-musl systems such as Alpine, and used in libos-like SGX TEE environments. The commercial TEE Scone and the community-maintained TEE Occlum are currently supported. We are also working on supporting the open source TEE Graphene.
+
+
+# build automatically
+1. clone the paddle source from github
+
+```bash
+git clone https://github.com/PaddlePaddle/Paddle.git
+```
+
+2. setup the build directory
+
+```bash
+# enter paddle directory
+cd ./Paddle
+
+# create and enter the building directory
+mkdir -p build && cd build
+```
+
+3. build the docker image for compiling. Use the environment variables HTTP_PROXY/HTTPS_PROXY for proxy setup.
+
+```bash
+# setup proxy address
+export HTTP_PROXY='http://127.0.0.1:8080'
+export HTTPS_PROXY='https://127.0.0.1:8080'
+
+# invoke build script
+../paddle/scripts/musl_build/build_docker.sh
+```
+
+4. compile paddle in the previously built docker. The proxy setup is the same as in the previous step.
+The output wheel package will be saved to the "dist" directory.
+
+```bash
+# setup proxy address
+export HTTP_PROXY='http://127.0.0.1:8080'
+export HTTPS_PROXY='https://127.0.0.1:8080'
+
+# invoke build paddle script
+../paddle/scripts/musl_build/build_paddle.sh
+
+# find output wheel package
+ls dist/*.whl
+```
+
+# build paddle manually
+
+1. start up the building docker, and enter the shell in the container
+```bash
+# checkout paddle source code
+git clone https://github.com/PaddlePaddle/Paddle.git
+
+# enter paddle directory
+cd ./Paddle
+
+# build docker image
+../paddle/scripts/musl_build/build_docker.sh
+
+# enter the container interactive shell
+BUILD_AUTO=0 ../paddle/scripts/musl_build/build_paddle.sh
+```
+
+2. type commands to compile the source manually
+```sh
+# compile paddle by commands
+# paddle is mounted to the /paddle directory
+# working directory is /root
+mkdir build && cd build
+
+# install python requirements
+pip install -r /paddle/python/requirements.txt
+
+# configure the project with cmake
+cmake /paddle -DWITH_MUSL=ON -DWITH_CRYPTO=OFF -DWITH_MKL=OFF -DWITH_GPU=OFF -DWITH_TESTING=OFF
+
+# run make to build the project
+make
+```
+
+# files
+- build_docker.sh: docker build script
+- build_paddle.sh: paddle build script
+- build_inside.sh: invoked by build_paddle.sh inside the docker to do the compiling.
+- config.sh: build config script for configuring compile options.
+- Dockerfile: docker build definition file.
diff --git a/paddle/scripts/musl_build/build_docker.sh b/paddle/scripts/musl_build/build_docker.sh
new file mode 100755
index 0000000000000..7abb1031b5282
--- /dev/null
+++ b/paddle/scripts/musl_build/build_docker.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+CUR_DIR=$(dirname "${BASH_SOURCE[0]}")
+CUR_DIR=$(realpath "$CUR_DIR")
+
+# shellcheck disable=1090
+source "$CUR_DIR/config.sh"
+
+# exit when any command fails
+set -e
+
+declare -a ENV_ARGS
+if [ "$HTTP_PROXY" ]; then
+    ENV_ARGS+=("--build-arg" "http_proxy=$HTTP_PROXY")
+    echo "using http proxy: $HTTP_PROXY"
+fi
+
+if [ "$HTTPS_PROXY" ]; then
+    ENV_ARGS+=("--build-arg" "https_proxy=$HTTPS_PROXY")
+    echo "using https proxy: $HTTPS_PROXY"
+fi
+
+echo "clean up docker images: $BUILD_IMAGE"
+docker rmi -f "$BUILD_IMAGE"
+
+echo "build docker image: $BUILD_IMAGE"
+
+# shellcheck disable=2086
+docker build \
+    -t "$BUILD_IMAGE" \
+    -f "$CUR_DIR/Dockerfile" \
+    --rm=false \
+    --network host \
+    ${ENV_ARGS[*]} \
+    --output type=tar,dest=build.tar \
+    .
diff --git a/paddle/scripts/musl_build/build_inside.sh b/paddle/scripts/musl_build/build_inside.sh
new file mode 100755
index 0000000000000..65407c7d433ba
--- /dev/null
+++ b/paddle/scripts/musl_build/build_inside.sh
@@ -0,0 +1,64 @@
+#!/bin/sh
+
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +PADDLE_DIR=/paddle +BUILD_DIR=$PWD + +echo "paddle: $PADDLE_DIR" +echo "python: $PYTHON_VERSION" +echo "http_proxy: $HTTP_PROXY" +echo "https_proxy: $HTTPS_PROXY" + +# exit when any command fails +set -e + +echo "create build dir: $BUILD_DIR" +mkdir -p "$BUILD_DIR" + +if [ "$HTTP_PROXY" ]; then + git config --global http.proxy "$HTTP_PROXY" +fi + +if [ "$HTTP_PROXY" ]; then + git config --global https.proxy "$HTTPS_PROXY" +fi + +PIP_ARGS="" +if [ "$PIP_INDEX" ]; then + PIP_DOMAIN=$(echo "$PIP_INDEX" | awk -F/ '{print $3}') + PIP_ARGS="-i $PIP_INDEX --trusted-host $PIP_DOMAIN" + echo "pip index: $PIP_INDEX" +fi + +PYTHON_REQS=$PADDLE_DIR/python/requirements.txt +echo "install python requirements: $PYTHON_REQS" + +# shellcheck disable=2086 +pip install $PIP_ARGS --timeout 300 --no-cache-dir -r $PYTHON_REQS + +echo "configure with cmake" +cmake "$PADDLE_DIR" \ + -DWITH_MUSL=ON \ + -DWITH_CRYPTO=OFF \ + -DWITH_MKL=OFF \ + -DWITH_GPU=OFF + +echo "compile with make: $*" +# shellcheck disable=2068 +make $@ + +echo "save python dist directory to /output" +cp -r python/dist /output/ diff --git a/paddle/scripts/musl_build/build_paddle.sh b/paddle/scripts/musl_build/build_paddle.sh new file mode 100755 index 0000000000000..ecec9182dc248 --- /dev/null +++ b/paddle/scripts/musl_build/build_paddle.sh @@ -0,0 +1,101 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +CUR_DIR=$(dirname "${BASH_SOURCE[0]}") +CUR_DIR=$(realpath "$CUR_DIR") + +# shellcheck disable=1090 +source "$CUR_DIR/config.sh" + +# exit when any command fails +set -e + +# check build mode auto/man +BUILD_AUTO=${BUILD_AUTO:-1} + + +declare -a ENV_ARGS +if [ "$HTTP_PROXY" ]; then + ENV_ARGS+=("--env" "HTTP_PROXY=$HTTP_PROXY") + echo "using http proxy: $HTTP_PROXY" +fi + +if [ "$HTTPS_PROXY" ]; then + ENV_ARGS+=("--env" "HTTPS_PROXY=$HTTPS_PROXY") + echo "using https proxy: $HTTPS_PROXY" +fi + +if [ "$PIP_INDEX" ]; then + ENV_ARGS+=("--env" "PIP_INDEX=$PIP_INDEX") +fi + +echo "compile paddle in docker" +echo "docker image: $BUILD_IMAGE" + +BUILD_ID=$(docker images -q "$BUILD_IMAGE") +if [ ! "$BUILD_ID" ]; then + echo "docker image is not existed, and try to build." 
+ + "$CUR_DIR/build_docker.sh" +fi + +BUILD_NAME="paddle-musl-build-$(date +%Y%m%d-%H%M%S)" +echo "container name: $BUILD_NAME" + +MOUNT_DIR="/paddle" +echo "mount paddle: $PADDLE_DIR => $MOUNT_DIR" + + +if [ "$BUILD_AUTO" -eq "1" ]; then + echo "enter automatic build mode" + + # no exit when fails + set +e + + BUILD_SCRIPT=$MOUNT_DIR/paddle/scripts/musl_build/build_inside.sh + echo "build script: $BUILD_SCRIPT" + + OUTPUT_DIR="output" + mkdir -p $OUTPUT_DIR + OUTPUT_DIR=$(realpath $OUTPUT_DIR) + echo "build output: $OUTPUT_DIR" + + # shellcheck disable=2086,2068 + docker run \ + -v "$PADDLE_DIR":"$MOUNT_DIR" \ + -v "$OUTPUT_DIR":/output \ + --rm \ + --workdir /root \ + --network host \ + ${ENV_ARGS[*]} \ + --name "$BUILD_NAME" \ + "$BUILD_IMAGE" \ + "$BUILD_SCRIPT" $@ + + echo "list output: $OUTPUT_DIR" + ls "$OUTPUT_DIR" +else + echo "enter manual build mode" + + # shellcheck disable=2086 + docker run \ + -it \ + -v "$PADDLE_DIR":"$MOUNT_DIR" \ + --workdir /root \ + --network host ${ENV_ARGS[*]}\ + --name "$BUILD_NAME" \ + "$BUILD_IMAGE" +fi diff --git a/paddle/scripts/musl_build/config.sh b/paddle/scripts/musl_build/config.sh new file mode 100755 index 0000000000000..d7ec3a8dbb2e1 --- /dev/null +++ b/paddle/scripts/musl_build/config.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +CUR_DIR=$(dirname "${BASH_SOURCE[0]}") +CUR_DIR=$(realpath "$CUR_DIR") + +# shellcheck disable=2034 +PADDLE_DIR=$(realpath "$CUR_DIR/../../../") + +# shellcheck disable=2034 +BUILD_IMAGE="paddle-musl-build:2.0" From 09fd2b2aab0d1dfd90c0dbe1d6489958994d6f34 Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 3 Nov 2020 00:23:49 -0600 Subject: [PATCH 101/185] Paddle support compile on sw (#27858) --- CMakeLists.txt | 11 ++++++++++- cmake/cblas.cmake | 4 ++++ cmake/external/eigen.cmake | 5 +++++ cmake/flags.cmake | 2 +- paddle/fluid/operators/math/blas.h | 2 +- paddle/fluid/operators/search_compute.h | 8 ++++---- paddle/fluid/platform/cpu_helper.cc | 3 +++ paddle/fluid/platform/cpu_info.cc | 3 ++- paddle/fluid/platform/cpu_info.h | 3 ++- paddle/fluid/platform/device_tracer.cc | 24 ++++++++++++++++++++++++ python/CMakeLists.txt | 2 +- python/setup.py.in | 3 ++- 12 files changed, 59 insertions(+), 11 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 80820c6487c50..91820123da483 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -138,6 +138,7 @@ option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF) option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON) option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON) option(WITH_ARM "Compile PaddlePaddle with arm support" OFF) +option(WITH_SW "Compile PaddlePaddle with sw support" OFF) option(WITH_MUSL "Compile with musl libc instead of gblic" OFF) # PY_VERSION @@ -257,10 +258,18 @@ if(WITH_ARM) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_ARM=ON" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_ARM=ON." FORCE) - set(WITH_GPU OFF CACHE STRING "Disable GPU when compiling WITH_ARM=ON." FORCE) add_definitions(-DPADDLE_WITH_ARM) endif() +if (WITH_SW) + # mieee flag solves floating-point exceptions under sw and ALPHA architectures + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fPIC -mieee") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC -mieee") + set(WITH_XBYAK OFF CACHE STRING "Disable XBYAK when compiling WITH_SW=ON" FORCE) + set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling WITH_SW=ON." 
FORCE) + add_definitions(-DPADDLE_WITH_SW) +endif() + set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 32042864be436..75bb8bdda2180 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake @@ -101,6 +101,8 @@ if(NOT DEFINED CBLAS_PROVIDER AND WITH_SYSTEM_BLAS) ${REFERENCE_CBLAS_INCLUDE_SEARCH_PATHS}) find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) + find_library(REFERENCE_BLAS_LIBRARY NAMES blas PATHS + ${REFERENCE_BLAS_LIB_SEARCH_PATHS}) if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) set(CBLAS_PROVIDER REFERENCE_CBLAS) @@ -127,5 +129,7 @@ endif() include_directories(${CBLAS_INC_DIR}) if(NOT ${CBLAS_PROVIDER} STREQUAL MKLML) target_link_libraries(cblas ${CBLAS_LIBRARIES}) +elseif(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS) + target_link_libraries(cblas gfortran ${CBLAS_LIBRARIES} ${REFERENCE_BLAS_LIBRARY}) endif() diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index b1e3897891027..f27dcd06ef8e2 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -97,3 +97,8 @@ endif() add_library(eigen3 INTERFACE) add_dependencies(eigen3 extern_eigen3) + +# sw not support thread_local semantic +if(WITH_SW) + add_definitions(-DEIGEN_AVOID_THREAD_LOCAL) +endif() diff --git a/cmake/flags.cmake b/cmake/flags.cmake index ed0bf8396b3fa..ef7d3f2f5ba9d 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -183,7 +183,7 @@ set(GPU_COMMON_FLAGS -Wno-error=unused-function # Warnings in Numpy Header. -Wno-error=array-bounds # Warnings in Eigen::array ) -if (NOT WITH_NV_JETSON AND NOT WITH_ARM) +if (NOT WITH_NV_JETSON AND NOT WITH_ARM AND NOT WITH_SW) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -m64") endif() endif(NOT WIN32) diff --git a/paddle/fluid/operators/math/blas.h b/paddle/fluid/operators/math/blas.h index 562e2de3bd311..6e61031ec1cdb 100644 --- a/paddle/fluid/operators/math/blas.h +++ b/paddle/fluid/operators/math/blas.h @@ -32,7 +32,7 @@ class Tensor; #include #endif -#ifdef PADDLE_USE_OPENBLAS +#if defined(PADDLE_USE_OPENBLAS) || defined(PADDLE_USE_REFERENCE_CBLAS) #include #endif diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index d166b350af30f..df30231051741 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #pragma once -#if !defined(PADDLE_WITH_ARM) +#if !defined(PADDLE_WITH_ARM) && !defined(PADDLE_WITH_SW) #include #endif #include @@ -74,7 +74,7 @@ void call_gemm_batched(const framework::ExecutionContext& ctx, } } -#if !defined(PADDLE_WITH_ARM) +#if !defined(PADDLE_WITH_ARM) && !defined(PADDLE_WITH_SW) #define __m256x __m256 @@ -114,7 +114,7 @@ inline void axpy(const T* x, T* y, size_t len, const T alpha) { _mm256_add_px(_mm256_load_px(y + jjj), _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj)))); } -#elif defined(PADDLE_WITH_ARM) +#elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW) PADDLE_THROW(platform::errors::Unimplemented("axpy is not supported")); #else lll = len & ~SSE_CUT_LEN_MASK; @@ -143,7 +143,7 @@ inline void axpy_noadd(const T* x, T* y, size_t len, const T alpha) { for (jjj = 0; jjj < lll; jjj += AVX_STEP_SIZE) { _mm256_store_px(y + jjj, _mm256_mul_px(mm_alpha, _mm256_load_px(x + jjj))); } -#elif defined(PADDLE_WITH_ARM) +#elif defined(PADDLE_WITH_ARM) || defined(PADDLE_WITH_SW) PADDLE_THROW(platform::errors::Unimplemented("axpy_noadd is not supported")); #else lll = len & ~SSE_CUT_LEN_MASK; diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index a402f397348a4..46fdc2b45700f 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -42,6 +42,9 @@ void SetNumThreads(int num_threads) { int real_num_threads = num_threads > 1 ? num_threads : 1; platform::dynload::MKL_Set_Num_Threads(real_num_threads); omp_set_num_threads(real_num_threads); +#elif defined(PADDLE_USE_REFERENCE_CBLAS) + // cblas not support multi-thread + return; #else PADDLE_THROW(platform::errors::Unimplemented( "This library (except OPENBLAS, MKLML) is not supported yet, so the" diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index 2df1f291f9f8c..6f25df107f6ec 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -140,7 +140,8 @@ bool MayIUse(const cpu_isa_t cpu_isa) { if (cpu_isa == isa_any) { return true; } else { -#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \ + !defined(PADDLE_WITH_SW) int reg[4]; cpuid(reg, 0); int nIds = reg[0]; diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index c071246c51250..10870b2b728a4 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -40,7 +40,8 @@ limitations under the License. */ #ifdef _WIN32 #define cpuid(reg, x) __cpuidex(reg, x, 0) #else -#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) +#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \ + !defined(PADDLE_WITH_SW) #include inline void cpuid(int reg[4], int x) { __cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]); diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index ec934c3b980c3..bbf8e4d5ca783 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -37,9 +37,16 @@ namespace paddle { namespace platform { namespace { // Tracking the nested block stacks of each thread. +#ifdef PADDLE_WITH_SW +// sw not supported thread_local +std::deque block_id_stack; +std::deque annotation_stack; +#else +// Tracking the nested event stacks. thread_local std::deque block_id_stack; // Tracking the nested event stacks. 
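// NOTE: on sw the plain std::deque fallback above is shared by every thread,
// so concurrent tracer calls may interleave their stack entries; the
// thread_local variants in this branch keep the stacks per-thread.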
thread_local std::deque annotation_stack; +#endif // stack to strore event sunch as pe and so on static std::deque main_thread_annotation_stack{}; static std::deque main_thread_annotation_stack_name{}; @@ -288,8 +295,13 @@ class DeviceTracerImpl : public DeviceTracer { } void AddAnnotation(uint32_t id, Event *event) { +#ifdef PADDLE_WITH_SW + std::forward_list> *local_correlations_pairs = + nullptr; +#else thread_local std::forward_list> *local_correlations_pairs = nullptr; +#endif if (local_correlations_pairs == nullptr) { std::lock_guard l(trace_mu_); correlations_pairs.emplace_front(); @@ -304,7 +316,11 @@ class DeviceTracerImpl : public DeviceTracer { VLOG(1) << "Empty timeline annotation."; return; } +#ifdef PADDLE_WITH_SW + std::forward_list *local_cpu_records_ = nullptr; +#else thread_local std::forward_list *local_cpu_records_ = nullptr; +#endif if (local_cpu_records_ == nullptr) { std::lock_guard l(trace_mu_); cpu_records_.emplace_front(); @@ -335,8 +351,12 @@ class DeviceTracerImpl : public DeviceTracer { VLOG(3) << alloc_in << ", " << free_in << " Cannot be traced."; return; } +#ifdef PADDLE_WITH_SW + std::forward_list *local_mem_info_record = nullptr; +#else thread_local std::forward_list *local_mem_info_record = nullptr; +#endif if (local_mem_info_record == nullptr) { std::lock_guard l(trace_mu_); mem_info_record_.emplace_front(); @@ -353,8 +373,12 @@ class DeviceTracerImpl : public DeviceTracer { VLOG(1) << "Empty timeline annotation."; return; } +#ifdef PADDLE_WITH_SW + std::forward_list *local_active_kind_records = nullptr; +#else thread_local std::forward_list *local_active_kind_records = nullptr; +#endif if (local_active_kind_records == nullptr) { std::lock_guard l(trace_mu_); active_kind_records_.emplace_front(); diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index 8244b91d32dd8..c7ee43a3bc07d 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -106,7 +106,7 @@ if(APPLE) message(FATAL_ERROR "install_name_tool not found, please check.\n") endif() endif() -if(LINUX) +if(LINUX AND NOT WITH_SW) find_program(PATCHELF_EXECUTABLE patchelf) if(NOT PATCHELF_EXECUTABLE) message(FATAL_ERROR "patchelf not found, please install it.\n" diff --git a/python/setup.py.in b/python/setup.py.in index f9395f8dd318b..b7a6289d38f17 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -349,7 +349,8 @@ if '${CMAKE_BUILD_TYPE}' == 'Release': command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/${FLUID_CORE_NAME}" + '.so' # The dynamic library compiled under aarch64 is greater than 64M, # and an oversize error will be reported when using patchelf. - if platform.machine() != 'aarch64': + # The sw_64 not suppot patchelf, so we just disable that. 
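+ # Skipping the rpath patch means the sw_64 wheel keeps its default rpath,
+ # so the bundled libs must be found through the loader's regular search
+ # path instead (e.g. LD_LIBRARY_PATH).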
+ if platform.machine() != 'aarch64' and platform.machine() != 'sw_64': if os.system(command) != 0: raise Exception("patch ${FLUID_CORE_NAME}.%s failed, command: %s" % (ext_name, command)) From 6f0f45f69cb7d2f17809b3541450c6efdd08beec Mon Sep 17 00:00:00 2001 From: Wilber Date: Tue, 3 Nov 2020 03:25:35 -0600 Subject: [PATCH 102/185] copy_to_cpu support uint8 (#28372) --- paddle/fluid/pybind/inference_api.cc | 10 ++++++++-- python/CMakeLists.txt | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index e503ca31cdd74..61b8b1643665c 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -119,9 +119,12 @@ py::dtype PaddleDTypeToNumpyDType(PaddleDType dtype) { case PaddleDType::FLOAT32: dt = py::dtype::of(); break; + case PaddleDType::UINT8: + dt = py::dtype::of(); + break; default: PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported data type. Now only supports INT32, INT64 and " + "Unsupported data type. Now only supports INT32, INT64, UINT8 and " "FLOAT32.")); } @@ -187,9 +190,12 @@ py::array ZeroCopyTensorToNumpy(ZeroCopyTensor &tensor) { // NOLINT case PaddleDType::FLOAT32: tensor.copy_to_cpu(static_cast(array.mutable_data())); break; + case PaddleDType::UINT8: + tensor.copy_to_cpu(static_cast(array.mutable_data())); + break; default: PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported data type. Now only supports INT32, INT64 and " + "Unsupported data type. Now only supports INT32, INT64, UINT8 and " "FLOAT32.")); } return array; diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt index c7ee43a3bc07d..34edb0280b0ba 100644 --- a/python/CMakeLists.txt +++ b/python/CMakeLists.txt @@ -112,4 +112,4 @@ if(LINUX AND NOT WITH_SW) message(FATAL_ERROR "patchelf not found, please install it.\n" "For Ubuntu, the command is: apt-get install -y patchelf.") endif() -endif(LINUX) +endif() From 84cc61b2cd76d89fff0501e55b5a55dec9e079e0 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Tue, 3 Nov 2020 10:36:09 +0100 Subject: [PATCH 103/185] [oneDNN] sum op refactor (#28318) --- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 177 ++++++++++++------ paddle/fluid/platform/mkldnn_reuse.h | 53 ------ .../unittests/mkldnn/test_sum_mkldnn_op.py | 2 + 3 files changed, 126 insertions(+), 106 deletions(-) diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index bdff665f0f626..3d3738d922f77 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -25,7 +25,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/sum_op.h" -#include "paddle/fluid/platform/mkldnn_helper.h" +#include "paddle/fluid/platform/mkldnn_reuse.h" namespace paddle { namespace framework { @@ -51,6 +51,95 @@ using paddle::platform::CPUDeviceContext; using paddle::platform::MKLDNNDeviceContext; using platform::to_void_cast; +template +class SumMKLDNNHandler : public platform::MKLDNNHandlerT { + public: + SumMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx, + platform::Place cpu_place, + const std::vector& in_vars, + framework::LoDTensor* z, const std::string& uniq_name) + + : platform::MKLDNNHandlerT( + dev_ctx, dev_ctx.GetEngine(), cpu_place, + platform::CreateKey(framework::vectorize(z->dims()), uniq_name)), + num_inputs_(0) { + for (size_t i = 0; i < in_vars.size(); i++) { + srcs_suffix_.push_back(std::string("-") + std::to_string(i)); + } + + if (!this->isCached()) { + auto dst_tz = framework::vectorize(z->dims()); + auto src_tz = dst_tz; + + std::vector srcs_md; + for (size_t i = 0; i < in_vars.size(); i++) { + auto& input_it = in_vars[i]->Get(); + if (input_it.numel() == 0) { + continue; + } + MKLDNNMemoryFormat input_format = input_it.format(); + srcs_md.push_back(memory::desc(src_tz, platform::MKLDNNGetDataType(), + input_format)); + ++num_inputs_; + } + std::vector scales(num_inputs_, 1.0); + + auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), + MKLDNNMemoryFormat::any); + + this->AcquireForwardPrimitiveDescriptor(dst_md, scales, srcs_md); + } + } + + // (jczaja) sum oneDNN prim is not having .desc attribute so + // we cannot use base AcquireForwardPrimitiveDescriptor + void AcquireForwardPrimitiveDescriptor( + const memory::desc& dst_md, const std::vector& scales, + const std::vector& srcs_md) { + // Sum op does not have backward so no passing from FWD to BWD is needed + const std::string key_pd = this->key_ + "@fwd_pd"; + this->fwd_pd_ = std::static_pointer_cast( + this->dev_ctx_.GetBlob(key_pd)); + if (this->fwd_pd_ == nullptr) { + this->fwd_pd_.reset(new mkldnn::sum::primitive_desc( + dst_md, scales, srcs_md, this->engine_)); + this->dev_ctx_.SetBlob(key_pd, this->fwd_pd_); + } + } + + std::shared_ptr AcquireSrcMemory( + const framework::Tensor& input, int i) { + const T* input_data = input.data(); + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src_desc(i), + to_void_cast(input_data), + "@src_mem_p" + srcs_suffix_[i]); + } + + using platform::MKLDNNHandlerT::AcquireDstMemory; + + std::shared_ptr AcquireDstMemory(void) { + return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), + "@dst_mem_p"); + } + + inline int GetNumInputs(void) { return num_inputs_; } + + protected: + // isCached need to be overloaded as base one works on key_common + bool isCached() { + const std::string key_pd = this->key_ + "@fwd_pd"; + this->fwd_pd_ = std::static_pointer_cast( + this->dev_ctx_.GetBlob(key_pd)); + + const std::string key_p = this->key_ + "@fwd_p"; + return (this->dev_ctx_.GetBlob(key_p) != nullptr); + } + + private: + int num_inputs_; + std::vector srcs_suffix_; +}; + template class SumMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -59,85 +148,67 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { paddle::platform::errors::PreconditionNotMet( "Operator DNNL Sum must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); - const auto& mkldnn_engine = dev_ctx.GetEngine(); auto in_vars = ctx.MultiInputVar("X"); - auto out_var = ctx.OutputVar("Out"); PADDLE_ENFORCE_NE(in_vars.empty(), true, 
platform::errors::InvalidArgument( "Input variable is empty.")); - bool in_place = out_var == in_vars[0]; - + auto& input0 = in_vars[0]->Get(); LoDTensor* output = ctx.Output("Out"); - T* output_data = output->mutable_data(ctx.GetPlace()); - auto dst_tz = framework::vectorize(output->dims()); - auto src_tz = dst_tz; - MKLDNNMemoryFormat output_format{MKLDNNMemoryFormat::undef}; - std::vector scales; - std::vector srcs_md; - std::vector srcs_mem; + bool in_place = (input0.numel() > 0) && input0.IsSharedBufferWith(*output); - auto& input0 = in_vars[0]->Get(); - in_place = (input0.numel() > 0) && (input0.data() == output_data); + SumMKLDNNHandler handler(dev_ctx, ctx.GetPlace(), in_vars, output, + ctx.OutputName("Out")); + // Create list of SRC MEMs + std::vector> srcs_mem; + srcs_mem.reserve(handler.GetNumInputs()); + int input_index = 0; for (size_t i = 0; i < in_vars.size(); i++) { - auto& input_it = in_vars[i]->Get(); + auto& input_it = in_vars[i]->Get(); if (input_it.numel() == 0) { continue; } - - const T* input_data = input_it.data(); - MKLDNNMemoryFormat input_format = input_it.format(); - - auto src_md = memory::desc(src_tz, memory::data_type::f32, input_format); - auto src_mem = memory(src_md, mkldnn_engine, to_void_cast(input_data)); - srcs_md.push_back(src_md); - srcs_mem.push_back(src_mem); - scales.push_back(1.0); - } - - auto dst_md = - memory::desc(dst_tz, memory::data_type::f32, MKLDNNMemoryFormat::any); - - auto sum_pd = sum::primitive_desc(dst_md, scales, srcs_md, mkldnn_engine); - - std::shared_ptr dst_mem; - if (in_place) { - dst_mem.reset(new memory(sum_pd.dst_desc(), mkldnn_engine)); - } else { - dst_mem.reset(new memory(sum_pd.dst_desc(), mkldnn_engine, output_data)); + srcs_mem.push_back(handler.AcquireSrcMemory(input_it, input_index)); + ++input_index; } - auto sum_prim = mkldnn::sum(sum_pd); - output_format = platform::GetMKLDNNFormat(sum_pd.dst_desc()); + auto dst_mem = in_place ? 
handler.AcquireDstMemory() + : handler.AcquireDstMemory(output); - std::shared_ptr reorder_p; - std::shared_ptr target_mem; - if (in_place) { - output_format = input0.format(); - target_mem.reset( - new memory({{src_tz}, memory::data_type::f32, output_format}, - mkldnn_engine, output_data)); - reorder_p = std::make_shared(*dst_mem, *target_mem); - } + auto sum_p = handler.AcquireForwardPrimitive(); - mkldnn::stream astream(mkldnn_engine); std::unordered_map args; for (size_t i = 0; i < srcs_mem.size(); ++i) { - args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, srcs_mem.at(i)}); + args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, *(srcs_mem[i])}); } args.insert({MKLDNN_ARG_DST, *dst_mem}); - sum_prim.execute(astream, args); + mkldnn::stream astream(dev_ctx.GetEngine()); + sum_p->execute(astream, args); astream.wait(); + // For in-place execution which sum does not have we need to fake it + // so from oneDNN dst memory we reorder data into input if (in_place) { + const std::string reorder_key = platform::CreateKey( + framework::vectorize(output->dims()), ctx.OutputName("Out") + "-I"); + + auto& in_out = in_vars[0]->Get(); + auto output_tz = framework::vectorize(output->dims()); + platform::ReorderMKLDNNHandler reorder_handler( + output_tz, output->type(), framework::ToMKLDNNDataType(in_out.type()), + dev_ctx, dev_ctx.GetEngine(), reorder_key); + + auto target_mem = reorder_handler.AcquireDstMemory( + output, in_out.format(), ctx.GetPlace()); + + auto reorder_p = reorder_handler.AcquireReorder(target_mem, dst_mem); reorder_p->execute(astream, *dst_mem, *target_mem); astream.wait(); } - - output->set_layout(DataLayout::kMKLDNN); - output->set_format(output_format); + output->set_layout(framework::DataLayout::kMKLDNN); + output->set_format(platform::GetMKLDNNFormat(*dst_mem)); } }; diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 2d9e4333ac95e..54f8cb1dc8842 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -591,59 +591,6 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT { } }; -class SumMKLDNNHandler : public MKLDNNHandler { - public: - SumMKLDNNHandler(const platform::MKLDNNDeviceContext& dev_ctx, - mkldnn::engine engine, const std::string& base_key) - : platform::MKLDNNHandler(dev_ctx, engine, base_key) {} - - std::shared_ptr AcquireSumPrimitiveDescriptor( - const std::vector>& src_mems, - const std::vector& scales, const mkldnn::memory::desc& dst_md) { - const std::string key_sum_pd = key_ + "@sum_pd"; - - sum_pd_ = std::static_pointer_cast( - dev_ctx_.GetBlob(key_sum_pd)); - if (sum_pd_ == nullptr) { - // Get vector of inputs primitive descriptors - std::vector src_ds; - for (auto& input_mem : src_mems) { - src_ds.push_back(input_mem->get_desc()); - } - - sum_pd_.reset( - new mkldnn::sum::primitive_desc(dst_md, scales, src_ds, engine_)); - dev_ctx_.SetBlob(key_sum_pd, sum_pd_); - } - - return sum_pd_; - } - - std::shared_ptr AcquireDstMemoryFromPrimitive(void* ptr) { - return this->AcquireMemoryFromPrimitive(sum_pd_->dst_desc(), ptr, - "@dst_mem_p"); - } - - std::shared_ptr AcquireSecondSrcMemory( - const mkldnn::memory::desc& md, void* ptr) { - return this->AcquireMemory(md, ptr, "@user_src2_mem_p"); - } - - std::shared_ptr AcquireSum() { - auto prim_key = key_ + "@sum_p"; - auto sum_p = - std::static_pointer_cast(dev_ctx_.GetBlob(prim_key)); - if (sum_p == nullptr) { - sum_p = std::make_shared(*sum_pd_); - dev_ctx_.SetBlob(prim_key, sum_p); - } - return sum_p; - } - - private: - std::shared_ptr 
sum_pd_; -}; - template class ActivationMKLDNNHandler : public MKLDNNHandlerT Date: Tue, 3 Nov 2020 18:44:11 +0800 Subject: [PATCH 104/185] =?UTF-8?q?TensorRT=E4=B8=ADernie=E6=A8=A1?= =?UTF-8?q?=E5=9E=8B=E6=8E=A8=E7=90=86=E6=80=A7=E8=83=BD=E4=BC=98=E5=8C=96?= =?UTF-8?q?=EF=BC=8C=E6=94=AF=E6=8C=81=E5=8F=98=E9=95=BF=E8=BE=93=E5=85=A5?= =?UTF-8?q?=20(#28367)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * fp16 result ok * change -DWITH_NVINFER_PLUGIN toconfig.EnableTensorRtOSS * auto detect special slice op converter for ernie with trt oss * ernie oss only support fp16 * fix special_slice_plugin serialize bug * matmul in tensorrt ok * ernie unittest ok * add matmul tensorrt unittest * remove demo code --- paddle/fluid/inference/analysis/argument.h | 1 + .../inference/analysis/ir_pass_manager.cc | 1 + .../ir_passes/tensorrt_subgraph_pass.cc | 15 +- paddle/fluid/inference/api/analysis_config.cc | 5 + .../fluid/inference/api/analysis_predictor.cc | 3 +- .../inference/api/paddle_analysis_config.h | 15 ++ .../inference/tensorrt/convert/CMakeLists.txt | 2 +- .../tensorrt/convert/emb_eltwise_layernorm.cc | 91 ++++++++- .../inference/tensorrt/convert/matmul_op.cc | 91 +++++++++ .../inference/tensorrt/convert/mul_op.cc | 61 ------ .../tensorrt/convert/multihead_matmul_op.cc | 153 ++++++++++++--- .../tensorrt/convert/skip_layernorm.cc | 45 ++++- .../inference/tensorrt/convert/slice_op.cc | 30 ++- paddle/fluid/inference/tensorrt/engine.h | 12 +- paddle/fluid/inference/tensorrt/op_teller.cc | 17 ++ .../inference/tensorrt/plugin/CMakeLists.txt | 2 +- .../tensorrt/plugin/special_slice_plugin.cu | 177 ++++++++++++++++++ .../tensorrt/plugin/special_slice_plugin.h | 96 ++++++++++ ...rt_dynamic_shape_ernie_deserialize_test.cc | 6 +- .../tests/api/trt_dynamic_shape_ernie_test.cc | 13 +- .../operators/tensorrt/tensorrt_engine_op.h | 4 +- paddle/fluid/platform/dynload/tensorrt.cc | 37 +++- paddle/fluid/platform/dynload/tensorrt.h | 31 ++- paddle/fluid/pybind/inference_api.cc | 2 + .../ir/inference/inference_pass_test.py | 4 + .../unittests/ir/inference/test_trt_matmul.py | 116 ++++++++++++ 26 files changed, 893 insertions(+), 137 deletions(-) create mode 100644 paddle/fluid/inference/tensorrt/convert/matmul_op.cc delete mode 100644 paddle/fluid/inference/tensorrt/convert/mul_op.cc create mode 100644 paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu create mode 100644 paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index cd0fc03852a4d..aa8ebcb4930b9 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -207,6 +207,7 @@ struct Argument { DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine, bool); DECL_ARGUMENT_FIELD(tensorrt_use_calib_mode, TensorRtUseCalibMode, bool); + DECL_ARGUMENT_FIELD(tensorrt_use_oss, TensorRtUseOSS, bool); DECL_ARGUMENT_FIELD(lite_passes_filter, LitePassesFilter, std::vector); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index d136f5033e7e3..e94590e847cd5 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -95,6 +95,7 @@ void IRPassManager::CreatePasses(Argument *argument, bool use_calib_mode = 
argument->tensorrt_use_calib_mode(); pass->Set("enable_int8", new bool(enable_int8)); pass->Set("use_calib_mode", new bool(use_calib_mode)); + pass->Set("use_oss", new bool(argument->tensorrt_use_oss())); pass->Set("precision_mode", new AnalysisConfig::Precision(precision_mode)); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 1d4725ddab514..7ad882797870d 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -117,11 +117,20 @@ void TensorRtSubgraphPass::CreateTensorRTOp( block_desc.Proto()->set_idx(0); LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes"; + bool has_fused_embedding_eltwise_layernorm = false; + bool has_multihead_matmul = false; for (auto *node : subgraph) { auto *new_block_op = new_block->AppendOp(); auto *op = block_desc.AppendOp(); *new_block_op->Proto() = *node->Op()->Proto(); *op->Proto() = *node->Op()->Proto(); + if (!has_fused_embedding_eltwise_layernorm + && op->Type() == "fused_embedding_eltwise_layernorm") { + has_fused_embedding_eltwise_layernorm = true; + } + if (!has_multihead_matmul && op->Type() == "multihead_matmul") { + has_multihead_matmul = true; + } } // Then, we will use the input_names_with_id and output_names_with_id to @@ -308,6 +317,9 @@ void TensorRtSubgraphPass::CreateTensorRTOp( precision_mode, calibrator.get(), Get("gpu_device_id"), min_input_shape, max_input_shape, opt_input_shape, disable_trt_plugin_fp16); + trt_engine->SetUseOSS(Get("use_oss")); + trt_engine->SetWithErnie( + has_multihead_matmul && has_fused_embedding_eltwise_layernorm); bool need_serialize = (use_static_engine && !load_from_memory); if (need_serialize) { @@ -386,4 +398,5 @@ REGISTER_PASS_CAPABILITY(tensorrt_subgraph_pass) .EQ("instance_norm", 0) .EQ("gelu", 0) .EQ("layer_norm", 0) - .EQ("scale", 0)); + .EQ("scale", 0) + .EQ("matmul", 0)); diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 009ebd520c2b6..7e5552a74ccd5 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -122,6 +122,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { CP_MEMBER(tensorrt_precision_mode_); CP_MEMBER(trt_use_static_engine_); CP_MEMBER(trt_use_calib_mode_); + CP_MEMBER(trt_use_oss_); // MKLDNN related. CP_MEMBER(use_mkldnn_); CP_MEMBER(mkldnn_enabled_op_types_); @@ -280,6 +281,10 @@ void AnalysisConfig::SetTRTDynamicShapeInfo( disable_trt_plugin_fp16_ = disable_trt_plugin_fp16; } +void AnalysisConfig::EnableTensorRtOSS() { + trt_use_oss_ = true; +} + // TODO(Superjomn) refactor this, buggy. 
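For context, the new option composes with the existing dynamic-shape API. Below is a minimal C++ sketch; the model directory, the "eval_placeholder_0" input name, and the shape ranges are illustrative assumptions, not values taken from this patch:

```cpp
#include <map>
#include <string>
#include <vector>

#include "paddle/fluid/inference/api/paddle_analysis_config.h"

// Sketch only: builds a config that routes the ernie subgraph through the
// TensorRT OSS plugins. Input names/shapes below are hypothetical.
paddle::AnalysisConfig MakeErnieTrtOssConfig() {
  paddle::AnalysisConfig config;
  config.SetModel("./ernie_model");  // hypothetical model directory
  config.EnableUseGpu(100 /* init pool MB */, 0 /* device id */);
  // The OSS ernie plugins only support fp16 (see the converter check below).
  config.EnableTensorRtEngine(1 << 30, 1, 5,
                              paddle::AnalysisConfig::Precision::kHalf,
                              false /* use_static */, false /* use_calib */);
  // Every dynamic input needs a min/max/opt shape range.
  std::map<std::string, std::vector<int>> min_shape{{"eval_placeholder_0", {1, 1}}};
  std::map<std::string, std::vector<int>> max_shape{{"eval_placeholder_0", {8, 128}}};
  std::map<std::string, std::vector<int>> opt_shape{{"eval_placeholder_0", {4, 64}}};
  config.SetTRTDynamicShapeInfo(min_shape, max_shape, opt_shape);
  config.EnableTensorRtOSS();  // needs libnvinfer_plugin.so from TensorRT OSS
  return config;
}
```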
void AnalysisConfig::Update() {
  auto info = SerializeInfoCache();
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 5dae7368a8e7d..ccfb6dfa17ab4 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -470,6 +470,7 @@ void AnalysisPredictor::PrepareArgument() {
   argument_.SetTensorRtPrecisionMode(config_.tensorrt_precision_mode_);
   argument_.SetTensorRtUseStaticEngine(config_.trt_use_static_engine_);
   argument_.SetTensorRtUseCalibMode(config_.trt_use_calib_mode_);
+  argument_.SetTensorRtUseOSS(config_.trt_use_oss_);
   argument_.SetMinInputShape(config_.min_input_shape_);
   argument_.SetMaxInputShape(config_.max_input_shape_);
   argument_.SetOptimInputShape(config_.optim_input_shape_);
@@ -1055,7 +1056,7 @@
 USE_TRT_CONVERTER(elementwise_mul_tensor);
 USE_TRT_CONVERTER(elementwise_max_tensor);
 USE_TRT_CONVERTER(elementwise_min_tensor);
 USE_TRT_CONVERTER(elementwise_pow_tensor);
-USE_TRT_CONVERTER(mul);
+USE_TRT_CONVERTER(matmul);
 USE_TRT_CONVERTER(conv2d);
 USE_TRT_CONVERTER(relu);
 USE_TRT_CONVERTER(sigmoid);
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 7ad3aaf1f9d08..edf2c323e82fb 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -312,6 +312,20 @@ struct PD_INFER_DECL AnalysisConfig {
       std::map<std::string, std::vector<int>> max_input_shape,
       std::map<std::string, std::vector<int>> optim_input_shape,
       bool disable_trt_plugin_fp16 = false);
+
+  ///
+  /// \brief Replace some TensorRT plugins with TensorRT OSS ones (
+  /// https://github.com/NVIDIA/TensorRT), which can make some models'
+  /// inference faster. libnvinfer_plugin.so newer than v7.2.1 is required.
+  ///
+  void EnableTensorRtOSS();
+  ///
+  /// \brief A boolean state telling whether the TensorRT OSS plugins are used.
+  ///
+  /// \return bool Whether the TensorRT OSS plugins are used.
+  ///
+  bool tensorrt_oss_enabled() { return trt_use_oss_; }
+
   ///
   /// \brief Turn on the usage of Lite sub-graph engine.
/// @@ -569,6 +583,7 @@ struct PD_INFER_DECL AnalysisConfig { Precision tensorrt_precision_mode_{Precision::kFloat32}; bool trt_use_static_engine_{false}; bool trt_use_calib_mode_{true}; + bool trt_use_oss_{false}; std::map> min_input_shape_{}; std::map> max_input_shape_{}; std::map> optim_input_shape_{}; diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 39d02909abd1f..e20d017cdf9d6 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -1,6 +1,6 @@ # Add TRT tests nv_library(tensorrt_converter - SRCS mul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc + SRCS matmul_op.cc conv2d_op.cc fc_op.cc pool2d_op.cc elementwise_op.cc batch_norm_op.cc activation_op.cc softmax_op.cc concat_op.cc dropout_op.cc pad_op.cc split_op.cc prelu_op.cc leaky_relu_op.cc gelu_op.cc layer_norm_op.cc multihead_matmul_op.cc shuffle_channel_op.cc swish_op.cc instance_norm_op.cc stack_op.cc diff --git a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc index b846b3033f674..4bc21351b4e57 100644 --- a/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/emb_eltwise_layernorm.cc @@ -49,6 +49,9 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { input_ids.push_back(engine_->GetITensor(id_names[i])); } + // input_embs[0]: word_embedding + // input_embs[1]: pos_embedding + // input_embs[2]: sent_embedding std::vector input_embs; std::vector emb_sizes; @@ -85,15 +88,88 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { get_persistable_data(op_desc.Input("Scale").front(), &scale_dims); int64_t bias_size = framework::product(bias_dims); int64_t scale_size = framework::product(scale_dims); - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { - auto use_fp16 = engine_->WithFp16(); - auto plugin = new plugin::EmbEltwiseLayernormPluginDynamic( - input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, - eps, use_fp16); - layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); + if (engine_->use_oss()) { + int output_fp16 = static_cast((engine_->WithFp16() == 1) ? 1 : 0); + PADDLE_ENFORCE_EQ(output_fp16, 1, + platform::errors::InvalidArgument( + "Only Precision::KHalf(fp16) is supported when infering " + "ernie(bert) model with config.EnableTensorRtOSS(). 
" + "But Precision::KFloat32 is setted.")); + const std::vector fields{ + {"bert_embeddings_layernorm_beta", bias, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(bias_size)}, + {"bert_embeddings_layernorm_gamma", scale, + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(scale_size)}, + {"bert_embeddings_word_embeddings", input_embs[0], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[0])}, + {"bert_embeddings_token_type_embeddings", input_embs[2], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[2])}, + {"bert_embeddings_position_embeddings", input_embs[1], + nvinfer1::PluginFieldType::kFLOAT32, + static_cast(emb_sizes[1])}, + {"output_fp16", &output_fp16, nvinfer1::PluginFieldType::kINT32, 1}, + }; + + // remember to free + nvinfer1::PluginFieldCollection* plugin_ptr = + static_cast( + malloc(sizeof(*plugin_ptr) + + fields.size() * sizeof(nvinfer1::PluginField))); + plugin_ptr->nbFields = static_cast(fields.size()); + plugin_ptr->fields = fields.data(); + + std::vector plugin_inputs; + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network()->getInput(0)->getName())); // word_embedding, + // eval_placeholder_0 + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network()->getInput(1)->getName())); // sent_embedding, + // eval_placeholder_1 + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network()->getInput(2)->getName())); // cu_seqlens, + // eval_placeholder_2 + auto max_seqlen_tensor = engine_->GetITensor( + engine_->network()->getInput(3)->getName()); + auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( + engine_, Shuffle, *const_cast(max_seqlen_tensor)); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + plugin_inputs.emplace_back(shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomEmbLayerNormPluginDynamic", "2"); + + auto plugin_obj = + creator->createPlugin("CustomEmbLayerNormPluginDynamic", plugin_ptr); + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin_obj); + layer = plugin_layer; + free(plugin_ptr); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", + {output_name, std::string("qkv_plugin_mask")}, + test_mode); + } else { + bool use_fp16 = engine_->WithFp16(); + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + plugin::DynamicPluginTensorRT* plugin = nullptr; + plugin = new plugin::EmbEltwiseLayernormPluginDynamic( + input_embs, bias, scale, emb_sizes, bias_size, scale_size, hidden, + eps, use_fp16); + layer = engine_->AddPluginV2(input_ids.data(), input_num, plugin); + auto output_name = op_desc.Output("Out")[0]; + RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, + test_mode); + } } else { PADDLE_THROW(platform::errors::Fatal( "You are running the Ernie(Bert) model in static" @@ -102,9 +178,6 @@ class EmbEltwiseLayerNormOpConverter : public OpConverter { " to set the shape information to run the dynamic shape mode.")); } - auto output_name = op_desc.Output("Out")[0]; - RreplenishLayerAndOutput(layer, "emb_eltwise_layernorm", {output_name}, - test_mode); #else PADDLE_THROW(platform::errors::Fatal( "You are running the TRT Dynamic Shape mode, need to confirm that " diff --git a/paddle/fluid/inference/tensorrt/convert/matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/matmul_op.cc new file mode 100644 index 
0000000000000..88dbf15529155 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/convert/matmul_op.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" + +namespace paddle { +namespace framework { +class Scope; +namespace proto { +class OpDesc; +} // namespace proto +} // namespace framework +} // namespace paddle + +namespace paddle { +namespace inference { +namespace tensorrt { + +/* + * MatMulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights. + */ +class MatMulOpConverter : public OpConverter { + public: + void operator()(const framework::proto::OpDesc& op, + const framework::Scope& scope, bool test_mode) override { + VLOG(3) << "convert a fluid matmul op to tensorrt mul layer without bias"; + + framework::OpDesc op_desc(op, nullptr); + // Declare inputs + auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); + auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); + + bool transpose_X = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_X")); + bool transpose_Y = BOOST_GET_CONST(bool, op_desc.GetAttr("transpose_Y")); + + auto* layer = TRT_ENGINE_ADD_LAYER( + engine_, MatrixMultiply, *const_cast(input1), transpose_X, + *const_cast(input2), transpose_Y); + + float alpha = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); + auto output_name = op_desc.Output("Out")[0]; + if (fabs(alpha - 1.0) < std::numeric_limits::epsilon()) { + engine_->SetITensor(output_name, layer->getOutput(0)); + } else { + auto create_weights = [&](float data, const std::string &type) -> float* { + std::unique_ptr tmp_tensor(new framework::Tensor()); + tmp_tensor->Resize({1}); + auto* tmp_data = tmp_tensor->mutable_data(platform::CPUPlace()); + tmp_data[0] = data; + engine_->SetWeights(output_name + "_add_scale_op_" + type, + std::move(tmp_tensor)); + return tmp_data; + }; + float* alpha_data = create_weights(alpha, "alpha"); + float* shift_data = create_weights(0.0, "shift"); + float* power_data = create_weights(1.0, "power"); + TensorRTEngine::Weight nv_alpha{nvinfer1::DataType::kFLOAT, + static_cast(alpha_data), 1}; + TensorRTEngine::Weight nv_shift{nvinfer1::DataType::kFLOAT, + static_cast(shift_data), 1}; + TensorRTEngine::Weight nv_power{nvinfer1::DataType::kFLOAT, + static_cast(power_data), 1}; + auto* scale_layer = TRT_ENGINE_ADD_LAYER( + engine_, Scale, *layer->getOutput(0), + nvinfer1::ScaleMode::kUNIFORM, + nv_shift.get(), nv_alpha.get(), nv_power.get()); + engine_->SetITensor(output_name, scale_layer->getOutput(0)); + } + if (test_mode) { // the test framework can not determine which is the + // output, so place the declaration inside. 
+ engine_->DeclareOutput(output_name); + } + } +}; + +} // namespace tensorrt +} // namespace inference +} // namespace paddle + +REGISTER_TRT_OP_CONVERTER(matmul, MatMulOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/mul_op.cc b/paddle/fluid/inference/tensorrt/convert/mul_op.cc deleted file mode 100644 index c99528b207b6c..0000000000000 --- a/paddle/fluid/inference/tensorrt/convert/mul_op.cc +++ /dev/null @@ -1,61 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - -http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "paddle/fluid/inference/tensorrt/convert/op_converter.h" - -namespace paddle { -namespace framework { -class Scope; -namespace proto { -class OpDesc; -} // namespace proto -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace inference { -namespace tensorrt { - -/* - * MulOp, IMatrixMultiplyLayer in TRT. This Layer doesn't has weights. - */ -class MulOpConverter : public OpConverter { - public: - void operator()(const framework::proto::OpDesc& op, - const framework::Scope& scope, bool test_mode) override { - VLOG(3) << "convert a fluid mul op to tensorrt mul layer without bias"; - - framework::OpDesc op_desc(op, nullptr); - // Declare inputs - auto* input1 = engine_->GetITensor(op_desc.Input("X")[0]); - auto* input2 = engine_->GetITensor(op_desc.Input("Y")[0]); - // Both the input1 and input2 do not need transpose. - auto* layer = TRT_ENGINE_ADD_LAYER( - engine_, MatrixMultiply, *const_cast(input1), false, - *const_cast(input2), false); - - auto output_name = op_desc.Output("Out")[0]; - engine_->SetITensor(output_name, layer->getOutput(0)); - if (test_mode) { // the test framework can not determine which is the - // output, so place the declaration inside. - engine_->DeclareOutput(output_name); - } - } -}; - -} // namespace tensorrt -} // namespace inference -} // namespace paddle - -REGISTER_TRT_OP_CONVERTER(mul, MulOpConverter); diff --git a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc index a19d91f36e243..e3b29bd5231bf 100644 --- a/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/multihead_matmul_op.cc @@ -30,7 +30,6 @@ class MultiheadMatMulOpConverter : public OpConverter { // Declare inputs // Shouble be a 5 dims tensor. 
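// W fuses the Q/K/V projections: it is laid out (hidden, 3, head_number *
// head_size) with bias (3, head_number, head_size). The transpose helpers
// below reorder both into the layouts that the plain FC path and the OSS
// variable-length kernels each expect.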
auto* input = engine_->GetITensor(op_desc.Input("Input").front()); - auto* input_bias_qk = engine_->GetITensor(op_desc.Input("BiasQK").front()); // fc weights and fc bias auto weight_name = op_desc.Input("W").front(); @@ -50,7 +49,7 @@ class MultiheadMatMulOpConverter : public OpConverter { memcpy(weight_data_tmp.data(), weight_data, weight_t->numel() * sizeof(float)); - // (hidden, 3, all_head_size) + // (hidden, 3, all_head_size) auto weight_dims = weight_t->dims(); int hidden = weight_dims[0]; // channels_in @@ -65,36 +64,136 @@ class MultiheadMatMulOpConverter : public OpConverter { } } }; - - // transpose weight_data from m * n to n * m tranpose_weight(weight_data_tmp.data(), weight_data, m, n); - TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, - static_cast(weight_data), - static_cast(weight_t->numel())}; - - weight.dims.assign({n, m}); - TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, - static_cast(bias_data), - static_cast(bias_t->numel())}; - - auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n, - weight.get(), bias.get()); - auto* fc_out = fc_layer->getOutput(0); - // add qkv to context + int head_number = BOOST_GET_CONST(int, op_desc.GetAttr("head_number")); - int head_size = all_head_size / head_number; - float scale = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); - std::vector plugin_inputs; - plugin_inputs.push_back(fc_out); - plugin_inputs.push_back(input_bias_qk); nvinfer1::ILayer* layer = nullptr; + if (engine_->with_dynamic_shape()) { - bool ban_fp16 = engine_->disable_trt_plugin_fp16(); - plugin::DynamicPluginTensorRT* plugin = - new plugin::QkvToContextPluginDynamic(hidden, head_number, head_size, - scale, ban_fp16); - layer = engine_->AddPluginV2(plugin_inputs.data(), 2, plugin); + if (engine_->use_oss()) { + int head_size = hidden / head_number; + // [3, Nout, Hout, Nin, Hin] -> [Nout, 3, Hout, Nin, Hin] + auto transpose_weight_v2 = [](const float* src, float* dst, int N, + int H) { + const int HNH = H * N * H; + for (int i = 0; i < 3; ++i) { + for (int n = 0; n < N; ++n) { + for (int hnh = 0; hnh < HNH; ++hnh) { + dst[n * 3 * HNH + i * HNH + hnh] = + src[i * N * HNH + n * HNH + hnh]; + } + } + } + }; + // [3, N, H] -> [N, 3, H] + auto transpose_bias_v2 = [](const float* src, float* dst, int N, int H) { + for (int i = 0; i < 3; ++i) { + for (int n = 0; n < N; ++n) { + for (int h = 0; h < H; ++h) { + dst[n * 3 * H + i * H + h] = src[i * N * H + n * H + h]; + } + } + } + }; + memcpy(weight_data_tmp.data(), weight_data, + weight_t->numel() * sizeof(float)); + transpose_weight_v2(weight_data_tmp.data(), weight_data, head_number, + head_size); + nvinfer1::Weights weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + static_cast(weight_t->numel())}; + + std::vector bias_data_tmp; + bias_data_tmp.reserve(bias_t->numel()); + memcpy(bias_data_tmp.data(), bias_data, bias_t->numel() * sizeof(float)); + transpose_bias_v2(bias_data_tmp.data(), bias_data, head_number, + head_size); + nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, + static_cast(bias_data), + static_cast(bias_t->numel())}; + + auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n, + weight, bias); + + auto mask_tensor = engine_->GetITensor("qkv_plugin_mask"); + + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomQKVToContextPluginDynamic", "2"); + assert(creator != nullptr); + int type = static_cast((engine_->WithFp16() == 1) + ? 
nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT); + bool has_mask = true; + int var_seqlen = 1; + const std::vector fields{ + {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, + {"hidden_size", &hidden, nvinfer1::PluginFieldType::kINT32, 1}, + {"num_heads", &head_number, nvinfer1::PluginFieldType::kINT32, 1}, + {"has_mask", &has_mask, nvinfer1::PluginFieldType::kINT32, 1}, + {"var_seqlen", &var_seqlen, nvinfer1::PluginFieldType::kINT32, 1}, + }; + nvinfer1::PluginFieldCollection* plugin_collection = + static_cast( + malloc(sizeof(*plugin_collection) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + plugin_collection->nbFields = static_cast(fields.size()); + plugin_collection->fields = fields.data(); + + auto plugin = creator->createPlugin("CustomQKVToContextPluginDynamic", + plugin_collection); + free(plugin_collection); + + std::vector plugin_inputs; + plugin_inputs.emplace_back(fc_layer->getOutput(0)); + plugin_inputs.emplace_back(mask_tensor); + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network()->getInput(2)->getName())); // cu_seqlens, + // eval_placeholder_2 + auto max_seqlen_tensor = engine_->GetITensor( + engine_->network()->getInput(3)->getName()); + auto* shuffle_layer = TRT_ENGINE_ADD_LAYER( + engine_, Shuffle, *const_cast(max_seqlen_tensor)); + nvinfer1::Dims shape_dim; + shape_dim.nbDims = 1; + shape_dim.d[0] = -1; + shuffle_layer->setReshapeDimensions(shape_dim); + plugin_inputs.emplace_back(shuffle_layer->getOutput(0)); // max_seqlen, eval_placeholder_3 + + auto plugin_layer = engine_->network()->addPluginV2( + plugin_inputs.data(), plugin_inputs.size(), *plugin); + layer = plugin_layer; + } else { + // transpose weight_data from m * n to n * m + auto* input_bias_qk = + engine_->GetITensor(op_desc.Input("BiasQK").front()); + + TensorRTEngine::Weight weight{nvinfer1::DataType::kFLOAT, + static_cast(weight_data), + static_cast(weight_t->numel())}; + weight.dims.assign({n, m}); + + TensorRTEngine::Weight bias{nvinfer1::DataType::kFLOAT, + static_cast(bias_data), + static_cast(bias_t->numel())}; + + auto* fc_layer = TRT_ENGINE_ADD_LAYER(engine_, FullyConnected, *input, n, + weight.get(), bias.get()); + auto* fc_out = fc_layer->getOutput(0); + // add qkv to context + int head_size = all_head_size / head_number; + float scale = BOOST_GET_CONST(float, op_desc.GetAttr("alpha")); + + std::vector plugin_inputs; + plugin_inputs.push_back(fc_out); + plugin_inputs.push_back(input_bias_qk); + bool ban_fp16 = engine_->disable_trt_plugin_fp16(); + plugin::DynamicPluginTensorRT* plugin = + new plugin::QkvToContextPluginDynamic(hidden, head_number, head_size, + scale, ban_fp16); + layer = engine_->AddPluginV2(plugin_inputs.data(), 2, plugin); + } } else { PADDLE_THROW(platform::errors::Fatal( "You are running the Ernie(Bert) model in static shape mode, which " diff --git a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc index 9ac6b92af89e1..823e66a4bf99b 100644 --- a/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc +++ b/paddle/fluid/inference/tensorrt/convert/skip_layernorm.cc @@ -47,17 +47,50 @@ class SkipLayerNormOpConverter : public OpConverter { framework::DDim bias_dims, scale_dims; auto* bias = get_persistable_data("Bias", &bias_dims); auto* scale = get_persistable_data("Scale", &scale_dims); - float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); int bias_size = framework::product(bias_dims); int scale_size = 
framework::product(scale_dims); nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { - bool ban_fp16 = engine_->disable_trt_plugin_fp16(); - plugin::SkipLayerNormPluginDynamic* plugin = - new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, - scale_size, eps, ban_fp16); - layer = engine_->AddPluginV2(inputs.data(), 2, plugin); + if (engine_->use_oss()) { + auto creator = GetPluginRegistry()->getPluginCreator( + "CustomSkipLayerNormPluginDynamic", "2"); + assert(creator != nullptr); + int type = static_cast((engine_->WithFp16() == 1) + ? nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT); + int ld = input1->getDimensions().d[2]; // hidden dimension + assert(ld > 0); + + const std::vector fields{ + {"type_id", &type, nvinfer1::PluginFieldType::kINT32, 1}, + {"ld", &ld, nvinfer1::PluginFieldType::kINT32, 1}, + {"beta", bias, nvinfer1::PluginFieldType::kFLOAT32, bias_size}, + {"gamma", scale, nvinfer1::PluginFieldType::kFLOAT32, scale_size}, + }; + nvinfer1::PluginFieldCollection* pluginPtr = + static_cast( + malloc(sizeof(*pluginPtr) + + fields.size() * + sizeof(nvinfer1::PluginField))); // remember to free + pluginPtr->nbFields = static_cast(fields.size()); + pluginPtr->fields = fields.data(); + + auto pluginObj = + creator->createPlugin("CustomSkipLayerNormPluginDynamic", pluginPtr); + auto plugin_layer = engine_->network()->addPluginV2( + inputs.data(), inputs.size(), *pluginObj); + + assert(plugin_layer != nullptr); + layer = plugin_layer; + } else { + float eps = BOOST_GET_CONST(float, op_desc.GetAttr("epsilon")); + bool ban_fp16 = engine_->disable_trt_plugin_fp16(); + plugin::SkipLayerNormPluginDynamic* plugin = + new plugin::SkipLayerNormPluginDynamic(bias, scale, bias_size, + scale_size, eps, ban_fp16); + layer = engine_->AddPluginV2(inputs.data(), 2, plugin); + } } else { PADDLE_THROW(platform::errors::Fatal( "You are running the Ernie(Bert) model in static" diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index 3c3fead3d361b..ee4716bb56bc2 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -14,6 +14,7 @@ limitations under the License. 
*/ #include "paddle/fluid/inference/tensorrt/convert/op_converter.h" #include "paddle/fluid/inference/tensorrt/plugin/slice_op_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h" namespace paddle { namespace inference { @@ -77,16 +78,31 @@ class SliceOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { + if (engine_->use_oss() && engine_->with_ernie()) { + std::vector plugin_inputs; + // plugin_inputs.emplace_back(trans_layer->getOutput(0)); + plugin_inputs.emplace_back(input); + plugin_inputs.emplace_back(engine_->GetITensor( + engine_->network()->getInput(2)->getName())); // cu_seqlens, + // eval_placeholder_2 + + // bool ban_fp16 = engine_->disable_trt_plugin_fp16(); + plugin::SpecialSlicePluginDynamic* plugin = + new plugin::SpecialSlicePluginDynamic(); + layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(), + plugin); + } else { #if IS_TRT_VERSION_GE(6000) - bool ban_fp16 = engine_->disable_trt_plugin_fp16(); - plugin::SlicePluginDynamic* plugin = - new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16); - layer = engine_->AddPluginV2(&input, 1, plugin); + bool ban_fp16 = engine_->disable_trt_plugin_fp16(); + plugin::SlicePluginDynamic* plugin = + new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16); + layer = engine_->AddPluginV2(&input, 1, plugin); #else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); #endif + } } else { bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SlicePlugin* plugin = diff --git a/paddle/fluid/inference/tensorrt/engine.h b/paddle/fluid/inference/tensorrt/engine.h index 71625210054b3..cb3f3f94707de 100644 --- a/paddle/fluid/inference/tensorrt/engine.h +++ b/paddle/fluid/inference/tensorrt/engine.h @@ -71,9 +71,9 @@ TRT_DT FluidDataType2TRT(FluidDT type) { template nvinfer1::Dims Vec2TRT_Dims(const std::vector& shape, std::string input, bool with_dynamic_shape = false) { - PADDLE_ENFORCE_GT(shape.size(), 1UL, + PADDLE_ENFORCE_GT(shape.size(), 0UL, platform::errors::InvalidArgument( - "TensorRT's tensor input requires at least 2 " + "TensorRT's tensor input requires at least 1 " "dimensions, but input %s has %d dims.", input, shape.size())); PADDLE_ENFORCE_LE(shape.size(), 4UL, @@ -174,6 +174,7 @@ class TensorRTEngine { "version should be at least 6."; #endif } + dy::initLibNvInferPlugins(&logger, ""); } ~TensorRTEngine() {} @@ -285,6 +286,9 @@ class TensorRTEngine { suffix_counter += 1; } + void SetUseOSS(bool use_oss) { use_oss_ = use_oss; } + void SetWithErnie(bool with_ernie) { with_ernie_ = with_ernie; } + void ClearWeights() { for (auto& weight_pair : weight_map) { weight_pair.second.reset(nullptr); @@ -312,6 +316,8 @@ class TensorRTEngine { ShapeMapType min_input_shape() { return min_input_shape_; } ShapeMapType max_input_shape() { return max_input_shape_; } ShapeMapType optim_input_shape() { return optim_input_shape_; } + bool use_oss() { return use_oss_; }; + bool with_ernie() { return with_ernie_; }; bool disable_trt_plugin_fp16() { return disable_trt_plugin_fp16_; } bool with_dynamic_shape() { return with_dynamic_shape_; } @@ -347,6 +353,8 @@ class TensorRTEngine { ShapeMapType max_input_shape_; ShapeMapType optim_input_shape_; bool disable_trt_plugin_fp16_{false}; + 
bool use_oss_{false}; + bool with_ernie_{false}; nvinfer1::ILogger& logger_; // max data size for the buffers. diff --git a/paddle/fluid/inference/tensorrt/op_teller.cc b/paddle/fluid/inference/tensorrt/op_teller.cc index 21ca67839784a..78585078e19e6 100644 --- a/paddle/fluid/inference/tensorrt/op_teller.cc +++ b/paddle/fluid/inference/tensorrt/op_teller.cc @@ -13,6 +13,8 @@ // limitations under the License. #include "paddle/fluid/inference/tensorrt/op_teller.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/var_desc.h" namespace paddle { namespace framework { @@ -70,6 +72,7 @@ struct SimpleOpTypeSetTeller : public Teller { "hard_swish"}; std::unordered_set teller_set{ "mul", + "matmul", "conv2d", "pool2d", "relu", @@ -122,6 +125,20 @@ bool OpTeller::Tell(const std::string& op_type, const framework::OpDesc& desc, (padding_algorithm == "SAME" && op_type != "pool2d")) return false; } + if (op_type == "matmul") { + auto* block = desc.Block(); + for (auto& param_name : desc.Inputs()) { + for (auto& var_name : param_name.second) { + auto* var_desc = block->FindVar(var_name); + const auto shape = var_desc->GetShape(); + if (shape.size() < 3) { + VLOG(1) << "matmul op dims < 3 not supported in tensorrt, but got dims " + << shape.size() << ", so jump it."; + return false; + } + } + } + } if ((*teller)(op_type, desc, use_no_calib_int8)) return true; } return false; diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 98afdbe254a4b..e37beb3b8e5c3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -4,5 +4,5 @@ nv_library(tensorrt_plugin pool_op_plugin.cu swish_op_plugin.cu layer_norm_op_plugin.cu instance_norm_op_plugin.cu emb_eltwise_layernorm_plugin.cu qkv_to_context_plugin.cu skip_layernorm_op_plugin.cu slice_op_plugin.cu - hard_swish_op_plugin.cu stack_op_plugin.cu + hard_swish_op_plugin.cu stack_op_plugin.cu special_slice_plugin.cu DEPS enforce tensorrt_engine prelu tensor bert_encoder_functor) diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu new file mode 100644 index 0000000000000..ed0a530439f0a --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.cu @@ -0,0 +1,177 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
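+
+// SpecialSlicePluginDynamic gathers one row per sequence from the packed
+// (sum(seq_len), hidden) input tensor, using the cu_seqlens offsets as row
+// indices, i.e. it copies the first token of every sequence (the CLS
+// position in ERNIE) into a (batch, hidden) output.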
+ +#include +#include +#include +#include "paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_factory.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +SpecialSlicePluginDynamic::SpecialSlicePluginDynamic() {} + +SpecialSlicePluginDynamic::SpecialSlicePluginDynamic(void const* serial_data, + size_t serial_length) {} + +SpecialSlicePluginDynamic::~SpecialSlicePluginDynamic() {} + +nvinfer1::IPluginV2DynamicExt* SpecialSlicePluginDynamic::clone() const { + return new SpecialSlicePluginDynamic(); +} + +const char* SpecialSlicePluginDynamic::getPluginType() const { + return "special_slice_plugin"; +} + +int SpecialSlicePluginDynamic::getNbOutputs() const { return 1; } + +int SpecialSlicePluginDynamic::initialize() { return 0; } + +size_t SpecialSlicePluginDynamic::getSerializationSize() const { + size_t serialize_size = 0; + return serialize_size; +} + +void SpecialSlicePluginDynamic::serialize(void* buffer) const {} + +nvinfer1::DimsExprs SpecialSlicePluginDynamic::getOutputDimensions( + int output_index, const nvinfer1::DimsExprs* inputs, int nb_inputs, + nvinfer1::IExprBuilder& expr_builder) { + nvinfer1::DimsExprs output(inputs[0]); + auto one = expr_builder.constant(1); + output.d[0] = expr_builder.operation(nvinfer1::DimensionOperation::kSUB, + *inputs[1].d[0], *one); + + return output; +} + +void SpecialSlicePluginDynamic::configurePlugin( + const nvinfer1::DynamicPluginTensorDesc* in, int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, int nbOutputs) {} + +size_t SpecialSlicePluginDynamic::getWorkspaceSize( + const nvinfer1::PluginTensorDesc* inputs, int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, int nbOutputs) const { + return 0; +} + +void SpecialSlicePluginDynamic::destroy() { delete this; } + +void SpecialSlicePluginDynamic::terminate() {} + +bool SpecialSlicePluginDynamic::supportsFormatCombination( + int pos, const nvinfer1::PluginTensorDesc* desc, int nb_inputs, + int nb_outputs) { + if (pos == 0) // slice tensor + return (desc[pos].type == nvinfer1::DataType::kHALF && + desc[pos].format == + nvinfer1::TensorFormat::kLINEAR); // || desc[pos].type == + // nvinfer1::DataType::kFLOAT); + + if (pos == 1) // cu_seqlen + return (desc[pos].type == nvinfer1::DataType::kINT32 && + desc[pos].format == nvinfer1::TensorFormat::kLINEAR); + + return (desc[pos].type == nvinfer1::DataType::kHALF && + desc[pos].format == + nvinfer1::TensorFormat::kLINEAR); // || desc[pos].type == + // nvinfer1::DataType::kFLOAT); +} + +nvinfer1::DataType SpecialSlicePluginDynamic::getOutputDataType( + int index, const nvinfer1::DataType* input_types, int nb_inputs) const { + PADDLE_ENFORCE_EQ(index, 0, platform::errors::InvalidArgument( + "The index should be equal to 0")); + return input_types[0]; +} + +template +__global__ void SpecialSliceKernel(const T* slice_input, + const int32_t* cu_seqlens, T* output) { + const int hidden = blockDim.x; + const int batch = blockIdx.x; + + output[batch * hidden + threadIdx.x] = + slice_input[cu_seqlens[batch] * hidden + threadIdx.x]; +} + +int SpecialSlicePluginDynamic::enqueue( + const nvinfer1::PluginTensorDesc* input_desc, + const nvinfer1::PluginTensorDesc* output_desc, const void* const* inputs, + void* const* outputs, void* workspace, cudaStream_t stream) { + auto input_dims = input_desc[0].dims; // (sum(S), 768, 1, 1) + auto out_dims = output_desc[0].dims; // (batch, 768, 1, 1) + + 
assert(input_desc[0].type == nvinfer1::DataType::kHALF); + + const int32_t hidden = input_dims.d[1]; + const int num_blocks = out_dims.d[0]; // batch size + const int num_threads = hidden; + + const half* slice_input = static_cast(inputs[0]); + const int32_t* cu_seqlens = static_cast(inputs[1]); + half* output = static_cast(outputs[0]); + + SpecialSliceKernel<<>>( + slice_input, cu_seqlens, output); + + return cudaGetLastError() != cudaSuccess; +} + +SpecialSlicePluginDynamicCreator::SpecialSlicePluginDynamicCreator() {} + +const char* SpecialSlicePluginDynamicCreator::getPluginName() const { + return "special_slice_plugin"; +} + +const char* SpecialSlicePluginDynamicCreator::getPluginVersion() const { + return "1"; +} + +const nvinfer1::PluginFieldCollection* +SpecialSlicePluginDynamicCreator::getFieldNames() { + return &field_collection_; +} + +nvinfer1::IPluginV2* SpecialSlicePluginDynamicCreator::createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) { + return new SpecialSlicePluginDynamic(); +} + +nvinfer1::IPluginV2* SpecialSlicePluginDynamicCreator::deserializePlugin( + const char* name, const void* serial_data, size_t serial_length) { + auto plugin = new SpecialSlicePluginDynamic(serial_data, serial_length); + return plugin; +} + +void SpecialSlicePluginDynamicCreator::setPluginNamespace( + const char* lib_namespace) { + plugin_namespace_ = lib_namespace; +} + +const char* SpecialSlicePluginDynamicCreator::getPluginNamespace() const { + return plugin_namespace_.c_str(); +} + +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h new file mode 100644 index 0000000000000..438d9e9465c52 --- /dev/null +++ b/paddle/fluid/inference/tensorrt/plugin/special_slice_plugin.h @@ -0,0 +1,96 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
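+
+// Declaration of SpecialSlicePluginDynamic and its creator; the
+// implementation lives in special_slice_plugin.cu.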
+ +#pragma once +#include +#include +#include +#include +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" + +namespace paddle { +namespace inference { +namespace tensorrt { +namespace plugin { + +#if IS_TRT_VERSION_GE(6000) +class SpecialSlicePluginDynamic : public DynamicPluginTensorRT { + public: + SpecialSlicePluginDynamic(); + SpecialSlicePluginDynamic(void const* serial_data, size_t serial_length); + ~SpecialSlicePluginDynamic(); + nvinfer1::IPluginV2DynamicExt* clone() const override; + nvinfer1::DimsExprs getOutputDimensions( + int outputIndex, const nvinfer1::DimsExprs* inputs, int nbInputs, + nvinfer1::IExprBuilder& exprBuilder) override; + bool supportsFormatCombination(int pos, + const nvinfer1::PluginTensorDesc* inOut, + int nbInputs, int nbOutputs) override; + void configurePlugin(const nvinfer1::DynamicPluginTensorDesc* in, + int nbInputs, + const nvinfer1::DynamicPluginTensorDesc* out, + int nbOutputs) override; + size_t getWorkspaceSize(const nvinfer1::PluginTensorDesc* inputs, + int nbInputs, + const nvinfer1::PluginTensorDesc* outputs, + int nbOutputs) const override; + int enqueue(const nvinfer1::PluginTensorDesc* inputDesc, + const nvinfer1::PluginTensorDesc* outputDesc, + const void* const* inputs, void* const* outputs, void* workspace, + cudaStream_t stream) override; + + nvinfer1::DataType getOutputDataType(int index, + const nvinfer1::DataType* inputTypes, + int nbInputs) const override; + + const char* getPluginType() const override; + int getNbOutputs() const override; + int initialize() override; + void terminate() override; + size_t getSerializationSize() const override; + void serialize(void* buffer) const override; + void destroy() override; + + private: + int axis_; + int num_stack_; +}; + +class SpecialSlicePluginDynamicCreator : public nvinfer1::IPluginCreator { + public: + SpecialSlicePluginDynamicCreator(); + const char* getPluginName() const override; + const char* getPluginVersion() const override; + const nvinfer1::PluginFieldCollection* getFieldNames() override; + nvinfer1::IPluginV2* createPlugin( + const char* name, const nvinfer1::PluginFieldCollection* fc) override; + nvinfer1::IPluginV2* deserializePlugin(const char* name, + const void* serial_data, + size_t serial_length) override; + void setPluginNamespace(const char* lib_namespace) override; + const char* getPluginNamespace() const override; + + private: + std::string plugin_namespace_; + nvinfer1::PluginFieldCollection field_collection_{0, nullptr}; + std::vector plugin_attributes_; +}; +REGISTER_TRT_PLUGIN_V2(SpecialSlicePluginDynamicCreator); +#endif + +} // namespace plugin +} // namespace tensorrt +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc index d49f83b9d38a3..b2711ee1e9d8a 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_deserialize_test.cc @@ -126,17 +126,17 @@ void trt_ernie(bool with_fp16, std::vector result) { {"read_file_0.tmp_0", min_shape}, {"read_file_0.tmp_1", min_shape}, {"read_file_0.tmp_2", min_shape}, - {"matmul_0.tmp_0", {batch, min_seq_len, min_seq_len}}}; + {"read_file_0.tmp_4", min_shape}}; std::map> max_input_shape = { {"read_file_0.tmp_0", max_shape}, {"read_file_0.tmp_1", max_shape}, {"read_file_0.tmp_2", max_shape}, - {"matmul_0.tmp_0", 
{batch, max_seq_len, max_seq_len}}}; + {"read_file_0.tmp_4", max_shape}}; std::map> opt_input_shape = { {"read_file_0.tmp_0", opt_shape}, {"read_file_0.tmp_1", opt_shape}, {"read_file_0.tmp_2", opt_shape}, - {"matmul_0.tmp_0", {batch, opt_seq_len, opt_seq_len}}}; + {"read_file_0.tmp_4", opt_shape}}; auto precision = AnalysisConfig::Precision::kFloat32; if (with_fp16) { diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc index 17fedc3d3b8bb..43dfb893c5dfd 100644 --- a/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_ernie_test.cc @@ -86,16 +86,16 @@ void run(const AnalysisConfig& config, std::vector* out_data) { void trt_ernie(bool with_fp16, std::vector result) { AnalysisConfig config; std::string model_dir = FLAGS_infer_model; - SetConfig(&config, model_dir, true /* use_gpu */); + SetConfig(&config, model_dir, true); config.SwitchUseFeedFetchOps(false); - int batch = 1; + int batch = 32; int min_seq_len = 1; int max_seq_len = 128; int opt_seq_len = 128; - std::vector min_shape = {batch, min_seq_len, 1}; + std::vector min_shape = {1, min_seq_len, 1}; std::vector max_shape = {batch, max_seq_len, 1}; std::vector opt_shape = {batch, opt_seq_len, 1}; // Set the input's min, max, opt shape @@ -103,17 +103,17 @@ void trt_ernie(bool with_fp16, std::vector result) { {"read_file_0.tmp_0", min_shape}, {"read_file_0.tmp_1", min_shape}, {"read_file_0.tmp_2", min_shape}, - {"matmul_0.tmp_0", {batch, min_seq_len, min_seq_len}}}; + {"read_file_0.tmp_4", min_shape}}; std::map> max_input_shape = { {"read_file_0.tmp_0", max_shape}, {"read_file_0.tmp_1", max_shape}, {"read_file_0.tmp_2", max_shape}, - {"matmul_0.tmp_0", {batch, max_seq_len, max_seq_len}}}; + {"read_file_0.tmp_4", max_shape}}; std::map> opt_input_shape = { {"read_file_0.tmp_0", opt_shape}, {"read_file_0.tmp_1", opt_shape}, {"read_file_0.tmp_2", opt_shape}, - {"matmul_0.tmp_0", {batch, opt_seq_len, opt_seq_len}}}; + {"read_file_0.tmp_4", opt_shape}}; auto precision = AnalysisConfig::Precision::kFloat32; if (with_fp16) { @@ -124,6 +124,7 @@ void trt_ernie(bool with_fp16, std::vector result) { opt_input_shape); std::vector out_data; run(config, &out_data); + for (size_t i = 0; i < out_data.size(); i++) { EXPECT_NEAR(result[i], out_data[i], 1e-5); } diff --git a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h index 922340b08c638..792737865ba17 100644 --- a/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h +++ b/paddle/fluid/operators/tensorrt/tensorrt_engine_op.h @@ -278,9 +278,11 @@ class TensorRTEngineOp : public framework::OperatorBase { buffers[bind_index] = static_cast(t.data()); } else if (type == framework::proto::VarType::INT64) { buffers[bind_index] = static_cast(t.data()); + } else if (type == framework::proto::VarType::INT32) { + buffers[bind_index] = static_cast(t.data()); } else { PADDLE_THROW(platform::errors::Fatal( - "The TRT Engine OP only support float and int64_t input.")); + "The TRT Engine OP only support float/int32_t/int64_t input.")); } } diff --git a/paddle/fluid/platform/dynload/tensorrt.cc b/paddle/fluid/platform/dynload/tensorrt.cc index b7b8a749d2ac0..6232a6e33cac4 100644 --- a/paddle/fluid/platform/dynload/tensorrt.cc +++ b/paddle/fluid/platform/dynload/tensorrt.cc @@ -22,19 +22,15 @@ namespace dynload { std::once_flag tensorrt_dso_flag; void* tensorrt_dso_handle; +std::once_flag 
tensorrt_plugin_dso_flag; +void* tensorrt_plugin_dso_handle; + #define DEFINE_WRAP(__name) DynLoad__##__name __name TENSORRT_RAND_ROUTINE_EACH(DEFINE_WRAP); +TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DEFINE_WRAP); -void* GetTensorRtHandle() { -#if defined(__APPLE__) || defined(__OSX__) - std::string dso_name = "libnvinfer.dylib"; -#elif defined(_WIN32) - std::string dso_name = "nvinfer.dll"; -#else - std::string dso_name = "libnvinfer.so"; -#endif - +void* GetDsoHandle(const std::string& dso_name) { #if !defined(_WIN32) int dynload_flags = RTLD_LAZY | RTLD_LOCAL; #else @@ -49,10 +45,31 @@ void* GetTensorRtHandle() { "library is not found. Ignore this if TensorRT is not needed."; std::cerr << error_msg; } - return dso_handle; } +void* GetTensorRtHandle() { +#if defined(__APPLE__) || defined(__OSX__) + std::string dso_name = "libnvinfer.dylib"; +#elif defined(_WIN32) + std::string dso_name = "nvinfer.dll"; +#else + std::string dso_name = "libnvinfer.so"; +#endif + return GetDsoHandle(dso_name); +} + +void* GetTensorRtPluginHandle() { +#if defined(__APPLE__) || defined(__OSX__) + std::string dso_name = "libnvinfer_plugin.dylib"; +#elif defined(_WIN32) + std::string dso_name = "nvinfer_plugin.dll"; +#else + std::string dso_name = "libnvinfer_plugin.so"; +#endif + return GetDsoHandle(dso_name); +} + } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/tensorrt.h b/paddle/fluid/platform/dynload/tensorrt.h index c6650c0b041e2..dbd5e5e2d65e6 100644 --- a/paddle/fluid/platform/dynload/tensorrt.h +++ b/paddle/fluid/platform/dynload/tensorrt.h @@ -14,6 +14,7 @@ limitations under the License. */ #pragma once #include +#include #if !defined(_WIN32) #include #endif @@ -32,6 +33,10 @@ void* GetTensorRtHandle(); extern std::once_flag tensorrt_dso_flag; extern void* tensorrt_dso_handle; +void* GetTensorRtPluginHandle(); +extern std::once_flag tensorrt_plugin_dso_flag; +extern void* tensorrt_plugin_dso_handle; + #define DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP(__name) \ struct DynLoad__##__name { \ template \ @@ -50,7 +55,26 @@ extern void* tensorrt_dso_handle; }; \ extern DynLoad__##__name __name +#define DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + auto operator()(Args... args) -> DECLARE_TYPE(__name, args...) 
{ \ + std::call_once(tensorrt_plugin_dso_flag, []() { \ + tensorrt_plugin_dso_handle = \ + paddle::platform::dynload::GetTensorRtPluginHandle(); \ + }); \ + static void* p_##__name = dlsym(tensorrt_plugin_dso_handle, #__name); \ + PADDLE_ENFORCE_NOT_NULL(p_##__name, \ + platform::errors::Unavailable( \ + "Load tensorrt plugin %s failed", #__name)); \ + using tensorrt_plugin_func = decltype(&::__name); \ + return reinterpret_cast(p_##__name)(args...); \ + } \ + }; \ + extern DynLoad__##__name __name + #ifdef NV_TENSORRT_MAJOR + #if (NV_TENSORRT_MAJOR >= 6) #define TENSORRT_RAND_ROUTINE_EACH(__macro) \ __macro(createInferBuilder_INTERNAL); \ @@ -62,8 +86,13 @@ extern void* tensorrt_dso_handle; __macro(createInferRuntime_INTERNAL); #endif +#define TENSORRT_PLUGIN_RAND_ROUTINE_EACH(__macro) \ + __macro(initLibNvInferPlugins); + TENSORRT_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_WRAP) -#endif +TENSORRT_PLUGIN_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_TENSORRT_PLUGIN_WRAP) + +#endif // end of NV_TENSORRT_MAJOR } // namespace dynload } // namespace platform diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index 61b8b1643665c..a0cb096193fcd 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -487,6 +487,8 @@ void BindAnalysisConfig(py::module *m) { py::arg("optim_input_shape") = std::map>({}), py::arg("disable_trt_plugin_fp16") = false) + .def("enable_tensorrt_oss", &AnalysisConfig::EnableTensorRtOSS) + .def("tensorrt_oss_enabled", &AnalysisConfig::tensorrt_oss_enabled) .def("tensorrt_engine_enabled", &AnalysisConfig::tensorrt_engine_enabled) .def("enable_lite_engine", &AnalysisConfig::EnableLiteEngine, py::arg("precision_mode") = AnalysisConfig::Precision::kFloat32, diff --git a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py index 0d32af7c2870d..0209bb344ece7 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/inference_pass_test.py @@ -20,6 +20,7 @@ import unittest import numpy as np +import paddle import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import PaddleTensor @@ -34,6 +35,7 @@ class InferencePassTest(unittest.TestCase): def __init__(self, methodName='runTest'): + paddle.enable_static() super(InferencePassTest, self).__init__(methodName) self.main_program = fluid.Program() self.startup_program = fluid.Program() @@ -211,6 +213,7 @@ def check_output_with_option(self, if flatten: out = out.flatten() analysis_output = analysis_output.flatten() + self.assertTrue( np.allclose( out, analysis_output, atol=atol), @@ -232,6 +235,7 @@ def check_output_with_option(self, if flatten: out = out.flatten() tensorrt_output = tensorrt_output.flatten() + self.assertTrue( np.allclose( out, tensorrt_output, atol=atol), diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py new file mode 100644 index 0000000000000..94434f4043448 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_trt_matmul.py @@ -0,0 +1,116 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from inference_pass_test import InferencePassTest +import paddle.fluid as fluid +import paddle.fluid.core as core +from paddle.fluid.core import PassVersionChecker +from paddle.fluid.core import AnalysisConfig + + +class TensorRTMatMulDims2Test(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[24, 24], dtype="float32") + matmul_out = fluid.layers.matmul( + x=data, + y=data, + transpose_x = self.transpose_x, + transpose_y = self.transpose_y, + alpha = self.alpha) + out = fluid.layers.batch_norm(matmul_out, is_test=True) + + self.feeds = { + "data": np.ones([24, 24]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TensorRTMatMulDims2Test.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def set_params(self): + self.transpose_x = True + self.transpose_y = True + self.alpha = 2.0 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TensorRTMatMulTest(InferencePassTest): + def setUp(self): + self.set_params() + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data( + name="data", shape=[-1, 6, 24, 24], dtype="float32") + matmul_out = fluid.layers.matmul( + x=data, + y=data, + transpose_x = self.transpose_x, + transpose_y = self.transpose_y, + alpha = self.alpha) + out = fluid.layers.batch_norm(matmul_out, is_test=True) + + self.feeds = { + "data": np.ones([1, 6, 24, 24]).astype("float32"), + } + self.enable_trt = True + self.trt_parameters = TensorRTMatMulTest.TensorRTParam( + 1 << 30, 32, 0, AnalysisConfig.Precision.Float32, False, False) + self.fetch_list = [out] + + def set_params(self): + self.transpose_x = False + self.transpose_y = False + self.alpha = 1.0 + + def test_check_output(self): + if core.is_compiled_with_cuda(): + use_gpu = True + self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')) + + +class TensorRTMatMulTransposeXTest(TensorRTMatMulTest): + def set_params(self): + self.transpose_x = True + self.transpose_y = False + self.alpha = 1.0 + + +class TensorRTMatMulTransposeYTest(TensorRTMatMulTest): + def set_params(self): + self.transpose_x = False + self.transpose_y = True + self.alpha = 1.0 + + +class TensorRTMatMulScaleTest(TensorRTMatMulTest): + def set_params(self): + self.transpose_x = False + self.transpose_y = False + self.alpha = 2.0 + + +if __name__ == "__main__": + unittest.main() From c1c3e217262198e0caa179b06ef4ada04a4527d4 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Wed, 4 Nov 2020 09:58:35 +0800 Subject: [PATCH 105/185] retry will not be executed when the number of failed ut is greater than 20 (#28374) * retry will not be executed when the number of failed ut is greater than 20 * add log display * fix some error * fix some error * 
fix some error * fix some error --- paddle/scripts/paddle_build.sh | 189 ++++++++++++++++++--------------- 1 file changed, 105 insertions(+), 84 deletions(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 87fb6628f4223..315e2ac7af003 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -568,35 +568,46 @@ EOF retry_time=3 exec_times=0 exec_time_array=('first' 'second' 'third') + exec_retry_threshold=20 if [ -n "$failed_test_lists" ];then mactest_error=1 - while ( [ $exec_times -lt $retry_time ] && [ -n "${failed_test_lists}" ] ) - do - retry_unittests_record="$retry_unittests_record$failed_test_lists" - failed_test_lists_ult=`echo "${failed_test_lists}"` - read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(" | sed 's/(//' | sed 's/- //' ) - echo "=========================================" - echo "This is the ${exec_time_array[$exec_times]} time to re-run" - echo "=========================================" - echo "The following unittest will be re-run:" - echo "${retry_unittests}" - echo "=========================================" - - retry_unittests_regular='' - for line in ${retry_unittests[@]} ; - do - if [[ "$retry_unittests_regular" == "" ]];then - retry_unittests_regular="^$line$" - else - retry_unittests_regular="$retry_unittests_regular|^$line$" - fi - done - rm -f $tmp_dir/* - failed_test_lists='' - ctest -R "($retry_unittests_regular)" --output-on-failure -j $2 | tee $tmpfile - collect_failed_tests - exec_times=$[$exec_times+1] - done + read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(" | sed 's/(//' | sed 's/- //' ) + need_retry_ut_arr=(${need_retry_ut_str}) + need_retry_ut_count=${#need_retry_ut_arr[@]} + if [ $need_retry_ut_count -lt $exec_retry_threshold ];then + while ( [ $exec_times -lt $retry_time ] && [ -n "${failed_test_lists}" ] ) + do + retry_unittests_record="$retry_unittests_record$failed_test_lists" + failed_test_lists_ult=`echo "${failed_test_lists}"` + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(" | sed 's/(//' | sed 's/- //' ) + echo "=========================================" + echo "This is the ${exec_time_array[$exec_times]} time to re-run" + echo "=========================================" + echo "The following unittest will be re-run:" + echo "${retry_unittests}" + echo "=========================================" + + retry_unittests_regular='' + for line in ${retry_unittests[@]} ; + do + if [[ "$retry_unittests_regular" == "" ]];then + retry_unittests_regular="^$line$" + else + retry_unittests_regular="$retry_unittests_regular|^$line$" + fi + done + rm -f $tmp_dir/* + failed_test_lists='' + ctest -R "($retry_unittests_regular)" --output-on-failure -j $2 | tee $tmpfile + collect_failed_tests + exec_times=$[$exec_times+1] + done + else + echo "=========================================" + echo "There are more than 20 failed unit tests, so no unit test retry!!!" + echo "=========================================" + fi + fi #mactest_error=$? 
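+        # NOTE: the retry above only runs when fewer than
+        # $exec_retry_threshold (20) unittests failed; with 20 or more
+        # failures the list is reported directly without re-running.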
ut_endTime_s=`date +%s` @@ -1080,71 +1091,81 @@ set +x retry_unittests_record='' retry_time=3 exec_time_array=('first' 'second' 'third') + exec_retry_threshold=20 if [ -n "$failed_test_lists" ];then - while ( [ $exec_times -lt $retry_time ] && [ -n "${failed_test_lists}" ] ) - do - - retry_unittests_record="$retry_unittests_record$failed_test_lists" - failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` - read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) - echo "=========================================" - echo "This is the ${exec_time_array[$exec_times]} time to re-run" - echo "=========================================" - echo "The following unittest will be re-run:" - echo "${failed_test_lists_ult}" + read need_retry_ut_str <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + need_retry_ut_arr=(${need_retry_ut_str}) + need_retry_ut_count=${#need_retry_ut_arr[@]} + if [ $need_retry_ut_count -lt $exec_retry_threshold ];then + while ( [ $exec_times -lt $retry_time ] && [ -n "${failed_test_lists}" ] ) + do - for line in ${retry_unittests[@]} ; - do - - one_card_tests=$single_card_tests'|'$single_card_tests_1 - - read tmp_one_tmp <<< "$( echo $one_card_tests | grep -oEi $line )" - read tmp_mul_tmp <<< "$( echo $multiple_card_tests | grep -oEi $line )" - read exclusive_tmp <<< "$( echo $exclusive_tests | grep -oEi $line )" - - if [[ "$tmp_one_tmp" != "" ]]; then - if [[ "$one_card_retry" == "" ]]; then - one_card_retry="^$line$" - else - one_card_retry="$one_card_retry|^$line$" - fi - elif [[ "$tmp_mul_tmp" != "" ]]; then - if [[ "$multiple_card_retry" == "" ]]; then - multiple_card_retry="^$line$" + retry_unittests_record="$retry_unittests_record$failed_test_lists" + failed_test_lists_ult=`echo "${failed_test_lists}" |grep -Po '[^ ].*$'` + read retry_unittests <<< $(echo "$failed_test_lists" | grep -oEi "\-.+\(.+\)" | sed 's/(.\+)//' | sed 's/- //' ) + echo "=========================================" + echo "This is the ${exec_time_array[$exec_times]} time to re-run" + echo "=========================================" + echo "The following unittest will be re-run:" + echo "${failed_test_lists_ult}" + + for line in ${retry_unittests[@]} ; + do + + one_card_tests=$single_card_tests'|'$single_card_tests_1 + + read tmp_one_tmp <<< "$( echo $one_card_tests | grep -oEi $line )" + read tmp_mul_tmp <<< "$( echo $multiple_card_tests | grep -oEi $line )" + read exclusive_tmp <<< "$( echo $exclusive_tests | grep -oEi $line )" + + if [[ "$tmp_one_tmp" != "" ]]; then + if [[ "$one_card_retry" == "" ]]; then + one_card_retry="^$line$" + else + one_card_retry="$one_card_retry|^$line$" + fi + elif [[ "$tmp_mul_tmp" != "" ]]; then + if [[ "$multiple_card_retry" == "" ]]; then + multiple_card_retry="^$line$" + else + multiple_card_retry="$multiple_card_retry|^$line$" + fi else - multiple_card_retry="$multiple_card_retry|^$line$" + if [[ "$exclusive_retry" == "" ]];then + exclusive_retry="^$line$" + else + exclusive_retry="$exclusive_retry|^$line$" + fi fi - else - if [[ "$exclusive_retry" == "" ]];then - exclusive_retry="^$line$" - else - exclusive_retry="$exclusive_retry|^$line$" - fi - fi - done + done - if [[ "$one_card_retry" != "" ]]; then - card_test "$one_card_retry" 1 - fi + if [[ "$one_card_retry" != "" ]]; then + card_test "$one_card_retry" 1 + fi - if [[ "$multiple_card_retry" != "" ]]; then - card_test "$multiple_card_retry" 2 - fi + if [[ "$multiple_card_retry" != "" ]]; then + 
card_test "$multiple_card_retry" 2 + fi - if [[ "$exclusive_retry" != "" ]]; then - card_test "$exclusive_retry" - fi - - exec_times=$[$exec_times+1] - failed_test_lists='' - collect_failed_tests - rm -f $tmp_dir/* - one_card_retry='' - multiple_card_retry='' - exclusive_retry='' - retry_unittests='' - done + if [[ "$exclusive_retry" != "" ]]; then + card_test "$exclusive_retry" + fi + + exec_times=$[$exec_times+1] + failed_test_lists='' + collect_failed_tests + rm -f $tmp_dir/* + one_card_retry='' + multiple_card_retry='' + exclusive_retry='' + retry_unittests='' + done + else + echo "=========================================" + echo "There are more than 20 failed unit tests, so no unit test retry!!!" + echo "=========================================" + fi fi if [[ "$EXIT_CODE" != "0" ]]; then From 21a63f6f90fcea6ff6b892709372cab35b8cf06e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Wed, 4 Nov 2020 10:25:16 +0800 Subject: [PATCH 106/185] enhance the op_version_registry, test=develop (#28347) * enhance the op_version_registry, test=develop * add unittests, test=develop * enhance the op_version_registry, test=develop * fix bugs, test=develop * revert pybind_boost_headers.h, test=develop * fix a attribute bug, test=develop --- paddle/fluid/framework/ir/CMakeLists.txt | 4 +- paddle/fluid/framework/op_version_registry.cc | 72 ++++++ paddle/fluid/framework/op_version_registry.h | 234 ++++++++++-------- .../fluid/framework/op_version_registry.inl | 42 ++++ .../framework/op_version_registry_test.cc | 11 +- paddle/fluid/operators/CMakeLists.txt | 1 + paddle/fluid/pybind/CMakeLists.txt | 2 +- paddle/fluid/pybind/compatible.cc | 120 ++++++++- .../fluid/tests/unittests/test_op_version.py | 83 +++++++ python/paddle/utils/__init__.py | 1 + python/paddle/utils/op_version.py | 70 ++++++ 11 files changed, 518 insertions(+), 122 deletions(-) create mode 100644 paddle/fluid/framework/op_version_registry.inl create mode 100644 python/paddle/fluid/tests/unittests/test_op_version.py create mode 100644 python/paddle/utils/op_version.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index f9ab60c5c7478..f2f7e16ff2bbe 100644 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -23,9 +23,9 @@ function(pass_library TARGET DEST) cmake_parse_arguments(pass_library "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) if(pass_library_DIR) - cc_library(${TARGET} SRCS ${pass_library_DIR}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${pass_library_DEPS}) + cc_library(${TARGET} SRCS ${pass_library_DIR}/${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base op_version_registry ${pass_library_DEPS}) else() - cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base ${pass_library_DEPS}) + cc_library(${TARGET} SRCS ${TARGET}.cc DEPS graph_pattern_detector pass fuse_pass_base op_version_registry ${pass_library_DEPS}) endif() # add more DEST here, such as train, dist and collect USE_PASS into a file automatically. diff --git a/paddle/fluid/framework/op_version_registry.cc b/paddle/fluid/framework/op_version_registry.cc index 9a67c160f0233..38eb8af77db7d 100644 --- a/paddle/fluid/framework/op_version_registry.cc +++ b/paddle/fluid/framework/op_version_registry.cc @@ -13,3 +13,75 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_version_registry.h" + +namespace paddle { +namespace framework { +namespace compatible { + +namespace { +template +OpUpdate* new_update(InfoType&& info) { + return new OpUpdate(info); +} +} + +OpVersionDesc&& OpVersionDesc::ModifyAttr(const std::string& name, + const std::string& remark, + const OpAttrVariantT& default_value) { + infos_.emplace_back(new_update( + OpAttrInfo(name, remark, default_value))); + return std::move(*this); +} + +OpVersionDesc&& OpVersionDesc::NewAttr(const std::string& name, + const std::string& remark, + const OpAttrVariantT& default_value) { + infos_.emplace_back(new_update( + OpAttrInfo(name, remark, default_value))); + return std::move(*this); +} + +OpVersionDesc&& OpVersionDesc::NewInput(const std::string& name, + const std::string& remark) { + infos_.emplace_back( + new_update(OpInputOutputInfo(name, remark))); + return std::move(*this); +} + +OpVersionDesc&& OpVersionDesc::NewOutput(const std::string& name, + const std::string& remark) { + infos_.emplace_back( + new_update(OpInputOutputInfo(name, remark))); + return std::move(*this); +} + +OpVersionDesc&& OpVersionDesc::BugfixWithBehaviorChanged( + const std::string& remark) { + infos_.emplace_back(new_update( + OpBugfixInfo(remark))); + return std::move(*this); +} + +OpVersion& OpVersionRegistrar::Register(const std::string& op_type) { + PADDLE_ENFORCE_EQ( + op_version_map_.find(op_type), op_version_map_.end(), + platform::errors::AlreadyExists( + "'%s' is registered in operator version more than once.", op_type)); + op_version_map_.insert( + std::pair{op_type, OpVersion()}); + return op_version_map_[op_type]; +} +uint32_t OpVersionRegistrar::version_id(const std::string& op_type) const { + PADDLE_ENFORCE_NE( + op_version_map_.count(op_type), 0, + platform::errors::InvalidArgument( + "The version of operator type %s has not been registered.", op_type)); + return op_version_map_.find(op_type)->second.version_id(); +} + +// Provide a fake registration item for pybind testing. +#include "paddle/fluid/framework/op_version_registry.inl" + +} // namespace compatible +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index 5ddaf1bd8d8ce..5822dfa11dd25 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -20,7 +20,7 @@ limitations under the License. */ #include #include -#include +#include #include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/op_version_proto.h" #include "paddle/fluid/platform/enforce.h" @@ -29,160 +29,173 @@ namespace paddle { namespace framework { namespace compatible { -struct OpUpdateRecord { - enum class Type { - kInvalid = 0, - kModifyAttr, - kNewAttr, - kNewInput, - kNewOutput, - kBugfixWithBehaviorChanged, - }; - Type type_; - std::string remark_; +using OpAttrVariantT = + boost::variant, /* AttrType::BOOLS */ + std::vector, /* AttrType::FLOATS */ + std::vector, /* AttrType::INTS */ + std::vector, /* AttrType::LONGS */ + std::vector /* AttrType::STRINGS */ + >; + +struct OpUpdateInfo { + virtual ~OpUpdateInfo() = default; }; -struct ModifyAttr : OpUpdateRecord { - ModifyAttr(const std::string& name, const std::string& remark, - const boost::any& default_value) - : OpUpdateRecord({Type::kModifyAttr, remark}), - name_(name), - default_value_(default_value) { - // TODO(Shixiaowei02): Check the data type with proto::OpDesc. 
- } +struct OpAttrInfo : OpUpdateInfo { + OpAttrInfo(const std::string& name, const std::string& remark, + const OpAttrVariantT& default_value) + : name_{name}, default_value_{default_value}, remark_{remark} {} + + const std::string& name() const { return name_; } + const OpAttrVariantT& default_value() const { return default_value_; } + const std::string& remark() const { return remark_; } private: std::string name_; - boost::any default_value_; + OpAttrVariantT default_value_; + std::string remark_; }; -struct NewAttr : OpUpdateRecord { - NewAttr(const std::string& name, const std::string& remark, - const boost::any& default_value) - : OpUpdateRecord({Type::kNewAttr, remark}), - name_(name), - default_value_(default_value) {} +struct OpInputOutputInfo : OpUpdateInfo { + OpInputOutputInfo(const std::string& name, const std::string& remark) + : name_{name}, remark_{remark} {} + + const std::string& name() const { return name_; } + const std::string& remark() const { return remark_; } private: std::string name_; - boost::any default_value_; + std::string remark_; }; -struct NewInput : OpUpdateRecord { - NewInput(const std::string& name, const std::string& remark) - : OpUpdateRecord({Type::kNewInput, remark}), name_(name) {} +struct OpBugfixInfo : OpUpdateInfo { + explicit OpBugfixInfo(const std::string& remark) : remark_{remark} {} + const std::string& remark() const { return remark_; } private: - std::string name_; + std::string remark_; }; -struct NewOutput : OpUpdateRecord { - NewOutput(const std::string& name, const std::string& remark) - : OpUpdateRecord({Type::kNewOutput, remark}), name_(name) {} +enum class OpUpdateType { + kInvalid = 0, + kModifyAttr, + kNewAttr, + kNewInput, + kNewOutput, + kBugfixWithBehaviorChanged, +}; - private: - std::string name_; +class OpUpdateBase { + public: + virtual const OpUpdateInfo* info() const = 0; + virtual OpUpdateType type() const = 0; + virtual ~OpUpdateBase() = default; }; -struct BugfixWithBehaviorChanged : OpUpdateRecord { - explicit BugfixWithBehaviorChanged(const std::string& remark) - : OpUpdateRecord({Type::kBugfixWithBehaviorChanged, remark}) {} +template +class OpUpdate : public OpUpdateBase { + public: + explicit OpUpdate(const InfoType& info) : info_{info}, type_{type__} {} + const OpUpdateInfo* info() const override { return &info_; } + OpUpdateType type() const override { return type_; } + + private: + InfoType info_; + OpUpdateType type_; }; class OpVersionDesc { public: - OpVersionDesc& ModifyAttr(const std::string& name, const std::string& remark, - boost::any default_value) { - infos_.push_back(std::shared_ptr( - new compatible::ModifyAttr(name, remark, default_value))); - return *this; + OpVersionDesc&& ModifyAttr(const std::string& name, const std::string& remark, + const OpAttrVariantT& default_value); + OpVersionDesc&& NewAttr(const std::string& name, const std::string& remark, + const OpAttrVariantT& default_value); + OpVersionDesc&& NewInput(const std::string& name, const std::string& remark); + OpVersionDesc&& NewOutput(const std::string& name, const std::string& remark); + OpVersionDesc&& BugfixWithBehaviorChanged(const std::string& remark); + const std::vector>& infos() const { + return infos_; } - OpVersionDesc& NewAttr(const std::string& name, const std::string& remark, - boost::any default_value) { - infos_.push_back(std::shared_ptr( - new compatible::NewAttr(name, remark, default_value))); - return *this; - } + OpVersionDesc() = default; + OpVersionDesc(OpVersionDesc&&) = default; + OpVersionDesc& 
operator=(OpVersionDesc&&) = default; - OpVersionDesc& NewInput(const std::string& name, const std::string& remark) { - infos_.push_back(std::shared_ptr( - new compatible::NewInput(name, remark))); - return *this; - } + private: + std::vector> infos_; +}; - OpVersionDesc& NewOutput(const std::string& name, const std::string& remark) { - infos_.push_back(std::shared_ptr( - new compatible::NewOutput(name, remark))); - return *this; - } +class OpCheckpoint { + public: + OpCheckpoint(const std::string& note, OpVersionDesc&& op_version_desc) + : note_{note}, + op_version_desc_{std::forward(op_version_desc)} {} + const std::string& note() const { return note_; } + const OpVersionDesc& version_desc() { return op_version_desc_; } - OpVersionDesc& BugfixWithBehaviorChanged(const std::string& remark) { - infos_.push_back(std::shared_ptr( - new compatible::BugfixWithBehaviorChanged(remark))); - return *this; - } + OpCheckpoint() = default; + OpCheckpoint(OpCheckpoint&&) = default; + OpCheckpoint& operator=(OpCheckpoint&&) = default; private: - std::vector> infos_; + std::string note_; + OpVersionDesc op_version_desc_; }; class OpVersion { public: OpVersion& AddCheckpoint(const std::string& note, - const OpVersionDesc& op_version_desc) { - checkpoints_.push_back(Checkpoint({note, op_version_desc})); + OpVersionDesc&& op_version_desc) { + checkpoints_.emplace_back(OpCheckpoint{note, std::move(op_version_desc)}); return *this; } - uint32_t GetVersionID() const { + uint32_t version_id() const { return static_cast(checkpoints_.size()); } + const std::vector& checkpoints() const { return checkpoints_; } + + OpVersion() = default; + OpVersion(OpVersion&&) = default; + OpVersion& operator=(OpVersion&&) = default; private: - struct Checkpoint { - std::string note_; - OpVersionDesc op_version_desc_; - }; - std::vector checkpoints_; + std::vector checkpoints_; }; class OpVersionRegistrar { public: + OpVersionRegistrar() = default; static OpVersionRegistrar& GetInstance() { static OpVersionRegistrar instance; return instance; } - OpVersion& Register(const std::string& op_type) { - PADDLE_ENFORCE_EQ( - op_version_map_.find(op_type), op_version_map_.end(), - platform::errors::AlreadyExists( - "'%s' is registered in operator version more than once.", op_type)); - op_version_map_.insert({op_type, OpVersion()}); - return op_version_map_[op_type]; - } + OpVersion& Register(const std::string& op_type); const std::unordered_map& GetVersionMap() { return op_version_map_; } - uint32_t GetVersionID(const std::string& op_type) const { - auto it = op_version_map_.find(op_type); - if (it == op_version_map_.end()) { - return 0; - } - return it->second.GetVersionID(); + bool Has(const std::string& op_type) const { + return op_version_map_.count(op_type); } + uint32_t version_id(const std::string& op_type) const; private: std::unordered_map op_version_map_; - - OpVersionRegistrar() = default; - OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete; }; +inline const std::unordered_map& get_op_version_map() { + return OpVersionRegistrar::GetInstance().GetVersionMap(); +} + inline void SaveOpVersions( const std::unordered_map& src, pb::OpVersionMap* dst) { for (const auto& pair : src) { - (*dst)[pair.first].SetVersionID(pair.second.GetVersionID()); + (*dst)[pair.first].SetVersionID(pair.second.version_id()); } } @@ -192,21 +205,24 @@ class OpVersionComparator { virtual ~OpVersionComparator() = default; }; -#define ADD_OP_VERSION_COMPARATOR(cmp_name, cmp_math) \ - class OpVersion##cmp_name##Comparator : public 
OpVersionComparator { \ - public: \ - explicit OpVersion##cmp_name##Comparator(const std::string op_name, \ - uint32_t target_version) \ - : op_name_(op_name), target_version_(target_version) {} \ - virtual bool operator()() { \ - return OpVersionRegistrar::GetInstance().GetVersionID(op_name_) \ - cmp_math target_version_; \ - } \ - virtual ~OpVersion##cmp_name##Comparator() {} \ - \ - private: \ - std::string op_name_; \ - uint32_t target_version_; \ +#define ADD_OP_VERSION_COMPARATOR(cmp_name, cmp_math) \ + class OpVersion##cmp_name##Comparator : public OpVersionComparator { \ + public: \ + explicit OpVersion##cmp_name##Comparator(const std::string op_name, \ + uint32_t target_version) \ + : op_name_(op_name), target_version_(target_version) {} \ + virtual bool operator()() { \ + uint32_t version_id = 0; \ + if (OpVersionRegistrar::GetInstance().Has(op_name_)) { \ + version_id = OpVersionRegistrar::GetInstance().version_id(op_name_); \ + } \ + return version_id cmp_math target_version_; \ + } \ + virtual ~OpVersion##cmp_name##Comparator() {} \ + \ + private: \ + std::string op_name_; \ + uint32_t target_version_; \ }; ADD_OP_VERSION_COMPARATOR(LE, <=); @@ -310,7 +326,7 @@ class PassVersionCheckerRegistrar { } // namespace paddle #define REGISTER_OP_VERSION(op_type) \ - static paddle::framework::compatible::OpVersion \ + UNUSED static paddle::framework::compatible::OpVersion& \ RegisterOpVersion__##op_type = \ paddle::framework::compatible::OpVersionRegistrar::GetInstance() \ .Register(#op_type) diff --git a/paddle/fluid/framework/op_version_registry.inl b/paddle/fluid/framework/op_version_registry.inl new file mode 100644 index 0000000000000..ec90b3028be22 --- /dev/null +++ b/paddle/fluid/framework/op_version_registry.inl @@ -0,0 +1,42 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
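+
+// The registration below is a test fixture for the pybind layer: every
+// AddCheckpoint() call bumps the op's version by one, so an op with N
+// checkpoints reports version_id() == N. As a rough sketch of the usual
+// registration pattern (the op and attribute names here are illustrative
+// only, not part of this fixture):
+//
+//   REGISTER_OP_VERSION(some_op)
+//       .AddCheckpoint("Add a new attribute [use_xyz]",
+//                      framework::compatible::OpVersionDesc().NewAttr(
+//                          "use_xyz", "remark about use_xyz", true));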
+ +REGISTER_OP_VERSION(for_pybind_test__) + .AddCheckpoint("Note 0", framework::compatible::OpVersionDesc() + .BugfixWithBehaviorChanged( + "BugfixWithBehaviorChanged Remark")) + .AddCheckpoint("Note 1", framework::compatible::OpVersionDesc() + .ModifyAttr("BOOL", "bool", true) + .ModifyAttr("FLOAT", "float", 1.23f) + .ModifyAttr("INT", "int32", -1) + .ModifyAttr("STRING", "std::string", + std::string{"hello"})) + .AddCheckpoint("Note 2", + framework::compatible::OpVersionDesc() + .ModifyAttr("BOOLS", "std::vector", + std::vector{true, false}) + .ModifyAttr("FLOATS", "std::vector", + std::vector{2.56f, 1.28f}) + .ModifyAttr("INTS", "std::vector", + std::vector{10, 100}) + .NewAttr("LONGS", "std::vector", + std::vector{10000001, -10000001})) + .AddCheckpoint("Note 3", framework::compatible::OpVersionDesc() + .NewAttr("STRINGS", "std::vector", + std::vector{"str1", "str2"}) + .ModifyAttr("LONG", "int64", static_cast(10000001)) + .NewInput("NewInput", "NewInput_") + .NewOutput("NewOutput", "NewOutput_") + .BugfixWithBehaviorChanged( + "BugfixWithBehaviorChanged_")); diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index 2b173c9571588..ef8384c1e7ee1 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -21,7 +21,7 @@ namespace framework { namespace compatible { TEST(test_operator_version, test_operator_version) { - REGISTER_OP_VERSION(test__) + REGISTER_OP_VERSION(op_name__) .AddCheckpoint( R"ROC(Fix the bug of reshape op, support the case of axis < 0)ROC", framework::compatible::OpVersionDesc().BugfixWithBehaviorChanged( @@ -56,6 +56,7 @@ TEST(test_operator_version, test_operator_version) { } TEST(test_pass_op_version_checker, test_pass_op_version_checker) { + const std::string fake_op_name{"op_name__"}; ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( "no_bind_pass")); @@ -90,7 +91,7 @@ TEST(test_pass_op_version_checker, test_pass_op_version_checker) { REGISTER_PASS_CAPABILITY(test_pass4) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .GE("test__", 5) + .GE(fake_op_name, 5) .EQ("fc", 0)); ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( "test_pass4")); @@ -98,7 +99,7 @@ TEST(test_pass_op_version_checker, test_pass_op_version_checker) { REGISTER_PASS_CAPABILITY(test_pass5) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .GE("test__", 4) + .GE(fake_op_name, 4) .EQ("fc", 0)); ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( "test_pass5")); @@ -106,7 +107,7 @@ TEST(test_pass_op_version_checker, test_pass_op_version_checker) { REGISTER_PASS_CAPABILITY(test_pass6) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("test__", 4) + .EQ(fake_op_name, 4) .EQ("fc", 0)); ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( "test_pass6")); @@ -114,7 +115,7 @@ TEST(test_pass_op_version_checker, test_pass_op_version_checker) { REGISTER_PASS_CAPABILITY(test_pass7) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .NE("test__", 4) + .NE(fake_op_name, 4) .EQ("fc", 0)); ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( "test_pass7")); diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 5fa8f6bab8cca..ca80ada7b6ea7 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ 
b/paddle/fluid/operators/CMakeLists.txt @@ -104,6 +104,7 @@ endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} device_memory_aligment) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} layer) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} tensor_formatter) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} op_version_registry) # FIXME(typhoonzero): operator deps may not needed. # op_library(lod_tensor_to_array_op DEPS lod_rank_table_op) diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 92d9473141009..6fd1b7e1d36c2 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,7 +1,7 @@ set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context - gloo_wrapper infer_io_utils heter_wrapper generator) + gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry) if (WITH_NCCL) set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper) diff --git a/paddle/fluid/pybind/compatible.cc b/paddle/fluid/pybind/compatible.cc index 971d230458db4..57b024c25cbaf 100644 --- a/paddle/fluid/pybind/compatible.cc +++ b/paddle/fluid/pybind/compatible.cc @@ -13,26 +13,136 @@ // limitations under the License. #include "paddle/fluid/pybind/compatible.h" - #include #include - #include "paddle/fluid/framework/op_version_registry.h" +#include "paddle/fluid/pybind/pybind_boost_headers.h" namespace py = pybind11; -using paddle::framework::compatible::PassVersionCheckerRegistrar; +using paddle::framework::compatible::OpAttrVariantT; +using paddle::framework::compatible::OpUpdateInfo; +using paddle::framework::compatible::OpAttrInfo; +using paddle::framework::compatible::OpInputOutputInfo; +using paddle::framework::compatible::OpBugfixInfo; +using paddle::framework::compatible::OpUpdateType; +using paddle::framework::compatible::OpUpdateBase; +using paddle::framework::compatible::OpVersionDesc; +using paddle::framework::compatible::OpCheckpoint; +using paddle::framework::compatible::OpVersion; namespace paddle { namespace pybind { -void BindCompatible(py::module* m) { +namespace { +using paddle::framework::compatible::PassVersionCheckerRegistrar; +void BindPassVersionChecker(py::module *m) { py::class_(*m, "PassVersionChecker") - .def_static("IsCompatible", [](const std::string& name) -> bool { + .def_static("IsCompatible", [](const std::string &name) -> bool { auto instance = PassVersionCheckerRegistrar::GetInstance(); return instance.IsPassCompatible(name); }); } +void BindPassCompatible(py::module *m) { BindPassVersionChecker(m); } + +void BindOpUpdateInfo(py::module *m) { + py::class_(*m, "OpUpdateInfo").def(py::init<>()); +} + +void BindOpAttrInfo(py::module *m) { + py::class_(*m, "OpAttrInfo") + .def(py::init()) + .def(py::init()) + .def("name", &OpAttrInfo::name) + .def("default_value", &OpAttrInfo::default_value) + .def("remark", &OpAttrInfo::remark); +} + +void BindOpInputOutputInfo(py::module *m) { + py::class_(*m, "OpInputOutputInfo") + .def(py::init()) + .def(py::init()) + .def("name", &OpInputOutputInfo::name) + .def("remark", &OpInputOutputInfo::remark); +} + +void BindOpBugfixInfo(py::module *m) { + py::class_(*m, "OpBugfixInfo") + .def(py::init()) + .def(py::init()) + .def("remark", &OpBugfixInfo::remark); +} + +void BindOpCompatible(py::module *m) { + BindOpUpdateInfo(m); + BindOpAttrInfo(m); + BindOpInputOutputInfo(m); + BindOpBugfixInfo(m); +} + +void 
BindOpUpdateType(py::module *m) {
+  py::enum_<OpUpdateType>(*m, "OpUpdateType")
+      .value("kInvalid", OpUpdateType::kInvalid)
+      .value("kModifyAttr", OpUpdateType::kModifyAttr)
+      .value("kNewAttr", OpUpdateType::kNewAttr)
+      .value("kNewInput", OpUpdateType::kNewInput)
+      .value("kNewOutput", OpUpdateType::kNewOutput)
+      .value("kBugfixWithBehaviorChanged",
+             OpUpdateType::kBugfixWithBehaviorChanged);
+}
+
+void BindOpUpdateBase(py::module *m) {
+  py::class_<OpUpdateBase>(*m, "OpUpdateBase")
+      .def("info", [](const OpUpdateBase &obj) { return obj.info(); },
+           py::return_value_policy::reference)
+      .def("type", &OpUpdateBase::type);
+}
+
+void BindOpVersionDesc(py::module *m) {
+  py::class_<OpVersionDesc>(*m, "OpVersionDesc")
+      // Pybind11 does not yet support the transfer of `const
+      // std::vector<std::unique_ptr<OpUpdateBase>>&` type objects.
+      .def("infos", [](const OpVersionDesc &obj) {
+        auto pylist = py::list();
+        for (const auto &ptr : obj.infos()) {
+          auto pyobj = py::cast(*ptr, py::return_value_policy::reference);
+          pylist.append(pyobj);
+        }
+        return pylist;
+      });
+}
+
+void BindOpCheckpoint(py::module *m) {
+  py::class_<OpCheckpoint>(*m, "OpCheckpoint")
+      .def("note", &OpCheckpoint::note, py::return_value_policy::reference)
+      .def("version_desc", &OpCheckpoint::version_desc,
+           py::return_value_policy::reference);
+}
+
+void BindOpVersion(py::module *m) {
+  py::class_<OpVersion>(*m, "OpVersion")
+      .def("version_id", &OpVersion::version_id,
+           py::return_value_policy::reference)
+      .def("checkpoints", &OpVersion::checkpoints,
+           py::return_value_policy::reference);
+  // At least pybind v2.3.0 is required because of bug #1603 of pybind11.
+  m->def("get_op_version_map", &framework::compatible::get_op_version_map,
+         py::return_value_policy::reference);
+}
+
+}  // namespace
+
+void BindCompatible(py::module *m) {
+  BindPassCompatible(m);
+  BindOpCompatible(m);
+  BindOpUpdateType(m);
+  BindOpUpdateBase(m);
+  BindOpVersionDesc(m);
+  BindOpCheckpoint(m);
+  BindOpVersion(m);
+}
+
 }  // namespace pybind
 }  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/test_op_version.py b/python/paddle/fluid/tests/unittests/test_op_version.py
new file mode 100644
index 0000000000000..1d7167955ac7c
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_op_version.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
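+
+# The assertions below query the version map through two routes:
+# paddle.utils.OpLastCheckpointChecker (the high-level filter) and
+# fluid.core.get_op_version_map() (the raw map). A rough sketch of the
+# checker pattern, using the fixture op registered in
+# op_version_registry.inl:
+#
+#   checker = paddle.utils.OpLastCheckpointChecker()
+#   infos = checker.filter_updates('for_pybind_test__',
+#                                  fluid.core.OpUpdateType.kNewAttr, 'STRINGS')
+#   # each returned entry exposes name() / default_value() / remark()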
+ +from __future__ import print_function + +import unittest + +import paddle.utils as utils +import paddle.fluid as fluid + + +class OpLastCheckpointCheckerTest(unittest.TestCase): + def __init__(self, methodName='runTest'): + super(OpLastCheckpointCheckerTest, self).__init__(methodName) + self.checker = utils.OpLastCheckpointChecker() + self.fake_op = 'for_pybind_test__' + + def test_op_attr_info(self): + update_type = fluid.core.OpUpdateType.kNewAttr + info_list = self.checker.filter_updates(self.fake_op, update_type, + 'STRINGS') + self.assertTrue(info_list) + self.assertEqual(info_list[0].name(), 'STRINGS') + self.assertEqual(info_list[0].default_value(), ['str1', 'str2']) + self.assertEqual(info_list[0].remark(), 'std::vector') + + def test_op_input_output_info(self): + update_type = fluid.core.OpUpdateType.kNewInput + info_list = self.checker.filter_updates(self.fake_op, update_type, + 'NewInput') + self.assertTrue(info_list) + self.assertEqual(info_list[0].name(), 'NewInput') + self.assertEqual(info_list[0].remark(), 'NewInput_') + + def test_op_bug_fix_info(self): + update_type = fluid.core.OpUpdateType.kBugfixWithBehaviorChanged + info_list = self.checker.filter_updates(self.fake_op, update_type) + self.assertTrue(info_list) + self.assertEqual(info_list[0].remark(), 'BugfixWithBehaviorChanged_') + + +class OpVersionTest(unittest.TestCase): + def __init__(self, methodName='runTest'): + super(OpVersionTest, self).__init__(methodName) + self.vmap = fluid.core.get_op_version_map() + self.fake_op = 'for_pybind_test__' + + def test_checkpoints(self): + version_id = self.vmap[self.fake_op].version_id() + checkpoints = self.vmap[self.fake_op].checkpoints() + self.assertEqual(version_id, 4) + self.assertEqual(len(checkpoints), 4) + self.assertEqual(checkpoints[2].note(), 'Note 2') + desc_1 = checkpoints[1].version_desc().infos() + self.assertEqual(desc_1[0].info().default_value(), True) + self.assertAlmostEqual(desc_1[1].info().default_value(), 1.23, 2) + self.assertEqual(desc_1[2].info().default_value(), -1) + self.assertEqual(desc_1[3].info().default_value(), 'hello') + desc_2 = checkpoints[2].version_desc().infos() + self.assertEqual(desc_2[0].info().default_value(), [True, False]) + true_l = [2.56, 1.28] + self.assertEqual(len(true_l), len(desc_2[1].info().default_value())) + for i in range(len(true_l)): + self.assertAlmostEqual(desc_2[1].info().default_value()[i], + true_l[i], 2) + self.assertEqual(desc_2[2].info().default_value(), [10, 100]) + self.assertEqual(desc_2[3].info().default_value(), + [10000001, -10000001]) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/utils/__init__.py b/python/paddle/utils/__init__.py index 9d7a05131ffa1..faf0fd4984d7c 100644 --- a/python/paddle/utils/__init__.py +++ b/python/paddle/utils/__init__.py @@ -17,6 +17,7 @@ from .profiler import get_profiler from .deprecated import deprecated from .lazy_import import try_import +from .op_version import OpLastCheckpointChecker from .install_check import run_check from ..fluid.framework import unique_name from ..fluid.framework import load_op_library diff --git a/python/paddle/utils/op_version.py b/python/paddle/utils/op_version.py new file mode 100644 index 0000000000000..68acc9de08151 --- /dev/null +++ b/python/paddle/utils/op_version.py @@ -0,0 +1,70 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..fluid import core + +__all__ = ['OpLastCheckpointChecker'] + + +def Singleton(cls): + _instance = {} + + def _singleton(*args, **kargs): + if cls not in _instance: + _instance[cls] = cls(*args, **kargs) + return _instance[cls] + + return _singleton + + +class OpUpdateInfoHelper(object): + def __init__(self, info): + self._info = info + + def verify_key_value(self, name=''): + result = False + key_funcs = { + core.OpAttrInfo: 'name', + core.OpInputOutputInfo: 'name', + } + if name == '': + result = True + elif type(self._info) in key_funcs: + if getattr(self._info, key_funcs[type(self._info)])() == name: + result = True + return result + + +@Singleton +class OpLastCheckpointChecker(object): + def __init__(self): + self.raw_version_map = core.get_op_version_map() + self.checkpoints_map = {} + self._construct_map() + + def _construct_map(self): + for op_name in self.raw_version_map: + last_checkpoint = self.raw_version_map[op_name].checkpoints()[-1] + infos = last_checkpoint.version_desc().infos() + self.checkpoints_map[op_name] = infos + + def filter_updates(self, op_name, type=core.OpUpdateType.kInvalid, key=''): + updates = [] + if op_name in self.checkpoints_map: + for update in self.checkpoints_map[op_name]: + if (update.type() == type) or ( + type == core.OpUpdateType.kInvalid): + if OpUpdateInfoHelper(update.info()).verify_key_value(key): + updates.append(update.info()) + return updates From 337d3832f32a64b27274ab7131cdd05867b1c6d0 Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Wed, 4 Nov 2020 10:25:26 +0800 Subject: [PATCH 107/185] refine (#28366) --- cmake/cupti.cmake | 2 ++ 1 file changed, 2 insertions(+) diff --git a/cmake/cupti.cmake b/cmake/cupti.cmake index 72ed0f1e5858d..17626688531e6 100644 --- a/cmake/cupti.cmake +++ b/cmake/cupti.cmake @@ -8,6 +8,7 @@ find_path(CUPTI_INCLUDE_DIR cupti.h PATHS ${CUPTI_ROOT} ${CUPTI_ROOT}/include $ENV{CUPTI_ROOT} $ENV{CUPTI_ROOT}/include ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/include + ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/include NO_DEFAULT_PATH ) @@ -27,6 +28,7 @@ list(APPEND CUPTI_CHECK_LIBRARY_DIRS $ENV{CUPTI_ROOT}/lib64 $ENV{CUPTI_ROOT}/lib /usr/lib + ${CUDA_TOOLKIT_ROOT_DIR}/targets/x86_64-linux/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/extras/CUPTI/lib64) find_library(CUPTI_LIBRARY NAMES libcupti.so libcupti.dylib # libcupti_static.a PATHS ${CUPTI_CHECK_LIBRARY_DIRS} ${CUPTI_INCLUDE_DIR} ${__libpath_hist} From 8b2436a776e5cb5d0ebc8bc06f9624ecd87c9189 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 4 Nov 2020 10:43:33 +0800 Subject: [PATCH 108/185] Add broadcast_shape api (#28257) * add broadcast_shape api * add ut * follow comments * add example code, test=dodument_fix * update example code, test=document_fix --- .../operators/common_infer_shape_functions.cc | 27 +++++++++----- .../operators/common_infer_shape_functions.h | 5 ++- paddle/fluid/pybind/pybind.cc | 7 ++++ python/paddle/__init__.py | 4 ++- .../tests/unittests/test_broadcast_shape.py | 35 +++++++++++++++++++ python/paddle/tensor/__init__.py | 4 ++- python/paddle/tensor/math.py | 30 +++++++++++++++- 7 files changed, 99 insertions(+), 13 deletions(-) 
create mode 100644 python/paddle/fluid/tests/unittests/test_broadcast_shape.py diff --git a/paddle/fluid/operators/common_infer_shape_functions.cc b/paddle/fluid/operators/common_infer_shape_functions.cc index ce622d7501f90..c10bba74ce7c7 100644 --- a/paddle/fluid/operators/common_infer_shape_functions.cc +++ b/paddle/fluid/operators/common_infer_shape_functions.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/common_infer_shape_functions.h" + #include #include @@ -28,6 +29,7 @@ class InferShapeContext; namespace paddle { namespace operators { namespace details { + inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, const framework::DDim &y_dims, int *x_dims_array, int *y_dims_array, @@ -76,6 +78,20 @@ inline void GetBroadcastDimsArrays(const framework::DDim &x_dims, } } } + +framework::DDim BroadcastTwoDims(const framework::DDim &x_dims, + const framework::DDim &y_dims, int axis) { + int max_dim = std::max(x_dims.size(), y_dims.size()); + axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); + std::vector x_dims_array(max_dim); + std::vector y_dims_array(max_dim); + std::vector out_dims_array(max_dim); + GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), + y_dims_array.data(), out_dims_array.data(), max_dim, + axis); + return framework::make_ddim(out_dims_array); +} + } // namespace details // shape input(0) -> output(0) without change. @@ -153,16 +169,9 @@ void BinaryOpBroadcastInferShape(framework::InferShapeContext *ctx) { ctx->ShareDim(x_name, /*->*/ out_name); ctx->ShareLoD(x_name, /*->*/ out_name); } else { - int max_dim = std::max(x_dims.size(), y_dims.size()); int axis = ctx->Attrs().Get("axis"); - axis = (axis == -1 ? std::abs(x_dims.size() - y_dims.size()) : axis); - std::vector x_dims_array(max_dim); - std::vector y_dims_array(max_dim); - std::vector out_dims_array(max_dim); - details::GetBroadcastDimsArrays(x_dims, y_dims, x_dims_array.data(), - y_dims_array.data(), out_dims_array.data(), - max_dim, axis); - ctx->SetOutputDim(out_name, framework::make_ddim(out_dims_array)); + auto out_dims = details::BroadcastTwoDims(x_dims, y_dims, axis); + ctx->SetOutputDim(out_name, out_dims); ctx->ShareLoD(x_name, /*->*/ out_name); } } diff --git a/paddle/fluid/operators/common_infer_shape_functions.h b/paddle/fluid/operators/common_infer_shape_functions.h index 922d5262abc42..2c28db4324e6d 100644 --- a/paddle/fluid/operators/common_infer_shape_functions.h +++ b/paddle/fluid/operators/common_infer_shape_functions.h @@ -28,7 +28,10 @@ class InferShapeContext; namespace paddle { namespace operators { - +namespace details { +framework::DDim BroadcastTwoDims(const framework::DDim& x_dims, + const framework::DDim& y_dims, int axis = -1); +} // shape input(0) -> output(0) without change. void UnaryOpUnchangedInferShape(framework::InferShapeContext* ctx); // shape input(0) -> output(0) without change, check if axis in range [-Rank(x), diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 736669fa4ef92..a7e3cd82d26a4 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -54,6 +54,7 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/memory/allocation/mmap_allocator.h" #include "paddle/fluid/operators/activation_op.h" +#include "paddle/fluid/operators/common_infer_shape_functions.h" #include "paddle/fluid/operators/py_func_op.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" @@ -467,6 +468,12 @@ PYBIND11_MODULE(core_noavx, m) { << ", sci_mode=" << print_opt.sci_mode; }); + m.def("broadcast_shape", [](const std::vector &x_dim, + const std::vector &y_dim) { + return vectorize(operators::details::BroadcastTwoDims( + make_ddim(x_dim), make_ddim(y_dim), -1)); + }); + m.def( "_append_python_callable_object_and_return_id", [](py::object py_obj) -> size_t { diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index c8e0d830f4e17..50c1142c7bfb6 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -200,6 +200,8 @@ from .tensor.math import isinf #DEFINE_ALIAS from .tensor.math import isnan #DEFINE_ALIAS from .tensor.math import prod #DEFINE_ALIAS +from .tensor.math import broadcast_shape #DEFINE_ALIAS + from .tensor.random import multinomial #DEFINE_ALIAS from .tensor.random import standard_normal from .tensor.random import normal @@ -220,7 +222,7 @@ from .tensor.search import nonzero #DEFINE_ALIAS from .tensor.search import sort #DEFINE_ALIAS -from .tensor.to_string import set_printoptions +from .tensor.to_string import set_printoptions #DEFINE_ALIAS from .framework.random import seed #DEFINE_ALIAS from .framework.random import get_cuda_rng_state #DEFINE_ALIAS diff --git a/python/paddle/fluid/tests/unittests/test_broadcast_shape.py b/python/paddle/fluid/tests/unittests/test_broadcast_shape.py new file mode 100644 index 0000000000000..b4ac096a69685 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_broadcast_shape.py @@ -0,0 +1,35 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
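+
+# The expected results below follow the broadcasting rule implemented by
+# BroadcastTwoDims: shapes are aligned from the trailing dimension, each
+# pair of dims must be equal or contain a 1, and a -1 (a dim unknown at
+# compile time) propagates to the output. A rough pure-Python sketch of
+# that rule (illustrative only):
+#
+#   def broadcast(xs, ys):
+#       xs, ys = list(xs)[::-1], list(ys)[::-1]
+#       out = []
+#       for i in range(max(len(xs), len(ys))):
+#           a = xs[i] if i < len(xs) else 1
+#           b = ys[i] if i < len(ys) else 1
+#           if a == -1 or b == -1:
+#               out.append(-1)
+#           elif a == b or a == 1 or b == 1:
+#               out.append(max(a, b))
+#           else:
+#               raise ValueError("incompatible shapes")
+#       return out[::-1]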
+
+import unittest
+import numpy as np
+import paddle
+
+
+class TestBroadcastShape(unittest.TestCase):
+    def test_result(self):
+        shape = paddle.broadcast_shape([2, 1, 3], [1, 3, 1])
+        self.assertEqual(shape, [2, 3, 3])
+
+        shape = paddle.broadcast_shape(
+            [-1, 1, 3], [1, 3, 1])  #support compile time infershape
+        self.assertEqual(shape, [-1, 3, 3])
+
+    def test_error(self):
+        self.assertRaises(ValueError, paddle.broadcast_shape, [2, 1, 3],
+                          [3, 3, 1])
+
+
+if __name__ == "__main__":
+    unittest.main()
diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py
index 43e6c9654c4d8..2a9820d4a90d3 100755
--- a/python/paddle/tensor/__init__.py
+++ b/python/paddle/tensor/__init__.py
@@ -164,6 +164,8 @@
 from .math import prod  #DEFINE_ALIAS
 from .math import all  #DEFINE_ALIAS
 from .math import any  #DEFINE_ALIAS
+from .math import broadcast_shape  #DEFINE_ALIAS
+
 from .random import multinomial  #DEFINE_ALIAS
 from .random import standard_normal
 from .random import normal
@@ -194,4 +196,4 @@
 # from .tensor import Tensor  #DEFINE_ALIAS
 # from .tensor import LoDTensor  #DEFINE_ALIAS
 # from .tensor import LoDTensorArray  #DEFINE_ALIAS
-from .to_string import set_printoptions
+from .to_string import set_printoptions  #DEFINE_ALIAS
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index 36793e0769672..c83e788538e1c 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -121,7 +121,8 @@
         'kron',
         'isfinite',
         'isinf',
-        'isnan'
+        'isnan',
+        'broadcast_shape'
 ]
 # yapf: enable.
@@ -2133,3 +2134,30 @@ def any(x, axis=None, keepdim=False, name=None):
         outputs={'Out': out},
         attrs=attrs)
     return out
+
+def broadcast_shape(x_shape, y_shape):
+    """
+    Returns the result shape of broadcasting two tensors with shapes x_shape and y_shape; please refer to :ref:`user_guide_broadcasting` for more details.
+
+    Args:
+        x_shape (list[int]|tuple[int]): The shape of a tensor.
+        y_shape (list[int]|tuple[int]): The shape of a tensor.
+
+    Returns:
+        list[int], the result shape.
+
+    Examples:
+        .. code-block:: python
+
+            import paddle
+
+            shape = paddle.broadcast_shape([2, 1, 3], [1, 3, 1])
+            # [2, 3, 3]
+
+            # shape = paddle.broadcast_shape([2, 1, 3], [3, 3, 1])
+            # raises ValueError: the two shapes are not broadcastable.
+
+    """
+
+    return core.broadcast_shape(x_shape, y_shape)
From 71d6220772cdbdeba305a758166da25bd1c2cd5d Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Wed, 4 Nov 2020 10:47:52 +0800
Subject: [PATCH 109/185] Skip reader op in mixed_precision decorator (#28353)

* skip reader op in mixed_precision decorator

* add ut
---
 .../contrib/mixed_precision/fp16_utils.py     |  8 ++++
 .../tests/test_image_classification_fp16.py   | 37 +++++++++++++++++++
 2 files changed, 45 insertions(+)

diff --git a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
index 0ff166d8dc89a..1d9f8af10200e 100644
--- a/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
+++ b/python/paddle/fluid/contrib/mixed_precision/fp16_utils.py
@@ -215,6 +215,14 @@ def rewrite_program(main_prog, amp_lists):
     white_op_set = set()
     black_op_set = set()
     for op in ops:
+
+        # NOTE(zhiqiu): 'create_py_reader' and 'read' are used by the
+        # non-iterable DataLoader, so we don't need to handle reader ops;
+        # besides, the input of 'create_py_reader' is not in the block,
+        # which may result in errors.
+        # See GeneratorLoader._init_non_iterable() for details.
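+        # Such reader ops only feed data into the program and carry no
+        # float computation, so there is nothing for the AMP pass to cast;
+        # skipping them also keeps the pass away from variables that live
+        # outside the current block.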
+        if op.type == 'create_py_reader' or op.type == 'read':
+            continue
+
         if amp_lists.black_varnames is not None and _is_in_black_varnames(
                 op, amp_lists):
             black_op_set.add(op)
diff --git a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
index 1bf1a23483467..b29cd265bd64c 100644
--- a/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
+++ b/python/paddle/fluid/contrib/tests/test_image_classification_fp16.py
@@ -417,5 +417,42 @@ def scope_prog_guard(self):
             yield
 
 
+class TestAmpWithNonIterableDataLoader(unittest.TestCase):
+    def decorate_with_data_loader(self):
+        main_prog = paddle.static.Program()
+        start_prog = paddle.static.Program()
+        with paddle.static.program_guard(main_prog, start_prog):
+            with paddle.fluid.unique_name.guard():
+                image = fluid.layers.data(
+                    name='image', shape=[3, 224, 224], dtype='float32')
+                label = fluid.layers.data(
+                    name='label', shape=[1], dtype='int64')
+                py_reader = fluid.io.DataLoader.from_generator(
+                    feed_list=[image, label],
+                    capacity=4,
+                    iterable=False,
+                    use_double_buffer=False)
+
+                net = vgg16_bn_drop(image)
+                logits = fluid.layers.fc(input=net, size=10, act="softmax")
+                cost, predict = fluid.layers.softmax_with_cross_entropy(
+                    logits, label, return_softmax=True)
+                avg_cost = fluid.layers.mean(cost)
+
+                optimizer = fluid.optimizer.Lamb(learning_rate=0.001)
+                amp_lists = fluid.contrib.mixed_precision.AutoMixedPrecisionLists(
+                    custom_black_varnames={"loss", "conv2d_0.w_0"})
+                mp_optimizer = fluid.contrib.mixed_precision.decorate(
+                    optimizer=optimizer,
+                    amp_lists=amp_lists,
+                    init_loss_scaling=8.0,
+                    use_dynamic_loss_scaling=True)
+
+                mp_optimizer.minimize(avg_cost)
+
+    def test_non_iterable_dataloader(self):
+        self.decorate_with_data_loader()
+
+
 if __name__ == '__main__':
     unittest.main()
From 95b1868366887e8b84dd8636601c066b3ef0b2f8 Mon Sep 17 00:00:00 2001
From: Kaipeng Deng
Date: Wed, 4 Nov 2020 12:07:15 +0800
Subject: [PATCH 110/185] update DataLoader doc (#28290)

* update DataLoader doc. test=develop
---
 python/paddle/fluid/reader.py | 103 ++++++++++++----------------------
 1 file changed, 35 insertions(+), 68 deletions(-)

diff --git a/python/paddle/fluid/reader.py b/python/paddle/fluid/reader.py
index 35dcd45223419..0e7fd35f5842e 100644
--- a/python/paddle/fluid/reader.py
+++ b/python/paddle/fluid/reader.py
@@ -153,18 +153,22 @@ class DataLoader(object):
     multi-process workers will be used to load data asynchronously if
     :attr:`num_workers` is set as a positive number.
 
-    DataLoader only supports map-style dataset(can get a sample from
-    dataset with a given index) currently, for a map-style dataset,
-    please see :code:`paddle.io.Dataset`.
+    DataLoader supports map-style datasets and iterable-style datasets.
 
-    batch_sampler please see :code:`paddle.io.BatchSampler`
+    For a map-style dataset (one from which a sample can be fetched with
+    a given index), please see :code:`paddle.io.Dataset`.
+
+    For an iterable-style dataset (one that yields samples iteratively,
+    like a Python iterator), please see :code:`paddle.io.IterableDataset`.
+
+    For :code:`batch_sampler`, please see :code:`paddle.io.BatchSampler`.
 
     Args:
         dataset(Dataset): the dataset to load data from, should be an
             instance of subclass of :code:`paddle.io.Dataset` or
            :code:`paddle.io.IterableDataset`.
         feed_list (list(Tensor)|tuple(Tensor)): feed variable list.
-            The variables should be created by :code:`fluid.data()`.
+            The variables should be created by :code:`paddle.static.data()`.
:attr:`feed_list` must be set if :attr:`return_list` is False. Default None. places(list(Place)|tuple(Place)|optional): a list of Place, @@ -173,10 +177,10 @@ class DataLoader(object): will be used. Default None. return_list (bool): whether the return value on each device is presented as a list. If :attr:`return_list=False`, the return - value on each device would be a dict of str -> LoDTensor, where + value on each device would be a dict of str -> Tensor, where the key of the dict is the name of each fed variables. If :attr:`return_list=True`, the return value on each device would - be a list(LoDTensor). :attr:`return_list` can only be True + be a list(Tensor). :attr:`return_list` can only be True in dynamic graph mode. Default False. batch_sampler(BatchSampler): an instance of `paddle.io.BatchSampler` to generate batch indices to draw samples from :attr:`dataset` @@ -224,7 +228,8 @@ class DataLoader(object): import numpy as np import paddle - import paddle.fluid as fluid + import paddle.nn as nn + import paddle.nn.functional as F from paddle.io import Dataset, BatchSampler, DataLoader BATCH_NUM = 20 @@ -234,8 +239,6 @@ class DataLoader(object): IMAGE_SIZE = 784 CLASS_NUM = 10 - USE_GPU = False # whether use GPU to run model - # define a random dataset class RandomDataset(Dataset): def __init__(self, num_samples): @@ -251,78 +254,34 @@ def __len__(self): dataset = RandomDataset(BATCH_NUM * BATCH_SIZE) - # get places - places = fluid.cuda_places() if USE_GPU else fluid.cpu_places() - - # --------------------- dygraph mode -------------------- - - class SimpleNet(fluid.dygraph.Layer): + class SimpleNet(nn.Layer): def __init__(self): super(SimpleNet, self).__init__() - self.fc = fluid.dygraph.nn.Linear(IMAGE_SIZE, CLASS_NUM, act='softmax') + self.fc = nn.Linear(IMAGE_SIZE, CLASS_NUM) def forward(self, image, label=None): return self.fc(image) - with fluid.dygraph.guard(places[0]): - simple_net = SimpleNet() - opt = fluid.optimizer.SGD(learning_rate=1e-3, - parameter_list=simple_net.parameters()) - - loader = DataLoader(dataset, - batch_size=BATCH_SIZE, - shuffle=True, - drop_last=True, - num_workers=2) - - for e in range(EPOCH_NUM): - for i, (image, label) in enumerate(loader()): - out = simple_net(image) - loss = fluid.layers.cross_entropy(out, label) - avg_loss = fluid.layers.reduce_mean(loss) - avg_loss.backward() - opt.minimize(avg_loss) - simple_net.clear_gradients() - print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) - - # ------------------------------------------------------- - - # -------------------- static graph --------------------- - - paddle.enable_static() - - def simple_net(image, label): - fc_tmp = fluid.layers.fc(image, size=CLASS_NUM, act='softmax') - cross_entropy = fluid.layers.softmax_with_cross_entropy(image, label) - loss = fluid.layers.reduce_mean(cross_entropy) - sgd = fluid.optimizer.SGD(learning_rate=1e-3) - sgd.minimize(loss) - return loss - - image = fluid.data(name='image', shape=[None, IMAGE_SIZE], dtype='float32') - label = fluid.data(name='label', shape=[None, 1], dtype='int64') - - loss = simple_net(image, label) - - exe = fluid.Executor(places[0]) - exe.run(fluid.default_startup_program()) - - prog = fluid.CompiledProgram(fluid.default_main_program()).with_data_parallel(loss_name=loss.name) + simple_net = SimpleNet() + opt = paddle.optimizer.SGD(learning_rate=1e-3, + parameters=simple_net.parameters()) loader = DataLoader(dataset, - feed_list=[image, label], - batch_size=BATCH_SIZE, + batch_size=BATCH_SIZE, shuffle=True, 
drop_last=True, num_workers=2) for e in range(EPOCH_NUM): - for i, data in enumerate(loader()): - l = exe.run(prog, feed=data, fetch_list=[loss], return_numpy=True) - print("Epoch {} batch {}: loss = {}".format(e, i, l[0][0])) + for i, (image, label) in enumerate(loader()): + out = simple_net(image) + loss = F.cross_entropy(out, label) + avg_loss = paddle.mean(loss) + avg_loss.backward() + opt.minimize(avg_loss) + simple_net.clear_gradients() + print("Epoch {} batch {}: loss = {}".format(e, i, np.mean(loss.numpy()))) - # ------------------------------------------------------- - .. note:: For reading iterable dataset with multiprocess Dataloader, @@ -439,6 +398,10 @@ def from_generator(feed_list=None, use_multiprocess=False, drop_last=True): """ + .. warning:: + This API will be deprecated in the future, it is recommended to use + :code:`paddle.io.DataLoader` which supports multi-processes acceleration. + .. note:: **The framework ensures that the data loading order of DataLoader is exactly the same as the user-defined data source.** @@ -684,6 +647,10 @@ def run_inference(drop_last): @staticmethod def from_dataset(dataset, places, drop_last=True): """ + .. warning:: + This API will be deprecated in the future, it is recommended to use + :code:`paddle.io.DataLoader` which supports multi-processes acceleration. + Create an iterable DataLoader object for loading data from Dataset. Dataset is only supported in Linux system currently. From 05114693cfb13eec3efb7cba0b9a52b411300126 Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 4 Nov 2020 00:29:15 -0600 Subject: [PATCH 111/185] [Inference] Memory modification for ShrinkMemory. (#28355) --- paddle/fluid/memory/allocation/CMakeLists.txt | 1 + paddle/fluid/memory/allocation/allocator.h | 3 + .../memory/allocation/allocator_facade.cc | 5 ++ .../memory/allocation/allocator_facade.h | 3 + .../auto_growth_best_fit_allocator.h | 3 + .../auto_growth_best_fit_allocator_test.cc | 1 + .../allocation/naive_best_fit_allocator.cc | 51 +++++++++++++ .../allocation/naive_best_fit_allocator.h | 1 + .../naive_best_fit_allocator_test.cc | 74 +++++++++++++++++++ .../fluid/memory/allocation/retry_allocator.h | 3 + .../memory/allocation/retry_allocator_test.cc | 2 + .../allocation/thread_local_allocator.cc | 2 + .../allocation/thread_local_allocator.h | 4 + .../allocation/thread_local_allocator_test.cc | 1 + paddle/fluid/memory/detail/buddy_allocator.cc | 41 +++++++++- paddle/fluid/memory/detail/buddy_allocator.h | 7 ++ .../memory/detail/buddy_allocator_test.cc | 17 +++++ paddle/fluid/memory/malloc.cc | 4 + paddle/fluid/memory/malloc.h | 2 + 19 files changed, 223 insertions(+), 2 deletions(-) create mode 100644 paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc diff --git a/paddle/fluid/memory/allocation/CMakeLists.txt b/paddle/fluid/memory/allocation/CMakeLists.txt index 9cc7c267454a4..8a1a1115ad7bd 100644 --- a/paddle/fluid/memory/allocation/CMakeLists.txt +++ b/paddle/fluid/memory/allocation/CMakeLists.txt @@ -4,6 +4,7 @@ cc_library(locked_allocator SRCS locked_allocator.cc DEPS allocator) cc_library(buffered_allocator SRCS buffered_allocator.cc DEPS allocator) cc_library(best_fit_allocator SRCS best_fit_allocator.cc DEPS allocator) cc_library(naive_best_fit_allocator SRCS naive_best_fit_allocator.cc DEPS allocator buddy_allocator profiler) +cc_test(naive_best_fit_allocator_test SRCS naive_best_fit_allocator_test.cc DEPS naive_best_fit_allocator) cc_test(buffered_allocator_test SRCS buffered_allocator_test.cc DEPS locked_allocator buffered_allocator 
cpu_allocator best_fit_allocator) if (WITH_MKLDNN) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index e54748a53679d..b83d3efb72b71 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -178,12 +178,15 @@ class Allocator { FreeImpl(allocation); } + inline void Release(const platform::Place& place) { ReleaseImpl(place); } + // True if the `Allocate` is thread safe. virtual bool IsAllocThreadSafe() const; protected: virtual Allocation* AllocateImpl(size_t size) = 0; virtual void FreeImpl(Allocation* allocation); + virtual void ReleaseImpl(const platform::Place& place) {} }; using AllocationDeleter = Allocator::AllocationDeleter; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 3213684c140b0..59b06d082872c 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -287,6 +287,11 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, return m_->GetAllocator(place, size)->Allocate(size); } +void AllocatorFacade::Release(const platform::Place& place) { + m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) + ->Release(place); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 64b6fe25c352e..2f2f222f6c74a 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -44,6 +44,9 @@ class AllocatorFacade { // Allocate a unique allocation. AllocationPtr Alloc(const platform::Place& place, size_t size); + // Release unused memory pool. + void Release(const platform::Place& place); + // TODO(yy): Allocate a Copy-On-Write allocation? private: AllocatorFacade(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h index cbc126264ac2c..b55ebf18934f2 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h @@ -39,6 +39,9 @@ class AutoGrowthBestFitAllocator : public Allocator { void FreeImpl(Allocation *allocation) override; + // Release the memory block which is not used in pool. 
+ void ReleaseImpl(const platform::Place &place) override { FreeIdleChunks(); } + private: void FreeIdleChunks(); diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc index 685248a88f71d..dbe2f0ac94453 100644 --- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc +++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator_test.cc @@ -65,6 +65,7 @@ static void TestFreeIdleChunk(bool free_idle_chunk, } else { ASSERT_EQ(recorded_allocator->AllocatedSize(), memory_size + alignment); } + ag_allocator->Release(platform::CPUPlace()); } } diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc index c661c9f9c3750..842ebd16cf8af 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc @@ -53,6 +53,9 @@ void *Alloc(const Place &place, size_t size); template void Free(const Place &place, void *p, size_t size); +template +void Release(const Place &place); + template size_t Used(const Place &place); @@ -99,6 +102,11 @@ void Free(const platform::CPUPlace &place, void *p, GetCPUBuddyAllocator()->Free(p); } +template <> +void Release(const platform::CPUPlace &place) { + GetCPUBuddyAllocator()->Release(); +} + template <> size_t Used(const platform::CPUPlace &place) { return GetCPUBuddyAllocator()->Used(); @@ -186,6 +194,17 @@ void Free(const platform::XPUPlace &place, void *p, #endif } +template <> +void Release(const platform::XPUPlace &place) { +#ifdef PADDLE_WITH_XPU + PADDLE_THROW( + platform::errors::PermissionDenied("Release XPU pool is not supported.")); +#else + PADDLE_THROW( + platform::errors::PermissionDenied("'XPUPlace' is not supported.")); +#endif +} + template <> size_t Used(const platform::XPUPlace &place) { #ifdef PADDLE_WITH_XPU @@ -313,6 +332,16 @@ void Free(const platform::CUDAPlace &place, void *p, #endif } +template <> +void Release(const platform::CUDAPlace &place) { +#ifdef PADDLE_WITH_CUDA + GetGPUBuddyAllocator(place.device)->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CUDAPlace' is not supported in CPU only device.")); +#endif +} + #ifdef PADDLE_WITH_CUDA BuddyAllocator *GetCUDAPinnedBuddyAllocator() { static std::once_flag init_flag; @@ -371,6 +400,17 @@ void Free(const platform::CUDAPinnedPlace &place, #endif } +template <> +void Release( + const platform::CUDAPinnedPlace &place) { +#ifdef PADDLE_WITH_CUDA + GetCUDAPinnedBuddyAllocator()->Release(); +#else + PADDLE_THROW(platform::errors::PermissionDenied( + "'CUDAPinnedPlace' is not supported in CPU only device.")); +#endif +} + struct AllocVisitor : public boost::static_visitor { inline explicit AllocVisitor(size_t size) : size_(size) {} @@ -397,6 +437,13 @@ struct FreeVisitor : public boost::static_visitor { size_t size_; }; +struct ReleaseVisitor : public boost::static_visitor { + template + inline void operator()(const Place &place) const { + Release(place); + } +}; + size_t Usage::operator()(const platform::CPUPlace &cpu) const { return Used(cpu); } @@ -439,6 +486,10 @@ void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) { delete allocation; } +void NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) { + boost::apply_visitor(legacy::ReleaseVisitor(), place); +} + } // namespace allocation } // namespace memory } // namespace paddle diff --git 
a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h index 4cf1bd6123e5f..ba4c4ca226b1e 100644 --- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h @@ -35,6 +35,7 @@ class NaiveBestFitAllocator : public Allocator { protected: Allocation *AllocateImpl(size_t size) override; void FreeImpl(Allocation *allocation) override; + void ReleaseImpl(const platform::Place &place) override; private: platform::Place place_; diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc new file mode 100644 index 0000000000000..054c75b11f78c --- /dev/null +++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator_test.cc @@ -0,0 +1,74 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/memory/allocation/naive_best_fit_allocator.h" + +#include +#include // NOLINT +#include // NOLINT +#include // NOLINT +#include +#include // NOLINT +#include + +#include "gtest/gtest.h" + +namespace paddle { +namespace memory { +namespace allocation { + +TEST(NaiveBestFitAllocatorTest, CpuAlloc) { + NaiveBestFitAllocator alloc{platform::CPUPlace()}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + alloc.Release(platform::CPUPlace()); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::CPUPlace()); +} + +#ifdef PADDLE_WITH_CUDA +TEST(NaiveBestFitAllocatorTest, GpuAlloc) { + NaiveBestFitAllocator alloc{platform::CUDAPlace(0)}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + alloc.Release(platform::CUDAPlace(0)); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::CUDAPlace(0)); +} + +TEST(NaiveBestFitAllocatorTest, CudaPinnedAlloc) { + NaiveBestFitAllocator alloc{platform::CUDAPinnedPlace()}; + { + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + } + alloc.Release(platform::CUDAPinnedPlace()); + + size_t size = (1 << 20); + auto allocation = alloc.Allocate(size); + alloc.Release(platform::CUDAPinnedPlace()); +} +#endif + +} // namespace allocation +} // namespace memory +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h index 4a787ff2d7b38..74828a0ede3f4 100644 --- a/paddle/fluid/memory/allocation/retry_allocator.h +++ b/paddle/fluid/memory/allocation/retry_allocator.h @@ -47,6 +47,9 @@ class RetryAllocator : public Allocator { protected: void FreeImpl(Allocation* allocation) override; Allocation* AllocateImpl(size_t size) override; + void ReleaseImpl(const platform::Place& place) override { + underlying_allocator_->Release(place); + } private: std::shared_ptr underlying_allocator_; diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc 
b/paddle/fluid/memory/allocation/retry_allocator_test.cc index b80e48460bf9f..13b77c660ca8f 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -96,6 +96,7 @@ TEST(RetryAllocator, RetryAllocator) { bool is_all_equal = std::all_of(addresses.begin(), addresses.end(), [val](void *p) { return p == val; }); ASSERT_TRUE(is_all_equal); + allocator->Release(platform::CPUPlace()); } } @@ -135,6 +136,7 @@ TEST(RetryAllocator, RetryAllocatorLastAllocFailure) { auto allocation = allocator.Allocate(allocate_size); ASSERT_TRUE(false); allocation.reset(); + allocator.Release(p); } catch (BadAlloc &ex) { ASSERT_TRUE(std::string(ex.what()).find("Cannot allocate") != std::string::npos); diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc index 50fe9c9b75249..d2a8250d3db58 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc @@ -72,6 +72,8 @@ void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) { delete allocation; } +void ThreadLocalAllocatorImpl::ReleaseImpl() { buddy_allocator_->Release(); } + } // namespace allocation } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h index 10ca4b828a4bb..764509e75ba23 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator.h +++ b/paddle/fluid/memory/allocation/thread_local_allocator.h @@ -52,6 +52,7 @@ class ThreadLocalAllocatorImpl explicit ThreadLocalAllocatorImpl(const platform::Place& p); ThreadLocalAllocation* AllocateImpl(size_t size); void FreeImpl(ThreadLocalAllocation* allocation); + void ReleaseImpl(); private: std::unique_ptr buddy_allocator_; @@ -91,6 +92,9 @@ class ThreadLocalCUDAAllocator : public Allocator { auto allocator_impl = tl_allocation->GetAllocator(); allocator_impl->FreeImpl(tl_allocation); } + void ReleaseImpl(const platform::Place& p) override { + return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->ReleaseImpl(); + } private: int gpu_id_; diff --git a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc index f9e2ea8c27a74..70fd3a48d7861 100644 --- a/paddle/fluid/memory/allocation/thread_local_allocator_test.cc +++ b/paddle/fluid/memory/allocation/thread_local_allocator_test.cc @@ -62,6 +62,7 @@ TEST(ThreadLocalAllocator, cross_scope_release) { auto tl_allocator_impl = ThreadLocalCUDAAllocatorPool::Instance().Get(devices[j]); allocator_addresses[j][i] = tl_allocator_impl.get(); + memory::Release(platform::CUDAPlace(devices[j])); } }); } diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index 6ac99744d7938..e7738d0714751 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -39,9 +39,10 @@ BuddyAllocator::~BuddyAllocator() { while (!pool_.empty()) { auto block = static_cast(std::get<2>(*pool_.begin())); auto desc = cache_.LoadDesc(block); - VLOG(10) << "Free from block (" << block << ", " << desc->get_size() << ")"; + VLOG(10) << "Free from block (" << block << ", " << desc->get_total_size() + << ")"; - system_allocator_->Free(block, desc->get_size(), desc->get_index()); + system_allocator_->Free(block, desc->get_total_size(), desc->get_index()); 
cache_.Invalidate(block); pool_.erase(pool_.begin()); } @@ -161,6 +162,39 @@ void BuddyAllocator::Free(void* p) { IndexSizeAddress(desc->get_index(), desc->get_total_size(), block)); } +void BuddyAllocator::Release() { + std::lock_guard lock(mutex_); + int num = 0; + uint64_t bytes = 0; + bool del_flag = false; + for (auto iter = pool_.begin(); iter != pool_.end();) { + auto remain_size = std::get<1>(*iter); + auto remain_ptr = std::get<2>(*iter); + for (auto& chunk : chunks_) { + auto init_size = std::get<1>(chunk); + auto init_ptr = std::get<2>(chunk); + + if (init_size == remain_size && init_ptr == remain_ptr) { + ++num; + bytes += init_size; + total_free_ -= init_size; + auto block = static_cast(std::get<2>(chunk)); + system_allocator_->Free(init_ptr, init_size, std::get<0>(chunk)); + cache_.Invalidate(block); + del_flag = true; + break; + } + } + + if (del_flag) { + iter = pool_.erase(iter); + } else { + iter++; + } + } + VLOG(10) << "Release " << num << " chunk, Free " << bytes << " bytes."; +} + size_t BuddyAllocator::Used() { return total_used_; } size_t BuddyAllocator::GetMinChunkSize() { return min_chunk_size_; } size_t BuddyAllocator::GetMaxChunkSize() { return max_chunk_size_; } @@ -213,6 +247,9 @@ BuddyAllocator::PoolSet::iterator BuddyAllocator::RefillPool( total_free_ += allocate_bytes; + // record the chunk. + chunks_.insert(IndexSizeAddress(index, allocate_bytes, p)); + // dump the block into pool return pool_.insert(IndexSizeAddress(index, allocate_bytes, p)).first; } diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 791f8b5627772..0bfc8918503b9 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -40,6 +40,8 @@ class BuddyAllocator { public: void* Alloc(size_t unaligned_size); void Free(void* ptr); + // Release the unused memory pool, a real free operation for the OS. + void Release(); size_t Used(); size_t GetMinChunkSize(); size_t GetMaxChunkSize(); @@ -92,6 +94,11 @@ class BuddyAllocator { */ PoolSet pool_; + /** + * \brief Record the allocated chunks when Refill pool. + */ + PoolSet chunks_; + private: /*! 
Unify the metadata format between GPU and CPU allocations */ MetadataCache cache_; diff --git a/paddle/fluid/memory/detail/buddy_allocator_test.cc b/paddle/fluid/memory/detail/buddy_allocator_test.cc index 1722acd10aa38..90f7e33eb3540 100644 --- a/paddle/fluid/memory/detail/buddy_allocator_test.cc +++ b/paddle/fluid/memory/detail/buddy_allocator_test.cc @@ -305,6 +305,23 @@ TEST(BuddyAllocator, SpeedAna) { std::cerr << "time cost " << diff.count() << std::endl; } +TEST(BuddyAllocator, Release) { + // In a 8 GB machine, the pool size will be about 800 MB + FLAGS_fraction_of_gpu_memory_to_use = 0.1; + FLAGS_initial_gpu_memory_in_mb = 0; + FLAGS_reallocate_gpu_memory_in_mb = 0; + + BuddyAllocator buddy_allocator( + std::unique_ptr(new GPUAllocator(TEST_GPU_ID)), + platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + + // Less than pool size + TestBuddyAllocator(&buddy_allocator, 10); + TestBuddyAllocator(&buddy_allocator, 10 << 10); + TestBuddyAllocator(&buddy_allocator, 50 << 20); + + buddy_allocator.Release(); +} #endif } // namespace detail diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index e01f030585a83..2fbde03b42bcc 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -31,5 +31,9 @@ AllocationPtr Alloc(const platform::Place &place, size_t size) { return allocation::AllocatorFacade::Instance().Alloc(place, size); } +void Release(const platform::Place &place) { + return allocation::AllocatorFacade::Instance().Release(place); +} + } // namespace memory } // namespace paddle diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 73487795f752e..3d6836e1d255b 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -38,5 +38,7 @@ extern AllocationPtr Alloc(const platform::Place& place, size_t size); extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size); +extern void Release(const platform::Place& place); + } // namespace memory } // namespace paddle From f401907775beec285ca3e9ab4ed21a3cf5b76e6b Mon Sep 17 00:00:00 2001 From: Li Fuchen Date: Wed, 4 Nov 2020 15:33:05 +0800 Subject: [PATCH 112/185] modified sample code of CTCLoss & ctc_loss by remove disable_static() & print([.*].numpy()) & alias, test=document_fix (#28403) --- python/paddle/nn/functional/loss.py | 5 ++--- python/paddle/nn/layer/loss.py | 7 ++----- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index b056029fb5aa1..ae04cdcc931ec 100644 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1112,7 +1112,6 @@ def ctc_loss(log_probs, input_lengths = np.array([5, 5]).astype("int64") label_lengths = np.array([3, 3]).astype("int64") - paddle.disable_static() log_probs = paddle.to_tensor(log_probs) labels = paddle.to_tensor(labels) input_lengths = paddle.to_tensor(input_lengths) @@ -1123,14 +1122,14 @@ def ctc_loss(log_probs, label_lengths, blank=0, reduction='none') - print(loss.numpy()) #[3.9179852 2.9076521] + print(loss) #[3.9179852 2.9076521] loss = F.ctc_loss(log_probs, labels, input_lengths, label_lengths, blank=0, reduction='mean') - print(loss.numpy()) #[1.1376063] + print(loss) #[1.1376063] """ diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 5ce4baca55749..351afc97a2a88 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -883,8 +883,6 @@ def forward(self, input, other, label): class 
CTCLoss(fluid.dygraph.Layer): """ - :alias_main: paddle.nn.CTCLoss - :alias: paddle.nn.CTCLoss, paddle.nn.layer.CTCLoss, paddle.nn.layer.loss.CTCLoss An operator integrating the open source Warp-CTC library (https://github.com/baidu-research/warp-ctc) to compute Connectionist Temporal Classification (CTC) loss. @@ -941,7 +939,6 @@ class CTCLoss(fluid.dygraph.Layer): input_lengths = np.array([5, 5]).astype("int64") label_lengths = np.array([3, 3]).astype("int64") - paddle.disable_static() log_probs = paddle.to_tensor(log_probs) labels = paddle.to_tensor(labels) input_lengths = paddle.to_tensor(input_lengths) @@ -950,12 +947,12 @@ class CTCLoss(fluid.dygraph.Layer): loss = paddle.nn.CTCLoss(blank=0, reduction='none')(log_probs, labels, input_lengths, label_lengths) - print(loss.numpy()) #[3.9179852 2.9076521] + print(loss) #[3.9179852 2.9076521] loss = paddle.nn.CTCLoss(blank=0, reduction='mean')(log_probs, labels, input_lengths, label_lengths) - print(loss.numpy()) #[1.1376063] + print(loss) #[1.1376063] """ def __init__(self, blank=0, reduction='mean'): From 0d25d55a865e3ff53a65028e0ee44ddd54abf4d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Wed, 4 Nov 2020 17:11:51 +0800 Subject: [PATCH 113/185] update the cmake cmd, test=develop (#28344) --- cmake/external/pybind11.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index 05cc77f23baaa..353cb5c72fdfb 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -34,6 +34,11 @@ ExternalProject_Add( "${PYBIND_DOWNLOAD_CMD}" PREFIX ${PYBIND_PREFIX_DIR} SOURCE_DIR ${PYBIND_SOURCE_DIR} + # If we explicitly leave the `UPDATE_COMMAND` of the ExternalProject_Add + # function in CMakeLists blank, it will cause another parameter GIT_TAG + # to be modified without triggering incremental compilation, and the + # third-party library version changes cannot be incorporated. 
+ # reference: https://cmake.org/cmake/help/latest/module/ExternalProject.html UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" From 12b9587be530d41f4caa67a656c1f61e19a79ac5 Mon Sep 17 00:00:00 2001 From: lidanqing Date: Wed, 4 Nov 2020 12:28:35 +0100 Subject: [PATCH 114/185] Add conv_bias pass version python test (#28278) * add conv_bias pass version test * update according to reviews --- .../ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc | 15 +++-- .../test_conv_bias_mkldnn_fuse_pass.py | 57 ++++++++++++------- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index dfb030a7cc768..76c6ca24aaaf0 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -147,12 +147,19 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const { } // namespace paddle REGISTER_PASS(conv_bias_mkldnn_fuse_pass, paddle::framework::ir::ConvBiasFusePass); -REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass, - paddle::framework::ir::Conv2DTransposeBiasFusePass); -REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, - paddle::framework::ir::Conv3DBiasFusePass); REGISTER_PASS_CAPABILITY(conv_bias_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() .EQ("conv2d", 0) .EQ("elementwise_add", 0)); + +REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass, + paddle::framework::ir::Conv2DTransposeBiasFusePass); +REGISTER_PASS_CAPABILITY(conv_transpose_bias_mkldnn_fuse_pass) + .AddCombination( + paddle::framework::compatible::OpVersionComparatorCombination() + .EQ("conv2d_transpose", 0) + .EQ("elementwise_add", 0)); + +REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, + paddle::framework::ir::Conv3DBiasFusePass); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py index 5eb397b5a95b2..6c8b9d4d3a879 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py @@ -20,11 +20,11 @@ import paddle.fluid as fluid import paddle.fluid.core as core from paddle.fluid.core import AnalysisConfig -"""Test for fusion of conv and bias.""" +from paddle.fluid.core import PassVersionChecker #padding SAME -class ConvBiasMkldnnFusePassTest(InferencePassTest): +class ConvBiasMkldnnFusePassSamePadTest(InferencePassTest): def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data( @@ -48,10 +48,12 @@ def setUp(self): def test_check_output(self): use_gpu = False self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible("conv_bias_mkldnn_fuse_pass")) #padding VALID -class ConvBiasMkldnnFusePassTest1(InferencePassTest): +class ConvBiasMkldnnFusePassValidPadTest(ConvBiasMkldnnFusePassSamePadTest): def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data( @@ -72,13 +74,9 @@ def setUp(self): self.fetch_list = [conv_out] self.enable_mkldnn = True - def test_check_output(self): - use_gpu = False - self.check_output_with_option(use_gpu) - -#padding number -class ConvBiasMkldnnFusePassTest2(InferencePassTest): +#padding EXPLICT NUMBER +class ConvBiasMkldnnFusePassExplictPadTest(ConvBiasMkldnnFusePassSamePadTest): def setUp(self): 
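        # A hedged aside: this subclass only varies the conv/bias graph
        # construction; the fuse pass itself is exercised by the inherited
        # test_check_output, which (per the base class above) amounts to:
        #
        #     self.check_output_with_option(False)  # use_gpu = False
        #     self.assertTrue(PassVersionChecker.IsCompatible(
        #         "conv_bias_mkldnn_fuse_pass"))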
with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data( @@ -99,13 +97,8 @@ def setUp(self): self.fetch_list = [conv_out] self.enable_mkldnn = True - def test_check_output(self): - use_gpu = False - self.check_output_with_option(use_gpu) - -#dilation not supported yet, just print warning log and does not fuse -class ConvBiasMkldnnFusePassTest3(InferencePassTest): +class ConvBiasMkldnnFusePassGroupTest(ConvBiasMkldnnFusePassSamePadTest): def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data( @@ -118,7 +111,6 @@ def setUp(self): num_filters=3, filter_size=3, padding="VALID", - dilation=2, groups=3, bias_attr=param_attr, use_cudnn=False, @@ -131,13 +123,9 @@ def setUp(self): self.fetch_list = [conv_out] self.enable_mkldnn = True - def test_check_output(self): - use_gpu = False - self.check_output_with_option(use_gpu) - -#all conv params except for dilation -class ConvBiasMkldnnFusePassTest4(InferencePassTest): +class ConvBiasMkldnnFusePassDialtionsGroupsTest( + ConvBiasMkldnnFusePassSamePadTest): def setUp(self): with fluid.program_guard(self.main_program, self.startup_program): data = fluid.data( @@ -150,6 +138,7 @@ def setUp(self): num_filters=3, filter_size=3, padding="VALID", + dilation=2, groups=3, bias_attr=param_attr, use_cudnn=False, @@ -162,9 +151,33 @@ def setUp(self): self.fetch_list = [conv_out] self.enable_mkldnn = True + +class ConvTransposeMkldnnFusePassDialtionsGroupsTest(InferencePassTest): + def setUp(self): + with fluid.program_guard(self.main_program, self.startup_program): + data = fluid.data(name="data", shape=[-1, 3, 5, 5], dtype="float32") + param_attr = fluid.ParamAttr( + initializer=fluid.initializer.Xavier(uniform=False), + learning_rate=0.001) + conv_out = fluid.layers.conv2d_transpose( + input=data, + num_filters=3, + filter_size=3, + padding="SAME", + dilation=1, + bias_attr=param_attr, + use_cudnn=False) + + self.feeds = {"data": np.random.random((1, 3, 5, 5)).astype("float32")} + self.fetch_list = [conv_out] + self.enable_mkldnn = True + def test_check_output(self): use_gpu = False self.check_output_with_option(use_gpu) + self.assertTrue( + PassVersionChecker.IsCompatible( + "conv_transpose_bias_mkldnn_fuse_pass")) if __name__ == "__main__": From 44a476c2abd23a317c11dc898be39a094d272b46 Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 4 Nov 2020 20:38:29 +0800 Subject: [PATCH 115/185] support cuda pinned place (#28416) --- paddle/fluid/imperative/amp_auto_cast.cc | 16 ++-- .../test_imperative_auto_mixed_precision.py | 74 ++++++++++++++----- 2 files changed, 65 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/imperative/amp_auto_cast.cc b/paddle/fluid/imperative/amp_auto_cast.cc index d4a1519b07e8c..d0f3efcdf67f6 100644 --- a/paddle/fluid/imperative/amp_auto_cast.cc +++ b/paddle/fluid/imperative/amp_auto_cast.cc @@ -49,15 +49,15 @@ inline std::string GetDtypeStr( } inline bool NeedCast(const std::shared_ptr& var) { - if (!platform::is_gpu_place(var->Place())) { - return false; - } - if (var->DataType() == framework::proto::VarType::FP32 || - var->DataType() == framework::proto::VarType::FP16) { - return true; - } else { - return false; + if (platform::is_gpu_place(var->Place()) || + platform::is_cuda_pinned_place(var->Place())) { + // CudaPinndePlace is added for varbase created by dataloader + if (var->DataType() == framework::proto::VarType::FP32 || + var->DataType() == framework::proto::VarType::FP16) { + return true; + } } + return false; } // NOTE: Trace a cast op, so if 
a var is casted from fp32 to fp16, then the grad diff --git a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py index 2d1d2949a4eb2..0118f3c800b6f 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_auto_mixed_precision.py @@ -196,15 +196,27 @@ def test_nan_inf(self): np.array_equal(param.numpy(), params_init[param.name])) +def reader_decorator(reader): + def __reader__(): + for item in reader(): + img = np.array(item[0]).astype('float32').reshape(3, 224, 224) + label = np.array(item[1]).astype('int64').reshape(1) + yield img, label + + return __reader__ + + class TestResnet2(unittest.TestCase): - def train_resnet(self, enable_amp=True): + """ + Use paddle-2.0 API + """ + + def train_resnet(self, enable_amp=True, use_data_loader=False): seed = 90 batch_size = train_parameters["batch_size"] batch_num = 1 - paddle.disable_static() - paddle.seed(seed) paddle.framework.random._manual_program_seed(seed) @@ -223,18 +235,35 @@ def train_resnet(self, enable_amp=True): scaler = paddle.amp.GradScaler( enable=enable_amp, init_loss_scaling=2.**10) + if use_data_loader: + train_reader = paddle.batch( + reader_decorator(paddle.dataset.flowers.train(use_xmap=False)), + batch_size=batch_size, + drop_last=True) + train_loader = fluid.io.DataLoader.from_generator( + capacity=4, + use_double_buffer=True, + iterable=True, + return_list=True) + train_loader.set_sample_list_generator(train_reader) + train_reader = train_loader + for batch_id, data in enumerate(train_reader()): if batch_id >= batch_num: break - dy_x_data = np.array( - [x[0].reshape(3, 224, 224) for x in data]).astype('float32') - if len(np.array([x[1] - for x in data]).astype('int64')) != batch_size: - continue - y_data = np.array([x[1] for x in data]).astype('int64').reshape(-1, - 1) - img = paddle.to_tensor(dy_x_data) - label = paddle.to_tensor(y_data) + if use_data_loader: + img, label = data + else: + dy_x_data = np.array( + [x[0].reshape(3, 224, 224) for x in data]).astype('float32') + if len(np.array([x[1] + for x in data]).astype('int64')) != batch_size: + continue + y_data = np.array([x[1] for x in data]).astype('int64').reshape( + -1, 1) + + img = paddle.to_tensor(dy_x_data) + label = paddle.to_tensor(y_data) label.stop_gradient = True with paddle.amp.auto_cast(enable=enable_amp): @@ -262,19 +291,30 @@ def train_resnet(self, enable_amp=True): dy_param_value = {} for param in resnet.parameters(): dy_param_value[param.name] = param.numpy() - - paddle.enable_static() - + if use_data_loader: + train_reader._reset() return dy_out, dy_param_value, dy_grad_value def test_resnet(self): - out_fp32 = self.train_resnet(enable_amp=False) - out_amp = self.train_resnet(enable_amp=True) + with fluid.dygraph.guard(): + out_fp32 = self.train_resnet(enable_amp=False) + out_amp = self.train_resnet(enable_amp=True) + print(out_fp32[0], out_amp[0]) + self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2)) + + def test_with_data_loader(self): + with fluid.dygraph.guard(): + out_fp32 = self.train_resnet(enable_amp=False, use_data_loader=True) + out_amp = self.train_resnet(enable_amp=True, use_data_loader=True) print(out_fp32[0], out_amp[0]) self.assertTrue(np.allclose(out_fp32[0], out_amp[0], atol=1.e-2)) class TestResnet(unittest.TestCase): + """ + Use paddle-1.x API + """ + def train_resnet(self, enable_amp=True): seed = 90 From 
3c95acc3eb4f0aa64c820ea19d533d12d56f3e93 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Wed, 4 Nov 2020 20:48:22 +0800 Subject: [PATCH 116/185] Remove cv2 dependence of normalize (#28361) * rm cv2 dependence of normalize --- python/paddle/vision/transforms/functional_cv2.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/python/paddle/vision/transforms/functional_cv2.py b/python/paddle/vision/transforms/functional_cv2.py index 5c2e8d61bc527..4cc04c39d0bf9 100644 --- a/python/paddle/vision/transforms/functional_cv2.py +++ b/python/paddle/vision/transforms/functional_cv2.py
@@ -495,9 +495,8 @@ def normalize(img, mean, std, data_format='CHW', to_rgb=False): mean = np.float32(np.array(mean).reshape(1, 1, -1)) std = np.float32(np.array(std).reshape(1, 1, -1)) if to_rgb: - cv2 = try_import('cv2') # inplace - cv2.cvtColor(img, cv2.COLOR_BGR2RGB, img) + img = img[..., ::-1] img = (img - mean) / std return img
From 23439b1688875968ea1de51c7e84061da7535768 Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 4 Nov 2020 21:11:46 +0800 Subject: [PATCH 117/185] show cpp stack when catch signal (#28415) --- paddle/fluid/platform/init.cc | 21 +++++++++++++-------- 1 file changed, 13 insertions(+), 8 deletions(-)
diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index ba4520b1388e6..a594044e9bc27 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc
@@ -287,7 +287,7 @@ void SignalHandle(const char *data, int size) { // NOTE2: we only deal with the time info and signal info, // the stack trace will be generated by paddle itself if (StartsWith(data, "*** Aborted at")) { - *signal_msg_dunmer_ptr << " [TimeInfo: " << std::string(data, size - 1) + *signal_msg_dunmer_ptr << "\n [TimeInfo: " << std::string(data, size - 1) << "]\n"; } else if (StartsWith(data, "***")) { std::string signal_info(data, size - 1);
@@ -295,14 +295,19 @@ void SignalHandle(const char *data, int size) { size_t start_pos = signal_info.rfind(useless_substr); signal_info.replace(start_pos, useless_substr.length(), ""); *signal_msg_dunmer_ptr << " [SignalInfo: " << signal_info << "]\n"; + + // NOTE3: Final signal error message print. + // Here does not throw an exception, // otherwise it will cause "terminate called recursively" - auto exp = platform::EnforceNotMet( - platform::errors::Fatal( - "A serious error (%s) is detected by the operating system.", - ParseSignalErrorString(signal_info)), - __FILE__, __LINE__); - std::cout << exp.what() << (*signal_msg_dunmer_ptr).str() << std::endl; + std::ostringstream sout; + sout << platform::GetCurrentTraceBackString(); + sout << "\n----------------------\nError Message " "Summary:\n----------------------\n"; + sout << platform::errors::Fatal( + "`%s` is detected by the operating system.", + ParseSignalErrorString(signal_info)) + .to_string(); + std::cout << sout.str() << (*signal_msg_dunmer_ptr).str() << std::endl; } } catch (...) 
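// (Hedged illustration of the new failure output; the exact signal name and
// banner come from ParseSignalErrorString and the summary printer, so treat
// the wording below as assumed, not verbatim:
//
//   <C++ call stack from platform::GetCurrentTraceBackString()>
//   ----------------------
//   Error Message Summary:
//   ----------------------
//   FatalError: `SegmentFault` is detected by the operating system.
//     [TimeInfo: *** Aborted at ... (unix time) ... ]
//     [SignalInfo: *** SIGSEGV ... ]
// )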
{ // Since the program has already triggered a system error, From a4303496b29a09e48390ae1773bd227d336cc64a Mon Sep 17 00:00:00 2001 From: Qi Li Date: Wed, 4 Nov 2020 21:29:11 +0800 Subject: [PATCH 118/185] [DOC] activation api doc, test=document_fix (#28405) --- python/paddle/nn/__init__.py | 2 +- python/paddle/nn/functional/activation.py | 21 +-------------------- python/paddle/nn/layer/__init__.py | 4 ++-- python/paddle/nn/layer/activation.py | 21 +-------------------- 4 files changed, 5 insertions(+), 43 deletions(-) diff --git a/python/paddle/nn/__init__.py b/python/paddle/nn/__init__.py index e53ba753a9bda..51b2e2072791e 100644 --- a/python/paddle/nn/__init__.py +++ b/python/paddle/nn/__init__.py @@ -61,7 +61,7 @@ from .layer.activation import LeakyReLU #DEFINE_ALIAS from .layer.activation import Sigmoid #DEFINE_ALIAS from .layer.activation import Hardsigmoid #DEFINE_ALIAS -from .layer.activation import LogSigmoid +from .layer.activation import LogSigmoid #DEFINE_ALIAS from .layer.activation import Softmax #DEFINE_ALIAS from .layer.activation import Softplus #DEFINE_ALIAS from .layer.activation import Softshrink #DEFINE_ALIAS diff --git a/python/paddle/nn/functional/activation.py b/python/paddle/nn/functional/activation.py index 0f79aa012ca32..fd86c2e9fa760 100644 --- a/python/paddle/nn/functional/activation.py +++ b/python/paddle/nn/functional/activation.py @@ -81,8 +81,6 @@ def elu(x, alpha=1.0, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([[-1,6],[1,15.6]])) out = F.elu(x, alpha=0.2) # [[-0.12642411 6. ] @@ -135,8 +133,6 @@ def gelu(x, approximate=False, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]])) out1 = F.gelu(x) # [-0.158655 0.345731 0.841345 1.39979] out2 = F.gelu(x, True) # [-0.158808 0.345714 0.841192 1.39957] @@ -237,8 +233,6 @@ def hardtanh(x, min=-1.0, max=1.0, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-1.5, 0.3, 2.5])) out = F.hardtanh(x) # [-1., 0.3, 1.] """ @@ -439,8 +433,6 @@ def prelu(x, weight, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - data = np.array([[[[-2.0, 3.0, -4.0, 5.0], [ 3.0, -4.0, 5.0, -6.0], [-7.0, -8.0, 8.0, 9.0]], @@ -512,8 +504,6 @@ def relu(x, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32')) out = F.relu(x) # [0., 0., 1.] """ @@ -550,8 +540,6 @@ def log_sigmoid(x, name=None): import paddle import paddle.nn.functional as F - paddle.disable_static() - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) out = F.log_sigmoid(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] """ @@ -823,12 +811,7 @@ def softmax(x, axis=-1, dtype=None, name=None): calculations. It should be in range [-D, D), where D is the dimensions of ``x`` . If ``axis`` < 0, it works the same way as :math:`axis + D` . Default is -1. - dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data - type of the output tensor. If dtype is specified, ``x`` is casted - to ``dtype`` before the operation is performed. This is useful for - preventing data type overflows. Supported dtype: float32, float64. - If ``dtype`` is None, the output Tensor has the same dtype as x. - Default is None. + dtype (str, optional): The data type of the output tensor, can be float32, float64. 
name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -843,8 +826,6 @@ def softmax(x, axis=-1, dtype=None, name=None): import paddle.nn.functional as F import numpy as np - paddle.disable_static() - x = np.array([[[2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0], [7.0, 8.0, 8.0, 9.0]], diff --git a/python/paddle/nn/layer/__init__.py b/python/paddle/nn/layer/__init__.py index 801290e99572b..4e68fcab3fda8 100644 --- a/python/paddle/nn/layer/__init__.py +++ b/python/paddle/nn/layer/__init__.py @@ -35,11 +35,11 @@ from .vision import * from .transformer import * -# from .activation import PReLU #DEFINE_ALIAS +from .activation import PReLU #DEFINE_ALIAS from .activation import ReLU #DEFINE_ALIAS from .activation import LeakyReLU #DEFINE_ALIAS from .activation import Sigmoid #DEFINE_ALIAS -# from .activation import Softmax #DEFINE_ALIAS +from .activation import Softmax #DEFINE_ALIAS from .activation import LogSoftmax #DEFINE_ALIAS from .common import BilinearTensorProduct #DEFINE_ALIAS from .common import Bilinear #DEFINE_ALIAS diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index dbb9d00f365cf..b0a1b27855a80 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -72,8 +72,6 @@ class ELU(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([[-1,6],[1,15.6]])) m = paddle.nn.ELU(0.2) out = m(x) @@ -121,8 +119,6 @@ class GELU(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([[-1, 0.5],[1, 1.5]])) m = paddle.nn.GELU() @@ -301,8 +297,6 @@ class Hardtanh(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-1.5, 0.3, 2.5])) m = paddle.nn.Hardtanh() out = m(x) # # [-1., 0.3, 1.] @@ -333,7 +327,7 @@ class PReLU(layers.Layer): Default is 1. init (float, optional): Init value of learnable `weight`. Default is 0.25. weight_attr(ParamAttr, optional): The parameter attribute for the learnable `weight`. - Default is None. For more information, please refer to :ref:`api_fluid_ParamAttr`. + Default is None. For more information, please refer to :ref:`api_paddle_ParamAttr`. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -347,7 +341,6 @@ class PReLU(layers.Layer): import paddle import numpy as np - paddle.disable_static() paddle.set_default_dtype("float64") data = np.array([[[[-2.0, 3.0, -4.0, 5.0], @@ -408,8 +401,6 @@ class ReLU(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-2, 0, 1]).astype('float32')) m = paddle.nn.ReLU() out = m(x) # [0., 0., 1.] @@ -885,8 +876,6 @@ class LogSigmoid(layers.Layer): import paddle - paddle.disable_static() - x = paddle.to_tensor([1.0, 2.0, 3.0, 4.0]) m = paddle.nn.LogSigmoid() out = m(x) # [-0.313262 -0.126928 -0.0485874 -0.0181499] @@ -983,12 +972,6 @@ class Softmax(layers.Layer): calculations. It should be in range [-D, D), where D is the dimensions of ``x`` . If ``axis`` < 0, it works the same way as :math:`axis + D` . Default is -1. - dtype (str|np.dtype|core.VarDesc.VarType, optional): The desired data - type of the output tensor. If dtype is specified, ``x`` is casted - to ``dtype`` before the operation is performed. This is useful for - preventing data type overflows. Supported dtype: float32, float64. 
- If ``dtype`` is None, the output Tensor has the same dtype as x. - Default is None. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. @@ -1002,8 +985,6 @@ class Softmax(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = np.array([[[2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0], [7.0, 8.0, 8.0, 9.0]],
From 463075a82b43a079878a523e7c53b76505029c3d Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Wed, 4 Nov 2020 22:27:19 +0800 Subject: [PATCH 119/185] add paddle.io.ComposeDataset & paddle.io.ChainDataset (#28311) * add paddle.io.ComposeDataset & paddle.io.ChainDataset. test=develop --- python/paddle/fluid/dataloader/dataset.py | 132 +++++++++++++++++- .../test_multiprocess_dataloader_dataset.py | 83 ++++++++++- python/paddle/io/__init__.py | 5 +- 3 files changed, 215 insertions(+), 5 deletions(-)
diff --git a/python/paddle/fluid/dataloader/dataset.py b/python/paddle/fluid/dataloader/dataset.py index 13bb946a5ebca..2269a98c4d976 100644 --- a/python/paddle/fluid/dataloader/dataset.py +++ b/python/paddle/fluid/dataloader/dataset.py
@@ -17,7 +17,10 @@ from .. import framework import paddle.dataset.common -__all__ = ["Dataset", "IterableDataset", "TensorDataset"] +__all__ = [ + "Dataset", "IterableDataset", "TensorDataset", "ComposeDataset", + "ChainDataset" +] class Dataset(object):
@@ -275,3 +278,130 @@ def __getitem__(self, index): def __len__(self): return self.tensors[0].shape[0] + + +def to_list(value): + if value is None: + return value + if isinstance(value, (list, tuple)): + return list(value) + return [value] + + +class ComposeDataset(Dataset): + """ + A Dataset which composes fields of multiple datasets. + + This dataset is used for composing fields of multiple map-style + datasets of the same length. + + Args: + datasets(list of Dataset): List of datasets to be composed. + + Returns: + Dataset: A Dataset which composes fields of multiple datasets. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + from paddle.io import Dataset, ComposeDataset + + + # define a random dataset + class RandomDataset(Dataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __getitem__(self, idx): + image = np.random.random([32]).astype('float32') + label = np.random.randint(0, 9, (1, )).astype('int64') + return image, label + + def __len__(self): + return self.num_samples + + dataset = ComposeDataset([RandomDataset(10), RandomDataset(10)]) + for i in range(len(dataset)): + image1, label1, image2, label2 = dataset[i] + print(image1) + print(label1) + print(image2) + print(label2) + + """ + + def __init__(self, datasets): + self.datasets = list(datasets) + assert len(self.datasets) > 0, "input datasets should not be empty" + for i, dataset in enumerate(self.datasets): + assert isinstance(dataset, Dataset), \ + "each input dataset should be paddle.io.Dataset" + assert not isinstance(dataset, IterableDataset), \ + "paddle.io.IterableDataset not supported" + if i > 0: + assert len(dataset) == len(self.datasets[i-1]), \ + "lengths of datasets should be the same" + + def __len__(self): + return len(self.datasets[0]) + + def __getitem__(self, idx): + sample = [] + for dataset in self.datasets: + sample.extend(to_list(dataset[idx])) + return tuple(sample) + + +class ChainDataset(IterableDataset): + """ + A Dataset which chains multiple iterable-style datasets.
+ + This dataset is used for assembling multiple datasets which should + be :code:`paddle.io.IterableDataset`. + + Args: + datasets(list of Dataset): List of datasets to be chained. + + Returns: + Dataset: A Dataset which chains the given datasets. + + Examples: + + .. code-block:: python + + import numpy as np + import paddle + from paddle.io import IterableDataset, ChainDataset + + + # define a random dataset + class RandomDataset(IterableDataset): + def __init__(self, num_samples): + self.num_samples = num_samples + + def __iter__(self): + for i in range(10): + image = np.random.random([32]).astype('float32') + label = np.random.randint(0, 9, (1, )).astype('int64') + yield image, label + + dataset = ChainDataset([RandomDataset(10), RandomDataset(10)]) + for image, label in iter(dataset): + print(image, label) + + """ + + def __init__(self, datasets): + self.datasets = list(datasets) + assert len(self.datasets) > 0, "input datasets should not be empty" + for i, dataset in enumerate(self.datasets): + assert isinstance(dataset, IterableDataset), \ + "ChainDataset only supports paddle.io.IterableDataset" + + def __iter__(self): + for dataset in self.datasets: + for sample in dataset: + yield sample
diff --git a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py index 6e2f9562b453b..496e5320d4ce6 100644 --- a/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_multiprocess_dataloader_dataset.py
@@ -19,9 +19,38 @@ import paddle import paddle.fluid as fluid -from paddle.io import TensorDataset, DataLoader +from paddle.io import Dataset, IterableDataset, TensorDataset, \ + ComposeDataset, ChainDataset, DataLoader from paddle.fluid.dygraph.base import to_variable +IMAGE_SIZE = 32 + + +class RandomDataset(Dataset): + def __init__(self, sample_num): + self.sample_num = sample_num + + def __len__(self): + return self.sample_num + + def __getitem__(self, idx): + np.random.seed(idx) + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, 9, (1, )).astype('int64') + return image, label + + +class RandomIterableDataset(IterableDataset): + def __init__(self, sample_num): + self.sample_num = sample_num + + def __iter__(self): + for i in range(self.sample_num): + np.random.seed(i) + image = np.random.random([IMAGE_SIZE]).astype('float32') + label = np.random.randint(0, 9, (1, )).astype('int64') + yield image, label + class TestTensorDataset(unittest.TestCase): def run_main(self, num_workers, places):
@@ -55,8 +84,56 @@ def run_main(self, num_workers, places): def test_main(self): for p in [fluid.CPUPlace(), fluid.CUDAPlace(0)]: - for num_workers in [0, 2]: - ret = self.run_main(num_workers=num_workers, places=p) + self.run_main(num_workers=0, places=p) + + +class TestComposeDataset(unittest.TestCase): + def test_main(self): + fluid.default_startup_program().random_seed = 1 + fluid.default_main_program().random_seed = 1 + + dataset1 = RandomDataset(10) + dataset2 = RandomDataset(10) + dataset = ComposeDataset([dataset1, dataset2]) + assert len(dataset) == 10 + + for i in range(len(dataset)): + input1, label1, input2, label2 = dataset[i] + input1_t, label1_t = dataset1[i] + input2_t, label2_t = dataset2[i] + assert np.allclose(input1, input1_t) + assert np.allclose(label1, label1_t) + assert np.allclose(input2, input2_t) + assert np.allclose(label2, label2_t) + + +class 
TestChainDataset(unittest.TestCase): + def run_main(self, num_workers, places): + fluid.default_startup_program().random_seed = 1 + fluid.default_main_program().random_seed = 1 + + dataset1 = RandomIterableDataset(10) + dataset2 = RandomIterableDataset(10) + dataset = ChainDataset([dataset1, dataset2]) + + samples = [] + for data in iter(dataset): + samples.append(data) + assert len(samples) == 20 + + idx = 0 + for image, label in iter(dataset1): + assert np.allclose(image, samples[idx][0]) + assert np.allclose(label, samples[idx][1]) + idx += 1 + for image, label in iter(dataset2): + assert np.allclose(image, samples[idx][0]) + assert np.allclose(label, samples[idx][1]) + idx += 1 + + def test_main(self): + for p in [fluid.CPUPlace(), fluid.CUDAPlace(0)]: + self.run_main(num_workers=0, places=p) if __name__ == '__main__': diff --git a/python/paddle/io/__init__.py b/python/paddle/io/__init__.py index 92dd819b3cd5e..b4e437a97dd22 100644 --- a/python/paddle/io/__init__.py +++ b/python/paddle/io/__init__.py @@ -17,6 +17,8 @@ 'Dataset', 'IterableDataset', 'TensorDataset', + 'ComposeDataset', + 'ChainDataset', 'BatchSampler', 'DistributedBatchSampler', # 'Transform', @@ -29,4 +31,5 @@ from ..fluid.io import DataLoader from ..fluid.dataloader import Dataset, IterableDataset, BatchSampler, get_worker_info, \ - TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler + TensorDataset, Sampler, SequenceSampler, RandomSampler, DistributedBatchSampler, \ + ComposeDataset, ChainDataset From b89b4e321db2deecf00174d14141c853638219b5 Mon Sep 17 00:00:00 2001 From: LielinJiang <50691816+LielinJiang@users.noreply.github.com> Date: Wed, 4 Nov 2020 22:28:05 +0800 Subject: [PATCH 120/185] add fashion dataset (#28411) --- python/paddle/tests/test_datasets.py | 46 +++++++++++++++ python/paddle/vision/datasets/cifar.py | 2 +- python/paddle/vision/datasets/mnist.py | 79 ++++++++++++++++++++------ 3 files changed, 109 insertions(+), 18 deletions(-) diff --git a/python/paddle/tests/test_datasets.py b/python/paddle/tests/test_datasets.py index 1dc651f916c42..d119d2c5ccad6 100644 --- a/python/paddle/tests/test_datasets.py +++ b/python/paddle/tests/test_datasets.py @@ -134,6 +134,52 @@ def test_main(self): mnist = MNIST(mode='train', transform=transform, backend=1) +class TestFASHIONMNISTTest(unittest.TestCase): + def test_main(self): + transform = T.Transpose() + mnist = FashionMNIST(mode='test', transform=transform) + self.assertTrue(len(mnist) == 10000) + + for i in range(len(mnist)): + image, label = mnist[i] + self.assertTrue(image.shape[0] == 1) + self.assertTrue(image.shape[1] == 28) + self.assertTrue(image.shape[2] == 28) + self.assertTrue(label.shape[0] == 1) + self.assertTrue(0 <= int(label) <= 9) + + +class TestFASHIONMNISTTrain(unittest.TestCase): + def test_main(self): + transform = T.Transpose() + mnist = FashionMNIST(mode='train', transform=transform) + self.assertTrue(len(mnist) == 60000) + + for i in range(len(mnist)): + image, label = mnist[i] + self.assertTrue(image.shape[0] == 1) + self.assertTrue(image.shape[1] == 28) + self.assertTrue(image.shape[2] == 28) + self.assertTrue(label.shape[0] == 1) + self.assertTrue(0 <= int(label) <= 9) + + # test cv2 backend + mnist = FashionMNIST(mode='train', transform=transform, backend='cv2') + self.assertTrue(len(mnist) == 60000) + + for i in range(len(mnist)): + image, label = mnist[i] + self.assertTrue(image.shape[0] == 1) + self.assertTrue(image.shape[1] == 28) + self.assertTrue(image.shape[2] == 28) + self.assertTrue(label.shape[0] == 
1) + self.assertTrue(0 <= int(label) <= 9) + break + + with self.assertRaises(ValueError): + mnist = FashionMNIST(mode='train', transform=transform, backend=1) + + class TestFlowersTrain(unittest.TestCase): def test_main(self): flowers = Flowers(mode='train') diff --git a/python/paddle/vision/datasets/cifar.py b/python/paddle/vision/datasets/cifar.py index 671632f871bac..7a766828d84d0 100644 --- a/python/paddle/vision/datasets/cifar.py +++ b/python/paddle/vision/datasets/cifar.py @@ -161,7 +161,7 @@ def __getitem__(self, idx): image = image.transpose([1, 2, 0]) if self.backend == 'pil': - image = Image.fromarray(image) + image = Image.fromarray(image.astype('uint8')) if self.transform is not None: image = self.transform(image) diff --git a/python/paddle/vision/datasets/mnist.py b/python/paddle/vision/datasets/mnist.py index c8bb6b3ca848d..3d752ece346b7 100644 --- a/python/paddle/vision/datasets/mnist.py +++ b/python/paddle/vision/datasets/mnist.py @@ -24,17 +24,7 @@ from paddle.io import Dataset from paddle.dataset.common import _check_exists_and_download -__all__ = ["MNIST"] - -URL_PREFIX = 'https://dataset.bj.bcebos.com/mnist/' -TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' -TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3' -TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz' -TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c' -TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz' -TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' -TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz' -TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' +__all__ = ["MNIST", "FashionMNIST"] class MNIST(Dataset): @@ -70,6 +60,16 @@ class MNIST(Dataset): print(sample[0].size, sample[1]) """ + NAME = 'mnist' + URL_PREFIX = 'https://dataset.bj.bcebos.com/mnist/' + TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' + TEST_IMAGE_MD5 = '9fb629c4189551a2d022fa330f9573f3' + TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz' + TEST_LABEL_MD5 = 'ec29112dd5afa0611ce80d1b7f02629c' + TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz' + TRAIN_IMAGE_MD5 = 'f68b3c2dcbeaaa9fbdd348bbdeb94873' + TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz' + TRAIN_LABEL_MD5 = 'd53e105ee54ea40749a09fcbcd1e9432' def __init__(self, image_path=None, @@ -93,18 +93,18 @@ def __init__(self, self.image_path = image_path if self.image_path is None: assert download, "image_path is not set and downloading automatically is disabled" - image_url = TRAIN_IMAGE_URL if mode == 'train' else TEST_IMAGE_URL - image_md5 = TRAIN_IMAGE_MD5 if mode == 'train' else TEST_IMAGE_MD5 + image_url = self.TRAIN_IMAGE_URL if mode == 'train' else self.TEST_IMAGE_URL + image_md5 = self.TRAIN_IMAGE_MD5 if mode == 'train' else self.TEST_IMAGE_MD5 self.image_path = _check_exists_and_download( - image_path, image_url, image_md5, 'mnist', download) + image_path, image_url, image_md5, self.NAME, download) self.label_path = label_path if self.label_path is None: assert download, "label_path is not set and downloading automatically is disabled" - label_url = TRAIN_LABEL_URL if self.mode == 'train' else TEST_LABEL_URL - label_md5 = TRAIN_LABEL_MD5 if self.mode == 'train' else TEST_LABEL_MD5 + label_url = self.TRAIN_LABEL_URL if self.mode == 'train' else self.TEST_LABEL_URL + label_md5 = self.TRAIN_LABEL_MD5 if self.mode == 'train' else self.TEST_LABEL_MD5 self.label_path = _check_exists_and_download( - label_path, label_url, label_md5, 'mnist', download) + label_path, label_url, label_md5, self.NAME, download) 
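        # A hedged note: the download plumbing above resolves URLs and MD5
        # checksums via `self`, so the FashionMNIST subclass defined below can
        # reuse this __init__ untouched and only override the NAME/URL/MD5
        # class attributes. Typical usage (sketch):
        #
        #     from paddle.vision.datasets import FashionMNIST
        #     test_set = FashionMNIST(mode='test')  # downloads on first use
        #     image, label = test_set[0]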
self.transform = transform @@ -175,3 +175,48 @@ def __getitem__(self, idx): def __len__(self): return len(self.labels) + + +class FashionMNIST(MNIST): + """ + Implementation of the `Fashion-MNIST <https://github.com/zalandoresearch/fashion-mnist>`_ dataset. + + Args: + image_path(str): path to image file, can be set None if + :attr:`download` is True. Default None + label_path(str): path to label file, can be set None if + :attr:`download` is True. Default None + mode(str): 'train' or 'test' mode. Default 'train'. + download(bool): whether to download dataset automatically if + :attr:`image_path`/:attr:`label_path` is not set. Default True + backend(str, optional): Specifies which type of image to be returned: + PIL.Image or numpy.ndarray. Should be one of {'pil', 'cv2'}. + If this option is not set, will get backend from ``paddle.vision.get_image_backend`` , + default backend is 'pil'. Default: None. + + Returns: + Dataset: Fashion-MNIST Dataset. + + Examples: + + .. code-block:: python + + from paddle.vision.datasets import FashionMNIST + + mnist = FashionMNIST(mode='test') + + for i in range(len(mnist)): + sample = mnist[i] + print(sample[0].size, sample[1]) + """ + + NAME = 'fashion-mnist' + URL_PREFIX = 'https://dataset.bj.bcebos.com/fashion_mnist/' + TEST_IMAGE_URL = URL_PREFIX + 't10k-images-idx3-ubyte.gz' + TEST_IMAGE_MD5 = 'bef4ecab320f06d8554ea6380940ec79' + TEST_LABEL_URL = URL_PREFIX + 't10k-labels-idx1-ubyte.gz' + TEST_LABEL_MD5 = 'bb300cfdad3c16e7a12a480ee83cd310' + TRAIN_IMAGE_URL = URL_PREFIX + 'train-images-idx3-ubyte.gz' + TRAIN_IMAGE_MD5 = '8d4fb7e6c68d591d4c3dfef9ec88bf0d' + TRAIN_LABEL_URL = URL_PREFIX + 'train-labels-idx1-ubyte.gz' + TRAIN_LABEL_MD5 = '25c81989df183df01b3e8a0aad5dffbe'
From 648b92c0930f6bd5ee3cf50dd6f9d8e4b4a0ac06 Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 4 Nov 2020 08:29:40 -0600 Subject: [PATCH 121/185] [sw] Update compile error for sw (#28419) --- cmake/cblas.cmake | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake index 75bb8bdda2180..6056b53bc2218 100644 --- a/cmake/cblas.cmake +++ b/cmake/cblas.cmake
@@ -102,7 +102,7 @@ if(NOT DEFINED CBLAS_PROVIDER AND WITH_SYSTEM_BLAS) find_library(REFERENCE_CBLAS_LIBRARY NAMES cblas PATHS ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) find_library(REFERENCE_BLAS_LIBRARY NAMES blas PATHS - ${REFERENCE_BLAS_LIB_SEARCH_PATHS}) + ${REFERENCE_CBLAS_LIB_SEARCH_PATHS}) if(REFERENCE_CBLAS_INCLUDE_DIR AND REFERENCE_CBLAS_LIBRARY) set(CBLAS_PROVIDER REFERENCE_CBLAS)
@@ -127,9 +127,9 @@ endif() # linear algebra libraries for cc_library(xxx SRCS xxx.c DEPS cblas) include_directories(${CBLAS_INC_DIR}) -if(NOT ${CBLAS_PROVIDER} STREQUAL MKLML) - target_link_libraries(cblas ${CBLAS_LIBRARIES}) -elseif(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS) +if(${CBLAS_PROVIDER} STREQUAL REFERENCE_CBLAS) target_link_libraries(cblas gfortran ${CBLAS_LIBRARIES} ${REFERENCE_BLAS_LIBRARY}) +elseif(NOT ${CBLAS_PROVIDER} STREQUAL MKLML) + target_link_libraries(cblas ${CBLAS_LIBRARIES}) endif()
From ca4154147290f4cc30c4585572325c72d0ad8d91 Mon Sep 17 00:00:00 2001 From: Jacek Czaja Date: Thu, 5 Nov 2020 03:05:06 +0100 Subject: [PATCH 122/185] [oneDNN]Sum bf16 kernel (#28382) * - Added sum bf16 oneDNN test=develop * - Fix to UT of sum bf16 test=develop --- .../framework/ir/graph_pattern_detector.cc | 2 +- .../cpu_bfloat16_placement_pass_tester.cc | 12 ++-- .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 33 +++++------ paddle/fluid/operators/sum_op.cc | 16 +++-- .../mkldnn/test_sum_bf16_mkldnn_op.py | 59 +++++++++++++++++++
tools/static_mode_white_list.py | 1 + 6 files changed, 95 insertions(+), 28 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 20da74eca4ef8..4f1080952a11e 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2102,7 +2102,7 @@ PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { std::unordered_set supported_op_types = std::unordered_set( - {"concat", "conv2d", "fusion_gru", "reshape2", "transpose2"}); + {"concat", "conv2d", "fusion_gru", "reshape2", "transpose2", "sum"}); if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc index 146e29249b7c6..4ca9724026a9c 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc @@ -44,6 +44,8 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, op->SetInput("X", {inputs[0]}); } else if (type == "reshape2") { op->SetInput("X", {inputs[0]}); + } else if (type == "sum") { + op->SetInput("X", {inputs[0], inputs[1]}); } else { FAIL() << "Unexpected operator type."; } @@ -61,8 +63,9 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, ProgramDesc BuildProgramDesc() { ProgramDesc prog; - for (auto& v : std::vector( - {"a", "b", "c", "f", "g", "h", "k", "l", "m", "n", "o", "p"})) { + for (auto& v : + std::vector({"a", "b", "c", "f", "g", "h", "k", "l", "m", + "n", "o", "p", "r", "s"})) { prog.MutableBlock(0)->Var(v); } @@ -75,6 +78,7 @@ ProgramDesc BuildProgramDesc() { SetOp(&prog, "concat", "concat2", {"l", "m"}, {"n"}); SetOp(&prog, "transpose2", "transpose", {"n"}, {"o"}); SetOp(&prog, "reshape2", "reshape", {"o"}, {"p"}); + SetOp(&prog, "sum", "sum", {"p", "r"}, {"s"}); return prog; } @@ -122,7 +126,7 @@ void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) { } TEST(Bfloat16PlacementPass, enable_all) { - MainTest({"conv2d", "pool2d", "relu", "concat"}, 7); + MainTest({"conv2d", "pool2d", "relu", "concat", "sum"}, 8); } TEST(Bfloat16PlacementPass, enabled_conv_and_pool) { @@ -130,7 +134,7 @@ TEST(Bfloat16PlacementPass, enabled_conv_and_pool) { MainTest({"conv2d", "pool2d"}, 3); } -TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(5); } +TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(6); } } // namespace ir } // namespace framework diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 3d3738d922f77..4df7818072f05 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -40,13 +40,6 @@ class MKLDNNDeviceContext; namespace paddle { namespace operators { -using framework::DataLayout; -using mkldnn::memory; -using mkldnn::primitive; -using mkldnn::reorder; -using mkldnn::stream; -using mkldnn::sum; -using paddle::framework::Tensor; using paddle::platform::CPUDeviceContext; using paddle::platform::MKLDNNDeviceContext; using platform::to_void_cast; @@ -71,21 +64,21 @@ class SumMKLDNNHandler : public platform::MKLDNNHandlerT { auto dst_tz = 
framework::vectorize(z->dims()); auto src_tz = dst_tz; - std::vector srcs_md; + std::vector srcs_md; for (size_t i = 0; i < in_vars.size(); i++) { auto& input_it = in_vars[i]->Get(); if (input_it.numel() == 0) { continue; } MKLDNNMemoryFormat input_format = input_it.format(); - srcs_md.push_back(memory::desc(src_tz, platform::MKLDNNGetDataType(), - input_format)); + srcs_md.push_back(mkldnn::memory::desc( + src_tz, platform::MKLDNNGetDataType(), input_format)); ++num_inputs_; } std::vector scales(num_inputs_, 1.0); - auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType(), - MKLDNNMemoryFormat::any); + auto dst_md = mkldnn::memory::desc( + dst_tz, platform::MKLDNNGetDataType(), MKLDNNMemoryFormat::any); this->AcquireForwardPrimitiveDescriptor(dst_md, scales, srcs_md); } @@ -94,15 +87,15 @@ class SumMKLDNNHandler : public platform::MKLDNNHandlerT { // (jczaja) sum oneDNN prim is not having .desc attribute so // we cannot use base AcquireForwardPrimitiveDescriptor void AcquireForwardPrimitiveDescriptor( - const memory::desc& dst_md, const std::vector& scales, - const std::vector& srcs_md) { + const mkldnn::memory::desc& dst_md, const std::vector& scales, + const std::vector& srcs_md) { // Sum op does not have backward so no passing from FWD to BWD is needed const std::string key_pd = this->key_ + "@fwd_pd"; this->fwd_pd_ = std::static_pointer_cast( this->dev_ctx_.GetBlob(key_pd)); if (this->fwd_pd_ == nullptr) { - this->fwd_pd_.reset(new mkldnn::sum::primitive_desc( - dst_md, scales, srcs_md, this->engine_)); + this->fwd_pd_.reset(new dnnl::sum::primitive_desc(dst_md, scales, srcs_md, + this->engine_)); this->dev_ctx_.SetBlob(key_pd, this->fwd_pd_); } } @@ -178,7 +171,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { auto sum_p = handler.AcquireForwardPrimitive(); - std::unordered_map args; + std::unordered_map args; for (size_t i = 0; i < srcs_mem.size(); ++i) { args.insert({MKLDNN_ARG_MULTIPLE_SRC + i, *(srcs_mem[i])}); } @@ -215,5 +208,7 @@ class SumMKLDNNOpKernel : public paddle::framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_KERNEL(sum, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::SumMKLDNNOpKernel); +REGISTER_OP_KERNEL( + sum, MKLDNN, ::paddle::platform::CPUPlace, + paddle::operators::SumMKLDNNOpKernel, + paddle::operators::SumMKLDNNOpKernel); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 52c4c63b473c4..faade79091c4a 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -148,16 +148,19 @@ class SumOp : public framework::OperatorWithKernel { #ifdef PADDLE_WITH_MKLDNN if (library == framework::LibraryType::kPlain && platform::CanMKLDNNBeUsed(ctx) && - static_cast(dtype) == - framework::proto::VarType::FP32 && + (static_cast(dtype) == + framework::proto::VarType::FP32 || + static_cast(dtype) == + framework::proto::VarType::BF16) && ctx.OutputVar("Out")->IsType()) { if (std::all_of(x_vars.begin(), x_vars.end(), [](const framework::Variable* v) { return v->IsType(); })) { return framework::OpKernelType( - framework::proto::VarType::FP32, ctx.GetPlace(), - framework::DataLayout::kMKLDNN, framework::LibraryType::kMKLDNN); + static_cast(dtype), + ctx.GetPlace(), framework::DataLayout::kMKLDNN, + framework::LibraryType::kMKLDNN); } } #endif @@ -215,6 +218,11 @@ class SumOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr( + "mkldnn_data_type", 
+ "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); AddComment(R"DOC(This OP is used to sum one or more Tensor or LoDTensor of the input. If the input is LoDTensor, the output only shares LoD information with the first input.)DOC"); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py new file mode 100644 index 0000000000000..05d739ae1f3f3 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_sum_bf16_mkldnn_op.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.test_sum_op import TestSumOp +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 +from paddle import enable_static +import numpy as np +import paddle.fluid.op as fluid_op + + +@unittest.skipIf(not core.supports_bfloat16(), + "place does not support BF16 evaluation") +class TestSumMKLDNN(TestSumOp): + def setUp(self): + self.op_type = "sum" + self.use_mkldnn = True + self.mkldnn_data_type = "bfloat16" + + # float32 input to be use for reference + x0 = np.random.random((25, 8)).astype('float32') + x1 = np.random.random((25, 8)).astype('float32') + x2 = np.random.random((25, 8)).astype('float32') + + # actual input (bf16) to bf16 sum op + x0_bf16 = convert_float_to_uint16(x0) + x1_bf16 = convert_float_to_uint16(x1) + x2_bf16 = convert_float_to_uint16(x2) + + self.inputs = {"X": [("x0", x0_bf16), ("x1", x1_bf16), ("x2", x2_bf16)]} + + y = x0 + x1 + x2 + self.outputs = {'Out': convert_float_to_uint16(y)} + self.attrs = {'use_mkldnn': self.use_mkldnn} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + pass + + +if __name__ == '__main__': + enable_static() + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index be11663719441..77e7372290d9c 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -602,6 +602,7 @@ 'test_requantize_mkldnn_op', 'test_softmax_mkldnn_op', 'test_sum_mkldnn_op', + 'test_sum_bf16_mkldnn_op', 'test_transpose_int8_mkldnn_op', 'test_transpose_mkldnn_op', 'test_mkldnn_conv_activation_fuse_pass', From 2500dca87843177ab401f295323f9ff7307eed61 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Thu, 5 Nov 2020 10:05:46 +0800 Subject: [PATCH 123/185] [Dy2Stat] Fix bug in convert_call (#28368) * Fix bug in convert_call * refine unittest * refine code * refine code * fix unittest failed * add assert --- .../dygraph_to_static/convert_call_func.py | 10 +++--- .../unittests/dygraph_to_static/test_lstm.py | 32 +++++++++++++++++-- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py 
b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index 57c36e80fda88..9654a23852024 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py
@@ -142,7 +142,7 @@ def dyfunc(x): # Note(Aurelius84): Because `@declarative` returns a class instance instead of # a function. This will modify the value referring to itself in `__globals__`. - # For example: + # For example: # # @declarative # def foo(x): # # `foo` will be converted into a wrapper class, call it `StaticFunction`. # And `foo.__globals__['foo']` will still return this `StaticFunction` instead of - # `foo` function. So `isinstance(fn, StaticFunction)` is added here. + # `foo` function. So `isinstance(fn, StaticFunction)` is added here. global_functions = set() for fn in func.__globals__.values(): if inspect.isfunction(fn):
@@ -193,8 +193,10 @@ def dyfunc(x): try: _, forward_func = unwrap_decorators(func.forward) forward_func = convert_to_static(forward_func) - setattr(func, 'forward', forward_func) - func_self = func + # A bound method will be converted into a plain function by `convert_to_static`. + # So the descriptor mechanism is used to bind the `self` instance to the + # function, keeping it a bound method. + setattr(func, 'forward', forward_func.__get__(func)) except Exception: # NOTE: func.forward may have been decorated. func_self = None if func_self else func_self
diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py index 279c44d3245ea..cfb4bb69a2ea5 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_lstm.py
@@ -18,14 +18,24 @@ from paddle import nn +class LSTMLayer(nn.Layer): + def __init__(self, in_channels, hidden_size): + super(LSTMLayer, self).__init__() + self.cell = nn.LSTM( + in_channels, hidden_size, direction='bidirectional', num_layers=2) + + def forward(self, x): + x, _ = self.cell(x) + return x + + class Net(nn.Layer): def __init__(self, in_channels, hidden_size): super(Net, self).__init__() - self.lstm = nn.LSTM( - in_channels, hidden_size, direction='bidirectional', num_layers=2) + self.lstm = LSTMLayer(in_channels, hidden_size) def forward(self, x): - x, _ = self.lstm(x) + x = self.lstm(x) return x
@@ -115,5 +125,21 @@ def test_save_in_eval(self): infer_out)) +class TestEvalAfterSave(unittest.TestCase): + def test_eval_after_save(self): + x = paddle.randn((2, 10, 12)).astype('float32') + net = Net(12, 2) + dy_out = net(x) + # save model + paddle.jit.save(net, 'jit.save/lstm', input_spec=[x]) + load_net = paddle.jit.load('jit.save/lstm') + load_out = load_net(x) + self.assertTrue(np.allclose(dy_out.numpy(), load_out.numpy())) + # eval + net.eval() + out = net(x) + self.assertTrue(np.allclose(dy_out.numpy(), out.numpy())) + + if __name__ == "__main__": unittest.main()
From c41fd033e5b4efd93a1ff738f7ee029a65075b50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E7=9F=B3=E6=99=93=E4=BC=9F?= <39303645+Shixiaowei02@users.noreply.github.com> Date: Thu, 5 Nov 2020 13:06:58 +0800 Subject: [PATCH 124/185] check op_version_registry in CI test, test=develop (#28402) --- paddle/fluid/framework/op_version_registry.h | 7 +- .../detection/distribute_fpn_proposals_op.cc | 2 +- paddle/fluid/pybind/compatible.cc | 3 +- tools/check_op_desc.py | 142 +++++++++++++++--- 4 files changed, 129 insertions(+), 25 
deletions(-)
diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index 5822dfa11dd25..c9d3084724bcd 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h
@@ -92,7 +92,7 @@ enum class OpUpdateType { class OpUpdateBase { public: - virtual const OpUpdateInfo* info() const = 0; + virtual const OpUpdateInfo& info() const = 0; virtual OpUpdateType type() const = 0; virtual ~OpUpdateBase() = default; };
@@ -101,7 +101,7 @@ template <typename InfoType, OpUpdateType type__> class OpUpdate : public OpUpdateBase { public: explicit OpUpdate(const InfoType& info) : info_{info}, type_{type__} {} - const OpUpdateInfo* info() const override { return &info_; } + const InfoType& info() const override { return info_; } OpUpdateType type() const override { return type_; } private:
@@ -169,7 +169,6 @@ class OpVersion { class OpVersionRegistrar { public: - OpVersionRegistrar() = default; static OpVersionRegistrar& GetInstance() { static OpVersionRegistrar instance; return instance; }
@@ -185,6 +184,8 @@ class OpVersionRegistrar { private: std::unordered_map<std::string, OpVersion> op_version_map_; + OpVersionRegistrar() = default; + OpVersionRegistrar& operator=(const OpVersionRegistrar&) = delete; }; inline const std::unordered_map<std::string, OpVersion>& get_op_version_map() {
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc index 614b37e703e72..b0c9d968e47b7 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cc
@@ -130,7 +130,7 @@ REGISTER_OP_VERSION(distribute_fpn_proposals) Upgrade distribute_fpn_proposals, adding a new input [RoisNum] and a new output [MultiLevelRoisNum].)ROC", paddle::framework::compatible::OpVersionDesc() - .NewInput("RoIsNum", "The number of RoIs in each image.") + .NewInput("RoisNum", "The number of RoIs in each image.") .NewOutput("MultiLevelRoisNum", "The RoIs' number of each image on multiple " "levels. The number on each level has the shape of (B),"
diff --git a/paddle/fluid/pybind/compatible.cc b/paddle/fluid/pybind/compatible.cc index 57b024c25cbaf..cfe87a86cf0e5 100644 --- a/paddle/fluid/pybind/compatible.cc +++ b/paddle/fluid/pybind/compatible.cc
@@ -95,8 +95,7 @@ void BindOpUpdateType(py::module *m) { void BindOpUpdateBase(py::module *m) { py::class_<OpUpdateBase>(*m, "OpUpdateBase") - .def("info", [](const OpUpdateBase &obj) { return obj.info(); }, - py::return_value_policy::reference) + .def("info", &OpUpdateBase::info, py::return_value_policy::reference) .def("type", &OpUpdateBase::type); }
diff --git a/tools/check_op_desc.py b/tools/check_op_desc.py index 1873fde0c432d..15e410401216c 100644 --- a/tools/check_op_desc.py +++ b/tools/check_op_desc.py
@@ -14,6 +14,8 @@ import json import sys +from paddle.utils import OpLastCheckpointChecker +from paddle.fluid.core import OpUpdateType SAME = 0 INPUTS = "Inputs" OUTPUTS = "Outputs" ATTRS = "Attrs" +# The constant `ADD` means that an item has been added. In particular, +# we use `ADD_WITH_DEFAULT` to mean adding attributes with default +# values, and `ADD_DISPENSABLE` to mean adding optional inputs or +# outputs.
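# A hedged walk-through of how these constants are consumed below: if an op
# gains a new non-dispensable input "Bias" (name invented for illustration),
# diff_vars() reports roughly
#
#     {ADD: ["Bias"], ADD_DISPENSABLE: ["Bias"]}
#
# and check_io_registry() then requires a matching kNewInput record from
# OpLastCheckpointChecker for "Bias"; otherwise the CI check reports an
# operator registration error.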
+ADD_WITH_DEFAULT = "Add_with_default" +ADD_DISPENSABLE = "Add_dispensable" ADD = "Add" + DELETE = "Delete" CHANGE = "Change" @@ -35,12 +44,26 @@ error = False +version_update_map = { + INPUTS: { + ADD: OpUpdateType.kNewInput, + }, + OUTPUTS: { + ADD: OpUpdateType.kNewOutput, + }, + ATTRS: { + ADD: OpUpdateType.kNewAttr, + CHANGE: OpUpdateType.kModifyAttr, + }, +} + def diff_vars(origin_vars, new_vars): global error var_error = False var_changed_error_massage = {} - var_added_error_massage = [] + var_add_massage = [] + var_add_dispensable_massage = [] var_deleted_error_massage = [] common_vars_name = set(origin_vars.keys()) & set(new_vars.keys()) @@ -65,13 +88,16 @@ def diff_vars(origin_vars, new_vars): var_deleted_error_massage.append(var_name) for var_name in vars_name_only_in_new: + var_add_massage.append(var_name) if not new_vars.get(var_name).get(DISPENSABLE): error, var_error = True, True - var_added_error_massage.append(var_name) + var_add_dispensable_massage.append(var_name) var_diff_message = {} - if var_added_error_massage: - var_diff_message[ADD] = var_added_error_massage + if var_add_massage: + var_diff_message[ADD] = var_add_massage + if var_add_dispensable_massage: + var_diff_message[ADD_DISPENSABLE] = var_add_dispensable_massage if var_changed_error_massage: var_diff_message[CHANGE] = var_changed_error_massage if var_deleted_error_massage: @@ -86,6 +112,7 @@ def diff_attr(ori_attrs, new_attrs): attr_changed_error_massage = {} attr_added_error_massage = [] + attr_added_def_error_massage = [] attr_deleted_error_massage = [] common_attrs = set(ori_attrs.keys()) & set(new_attrs.keys()) @@ -110,13 +137,16 @@ def diff_attr(ori_attrs, new_attrs): attr_deleted_error_massage.append(attr_name) for attr_name in attrs_only_in_new: + attr_added_error_massage.append(attr_name) if new_attrs.get(attr_name).get(DEFAULT_VALUE) == None: error, attr_error = True, True - attr_added_error_massage.append(attr_name) + attr_added_def_error_massage.append(attr_name) attr_diff_message = {} if attr_added_error_massage: attr_diff_message[ADD] = attr_added_error_massage + if attr_added_def_error_massage: + attr_diff_message[ADD_WITH_DEFAULT] = attr_added_def_error_massage if attr_changed_error_massage: attr_diff_message[CHANGE] = attr_changed_error_massage if attr_deleted_error_massage: @@ -125,15 +155,39 @@ def diff_attr(ori_attrs, new_attrs): return attr_error, attr_diff_message +def check_io_registry(io_type, op, diff): + checker = OpLastCheckpointChecker() + results = {} + for update_type in [ADD]: + for item in diff.get(update_type, {}): + infos = checker.filter_updates( + op, version_update_map[io_type][update_type], item) + if not infos: + results[update_type] = (op, item, io_type) + return results + + +def check_attr_registry(op, diff): + checker = OpLastCheckpointChecker() + results = {} + for update_type in [ADD, CHANGE]: + for item in diff.get(update_type, {}): + infos = checker.filter_updates( + op, version_update_map[ATTRS][update_type], item) + if not infos: + results[update_type] = (op, item) + return results + + def compare_op_desc(origin_op_desc, new_op_desc): origin = json.loads(origin_op_desc) new = json.loads(new_op_desc) - error_message = {} + desc_error_message = {} + version_error_message = {} if cmp(origin_op_desc, new_op_desc) == SAME: - return error_message + return desc_error_message, version_error_message for op_type in origin: - # no need to compare if the operator is deleted if op_type not in new: continue @@ -144,33 +198,47 @@ def compare_op_desc(origin_op_desc, 
new_op_desc): origin_inputs = origin_info.get(INPUTS, {}) new_inputs = new_info.get(INPUTS, {}) ins_error, ins_diff = diff_vars(origin_inputs, new_inputs) + ins_version_errors = check_io_registry(INPUTS, op_type, ins_diff) origin_outputs = origin_info.get(OUTPUTS, {}) new_outputs = new_info.get(OUTPUTS, {}) outs_error, outs_diff = diff_vars(origin_outputs, new_outputs) + outs_version_errors = check_io_registry(OUTPUTS, op_type, outs_diff) origin_attrs = origin_info.get(ATTRS, {}) new_attrs = new_info.get(ATTRS, {}) attrs_error, attrs_diff = diff_attr(origin_attrs, new_attrs) + attrs_version_errors = check_attr_registry(op_type, attrs_diff) if ins_error: - error_message.setdefault(op_type, {})[INPUTS] = ins_diff + desc_error_message.setdefault(op_type, {})[INPUTS] = ins_diff if outs_error: - error_message.setdefault(op_type, {})[OUTPUTS] = outs_diff + desc_error_message.setdefault(op_type, {})[OUTPUTS] = outs_diff if attrs_error: - error_message.setdefault(op_type, {})[ATTRS] = attrs_diff + desc_error_message.setdefault(op_type, {})[ATTRS] = attrs_diff - return error_message + if ins_version_errors: + version_error_message.setdefault(op_type, + {})[INPUTS] = ins_version_errors + if outs_version_errors: + version_error_message.setdefault(op_type, + {})[OUTPUTS] = outs_version_errors + if attrs_version_errors: + version_error_message.setdefault(op_type, + {})[ATTRS] = attrs_version_errors + return desc_error_message, version_error_message -def print_error_message(error_message): - print("Op desc error for the changes of Inputs/Outputs/Attrs of OPs:\n") + +def print_desc_error_message(error_message): + print("\n======================= \n" + "Op desc error for the changes of Inputs/Outputs/Attrs of OPs:\n") for op_name in error_message: print("For OP '{}':".format(op_name)) # 1. print inputs error message Inputs_error = error_message.get(op_name, {}).get(INPUTS, {}) - for name in Inputs_error.get(ADD, {}): + for name in Inputs_error.get(ADD_DISPENSABLE, {}): print(" * The added Input '{}' is not dispensable.".format(name)) for name in Inputs_error.get(DELETE, {}): @@ -186,7 +254,7 @@ def print_error_message(error_message): # 2. print outputs error message Outputs_error = error_message.get(op_name, {}).get(OUTPUTS, {}) - for name in Outputs_error.get(ADD, {}): + for name in Outputs_error.get(ADD_DISPENSABLE, {}): print(" * The added Output '{}' is not dispensable.".format(name)) for name in Outputs_error.get(DELETE, {}): @@ -202,7 +270,7 @@ def print_error_message(error_message): # 3. print attrs error message attrs_error = error_message.get(op_name, {}).get(ATTRS, {}) - for name in attrs_error.get(ADD, {}): + for name in attrs_error.get(ADD_WITH_DEFAULT, {}): print(" * The added attr '{}' doesn't set default value.".format( name)) @@ -218,6 +286,40 @@ def print_error_message(error_message): format(arg, name, ori_value, new_value)) +def print_version_error_message(error_message): + print( + "\n======================= \n" + "Operator registration error for the changes of Inputs/Outputs/Attrs of OPs:\n" + ) + for op_name in error_message: + print("For OP '{}':".format(op_name)) + + # 1. print inputs error message + inputs_error = error_message.get(op_name, {}).get(INPUTS, {}) + tuple = inputs_error.get(ADD, {}) + if tuple: + print(" * The added input '{}' is not yet registered.".format(tuple[ + 1])) + + # 2. 
print outputs error message
+        outputs_error = error_message.get(op_name, {}).get(OUTPUTS, {})
+        tuple = outputs_error.get(ADD, {})
+        if tuple:
+            print(" * The added output '{}' is not yet registered.".format(
+                tuple[1]))
+
+        # 3. print attrs error message
+        attrs_error = error_message.get(op_name, {}).get(ATTRS, {})
+        tuple = attrs_error.get(ADD, {})
+        if tuple:
+            print(" * The added attribute '{}' is not yet registered.".format(
+                tuple[1]))
+        tuple = attrs_error.get(CHANGE, {})
+        if tuple:
+            print(" * The change of attribute '{}' is not yet registered.".
+                  format(tuple[1]))
+
+
 def print_repeat_process():
     print(
         "Tips:"
@@ -241,10 +343,12 @@ def print_repeat_process():
     with open(sys.argv[2], 'r') as f:
         new_op_desc = f.read()
 
-    error_message = compare_op_desc(origin_op_desc, new_op_desc)
+    desc_error_message, version_error_message = compare_op_desc(origin_op_desc,
+                                                                new_op_desc)
     if error:
         print("-" * 30)
-        print_error_message(error_message)
+        print_desc_error_message(desc_error_message)
+        print_version_error_message(version_error_message)
         print("-" * 30)
 else:
     print("Usage: python check_op_desc.py OP_DESC_DEV.spec OP_DESC_PR.spec")

From c42e656179ec9c557848d97c2af4fa78375d4cfc Mon Sep 17 00:00:00 2001
From: Chen Weihang
Date: Thu, 5 Nov 2020 14:18:10 +0800
Subject: [PATCH 125/185] Add retry for dygraph parallel socket bind (#28404)

* add retry for dygraph parallel socket bind

* change to loop always

* fix writing error
---
 paddle/fluid/imperative/nccl_context.cc | 32 +++++++++++++++++--------
 1 file changed, 22 insertions(+), 10 deletions(-)

diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc
index 9ffec11354d8a..abee311d08cf3 100644
--- a/paddle/fluid/imperative/nccl_context.cc
+++ b/paddle/fluid/imperative/nccl_context.cc
@@ -48,9 +48,21 @@ void NCCLParallelContext::RecvNCCLID(const std::string &ep,
   address.sin_addr.s_addr = INADDR_ANY;
   address.sin_port = htons(port);
 
-  if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
-    PADDLE_THROW(
-        platform::errors::Unavailable("Bind on endpoint %s failed.", ep));
+  int try_times = 0;
+  while (true) {
+    if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
+      LOG(WARNING) << "Socket bind worker " << ep
+                   << (try_times < 5 ? " failed, try again after 3 seconds."
+                                     : " failed, try again after 3 seconds. "
+                                       "Binding on this endpoint keeps failing. "
+                                       "Please confirm whether the "
+                                       "communication port or GPU card is "
+                                       "occupied.");
+      std::this_thread::sleep_for(std::chrono::seconds(3));
+      ++try_times;
+      continue;
+    }
+    break;
   }
 
   VLOG(3) << "listening on: " << ep;
@@ -119,13 +131,13 @@ void NCCLParallelContext::SendNCCLID(const std::string &ep,
   int try_times = 0;
   while (true) {
     if (connect(sock, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) {
-      VLOG(0) << "worker: " << ep
-              << (try_times < 5 ? " is not ready, will retry after 3 seconds..."
-                                : " is not ready. Maybe that some process "
-                                  "is occupied the GPUs of this node now, "
-                                  "and you should kill those process manually. "
-                                  "Will retry after 3 seconds...");
-
+      LOG(WARNING)
+          << "Socket connect worker " << ep
+          << (try_times < 5
+                  ? " failed, try again after 3 seconds."
+                  : " failed, try again after 3 seconds. Maybe that "
+                    "some process is occupying the GPUs of this node "
+                    "now, and you should kill those processes manually.");
       std::this_thread::sleep_for(std::chrono::seconds(3));
       ++try_times;
       continue;

From 0155f91679726662b5907599856599fa11d089de Mon Sep 17 00:00:00 2001
From: zhupengyang
Date: Thu, 5 Nov 2020 16:31:56 +0800
Subject: [PATCH 126/185] enable softmax unittest (#28362)

---
 .../fluid/tests/unittests/test_softmax_op.py | 25 ++++++-------------
 1 file changed, 8 insertions(+), 17 deletions(-)

diff --git a/python/paddle/fluid/tests/unittests/test_softmax_op.py b/python/paddle/fluid/tests/unittests/test_softmax_op.py
index 71df2c4acc467..71c4e9c495ea9 100644
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
@@ -267,22 +267,11 @@ def test_check_grad(self):
         pass
 
 
-@unittest.skip('disable TestSoftmaxFP16Op2')
-class TestSoftmaxFP16Op2(TestSoftmaxOp):
-    def init_kernel_type(self):
-        self.dtype = np.float16
-
-    def test_check_output(self):
-        if core.is_compiled_with_cuda():
-            place = core.CUDAPlace(0)
-            if core.is_float16_supported(place):
-                self.check_output_with_place(place, atol=1e-3)
-
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestSoftmaxFP16Op2(TestSoftmaxFP16Op):
     def get_x_shape(self):
-        return [2, 3, 4, 5]
-
-    def test_check_grad(self):
-        pass
+        return [2, 3, 4, 10]
 
 
 @unittest.skipIf(not core.is_compiled_with_cuda(),
@@ -354,10 +343,12 @@ def test_error(self):
             # The input type must be Variable.
             self.assertRaises(TypeError, F.softmax, 1)
             # The input dtype must be float16, float32, float64.
-            x_int32 = paddle.fluid.data(name='x_int32', shape=[2, 3], dtype='int32')
+            x_int32 = paddle.fluid.data(
+                name='x_int32', shape=[2, 3], dtype='int32')
             self.assertRaises(TypeError, F.softmax, x_int32)
             # support the input dtype is float16
-            x_fp16 = paddle.fluid.data(name='x_fp16', shape=[2, 3], dtype='float16')
+            x_fp16 = paddle.fluid.data(
+                name='x_fp16', shape=[2, 3], dtype='float16')
             F.softmax(x_fp16)
 

From dc6b2321ff3974127a44fc1c996ae165309c6b7e Mon Sep 17 00:00:00 2001
From: Tao Luo
Date: Thu, 5 Nov 2020 18:04:47 +0800
Subject: [PATCH 127/185] remove unused pyc file (#28449)

---
 tools/static_mode_white_list.pyc | Bin 21082 -> 0 bytes
 1 file changed, 0 insertions(+), 0 deletions(-)
 delete mode 100644 tools/static_mode_white_list.pyc

diff --git a/tools/static_mode_white_list.pyc b/tools/static_mode_white_list.pyc
deleted file mode 100644
index 7d2a45c248ce271c1c4fff310505a172339e5eee..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

[21082 bytes of base85-encoded binary data elided; the hunk only records the deletion of this compiled file]
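Two of the patches above lend themselves to short standalone illustrations. First, the bind-retry loop added in PATCH 125: a rough Python equivalent of the same pattern (the endpoint format and the fixed 3-second wait mirror the C++ change; everything else here is invented):

    import socket
    import time

    def bind_with_retry(endpoint, wait_seconds=3):
        host, port = endpoint.rsplit(":", 1)
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
        while True:
            try:
                sock.bind((host, int(port)))
                return sock  # success: the caller can now listen and accept
            except OSError:
                # mirrors the C++ loop: warn, sleep, and keep retrying forever
                print("bind on %s failed, retrying in %ds" % (endpoint, wait_seconds))
                time.sleep(wait_seconds)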
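Second, the numerically stable softmax that the re-enabled fp16 tests in PATCH 126 compare against: subtracting the row maximum bounds the argument of exp(), so low-precision floats cannot overflow. A generic sketch, not Paddle's exact test utility:

    import numpy as np

    def stable_softmax(x):
        # max-subtraction changes nothing mathematically but prevents overflow
        shifted = x - np.max(x, axis=-1, keepdims=True)
        e = np.exp(shifted)
        return e / e.sum(axis=-1, keepdims=True)

    print(stable_softmax(np.array([[1000.0, 1001.0]])))  # no overflow warning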
z7CrQhMCxk$X4-3HzT{*{exa`&8DDxoVq8!Fy|&3q`@8FXvR$hNq;?z4M+da{z1sNp z)aj%iMp#yhv+m@k96fBKq(X4};KS88fhd(`W2usfRm@J94UYziJXzOvtF=t+8q3LD z{AFp3fym5MKVT}Ku^xET-Jw3I;1*;Es~Vamd#Jtn5JL%(GROWIga?-zMY1hN7WcI% zXmh3&Gds>fqu6X%%{_qHKO|WQ-nh}X7y7J4t;OL!M%zE0p1EZ3^SS zdBL1rQ?R7boM+Zn+e&3BPH2CrEex5a6UoX7hYuiR=9+6<>sJlNRV??BaAu79wzJc> z?L$)q*|WVf|}8wfJFE)W~wZ1dh>5M zsYSMpA?I`5cj@%DR-#SE8c2in@^qB*Y%keVQ}~dDzQswwf(K3m?21l(aq!+!eB4_*kIIL2 z)HRy%Vcn{5;%ITG6Cx8Ag{9ce_|lb-Qlpz!F*#Q_O=2o88CM8oK5UZ3pC0Www2cee zR`UQl=uMfm@%Ix}TdIq(madaIW0Kjv#_|gk6Oy#v8yvKltTfT0`*s@D4on(tu|-{^ z9yiTwUwIf1jG>-Z{z(R78Tm*NVnjk=Uowf*!V4o#Z5)~u))b;HF?86hp*o}(*-5~_ zVN_7{)yDXJ3GL$6?P5xj>;686E$s9PK%tgJCY9lyn%1hdF-xeUoP}NcTY+XC=$=zd z;S^=FiUc{AgA^uJM|~KRels@j zf<_BQI+b89Q!Z2RY^Ew@*lkTgLqcsMtAn;1*rPk6Pl6x=gFK!py^!)4*Gy0UqO(oq zj6#i;IEHj=iirl}gbn(;ZFz@2RqG3_M6A)7_+-(2H&Sj(E5@n+XQ5)(DEG_sB51Uw zP%_bIYc@QVS?FfFE~$Uo@9u1r)3SA&H0#>6;v|#*&Y|jy-~Q3@ZF^?%G2l!e?5&kj z+UkmP#q4Z9dR)<+S@7yq>>eAM0~{1YIAtS0YV-vK&*_(Liw>+xq-ftSy$07QmYf{F z9HhgXHmWrGB(IO!eSrn@{4J;6P<4f%5l!wMN)r1daCqi1sxt= zTvv1q&XukS!1urXseEBJxpvlX(3Vxb@Z?)=ybfxpFU+{jBDBBR8|Et@Bgj6jwwo@& z&IO+fBy}*>D7019WU6k}i_fyqmNUzOOJ?0vmtH{nyv7KzWa+9(k0SerT&C*7=Bn0A z+uv@z*uqP4V${)RAc;B-;X~;(#+*!YB^uW|50k-6Z{>!ZO^p_{g=_rg8<{CBY?;*A z#95)6B`UwhCbCV=%swq?7u!ch2efkR+^admmqhykXk4}IFlw?hp(Wr!FUZ7!D8>XX zX6^5;URu#KH=f$@<+~aulEZx1N>jF0_UvY>1v@@nn?tRvhA5n4)Td$AJfm8Jl!^G} zN{$xgWLw+Bnk4rRf=xz`Qik-oFiX_FNDJgfHf*n&R{iTO6HNu~9x-ENw%0#@SUYSB z=BjoyeYDOY-IJu{w$?bDe)jsdt6OlUv04|g>r;!0hq!fwAmedZT0 z9_sB!Gj?c`D{~S1$D~uXJs5B6T;i}S=Tjc0jH6|beA7`Qaat>J%P?P{~7U2P{NMH-nXYJTGFq&DL_0+?qrz`f_p#2zv34_P@Z9;3u?~9*p zlgsL@Y(sc9oxVQsW70?`liaS^mQPRZxz0+Y7g6~LB9-G3Gm~^Co~LG%cMbWO(1@#$ zAqip5f=e|!{J6kpC%kD>a}E0C^)Y6C+c87bDgA{KYWVh# z&pHk#jq-J@pmosTjj?9l4jk)iS{SuP99M1qDq?leUHbPfzLb$--vbsOV^pP_btm%& zEI!P|apU2M4>4kRl+?fd;==ll#mCxpJUyP}Dekq6(qPiReG$YbHD=80$Q+`HHBa8X z1nl=o5A{r**d1az!#}kf)P;&e6JQOcHpJ21h6fB9nIyXK*_Anr!N)_LY;iqv7~Lu8 z?eE-+9?^E?4t?-M{kCFjeCVvf#NoP%zJzrZ?dux*_58#*Uwq8)RQhd&W9+hncan}Z zHf?CgV1WaNXrswj8OpjtZDPk?mG_NH+u;U74p?9_O!gsW2eoPGU?WE-6OJ*Tpf}h! zjd7@}4{xAt(GGpUqExE(a{$NL6>I-kz_Dh*dY9oSyP8(a9c;eA#bD!{<`aE{Pq(nK z10|gpd`p_tHQ8x%3B!?{(5HfR2lp8k*LE0J$gV=-Lr?3?^ZHOfap!giw#LjwH>Iun zbbI*9mLjyuO>0V}HLmlc3&xLRyr%kXDa_c~sTY?@Wxl~Ijn4E#=)rS07y0zMl|7T1 zEi_w8&qE8q(5_FX=e@M=iP0?dw^MAAPYF*0zwW8ttmx-#&e7+=kpK03D~%EzOGiC!Mr~S_<)8rc3bCw_pw(T=TG3?OJHR z26t{hTJLf<<7zuBJ=IxMWU7lNkGr1{CeX_BuF33$D#AAFpl$flijtXRoy<46t>`BP z|M2P8Fob=@}`qa0yQ!2>c4$fd8V1%;WG|5X&Jv6u2cWJH+OB;?wXv87P+6^ ze+(ve!yZmdc9 zP-oOccX)Hw>5B`_<<(N@e|^#JWp+2o_A*nRMw{?<;8X-m7`u39ViM5ushPk1NeJU< zm!uCT8Ff2c+gx8?uk)mb`#YL)w7rKk*t}Wa0^5Lg|9uOEo_|VELDC@Of(!3`;d%GX z9`ew0&dn}7_ks)0JOBRKgU`F*Le0Iz|7h;PpQq~=Y}4X&Z9{i|KYZF5r$4NPE?InR zPVcHjT5if%Bho@6yM3oE_R>3|yA#ewg7;MFH78$I Date: Thu, 5 Nov 2020 21:27:18 +0800 Subject: [PATCH 128/185] fix sample code (#28446) --- python/paddle/nn/functional/common.py | 24 +++++++++++------------- python/paddle/nn/layer/common.py | 11 ++++------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/python/paddle/nn/functional/common.py b/python/paddle/nn/functional/common.py index 0b18dec943d5f..1cf3599e846b9 100644 --- a/python/paddle/nn/functional/common.py +++ b/python/paddle/nn/functional/common.py @@ -1226,26 +1226,23 @@ def pad(x, pad, mode='constant', value=0, data_format="NCHW", name=None): Code Examples: .. 
code-block:: python + import numpy as np import paddle import paddle.nn.functional as F - paddle.disable_static() - # example 1 x_shape = (1, 1, 3) - x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1 - tensor_x = paddle.to_tensor(x) - y = F.pad(tensor_x, pad=[2, 3], value=1, mode='constant') - print(y.numpy()) + x = paddle.arange(np.prod(x_shape), dtype="float32").reshape(x_shape) + 1 + y = F.pad(x, [2, 3], value=1, mode='constant', data_format="NCL") + print(y) # [[[1. 1. 1. 2. 3. 1. 1. 1.]]] - + # example 2 x_shape = (1, 1, 2, 3) - x = np.arange(np.prod(x_shape), dtype=np.float32).reshape(x_shape) + 1 - tensor_x = paddle.to_tensor(x) - y = F.pad(tensor_x, pad=[1, 2, 1, 1], value=1, mode='circular') - print(y.numpy()) + x = paddle.arange(np.prod(x_shape), dtype="float32").reshape(x_shape) + 1 + y = F.pad(x, [1, 2, 1, 1], value=1, mode='circular') + print(y) # [[[[6. 4. 5. 6. 4. 5.] # [3. 1. 2. 3. 1. 2.] # [6. 4. 5. 6. 4. 5.] @@ -1361,6 +1358,7 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): Examples: .. code-block:: text + Case 0: x1 = [[0.8024077 0.9927354 0.27238318 0.8344984 ] [0.48949873 0.5797396 0.65444374 0.66510963] @@ -1376,10 +1374,10 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): Code Examples: .. code-block:: python + import paddle import paddle.nn as nn import numpy as np - paddle.disable_static() np.random.seed(0) x1 = np.random.rand(2,3) @@ -1387,7 +1385,7 @@ def cosine_similarity(x1, x2, axis=1, eps=1e-8): x1 = paddle.to_tensor(x1) x2 = paddle.to_tensor(x2) result = paddle.nn.functional.cosine_similarity(x1, x2, axis=0) - print(result.numpy()) + print(result) # [0.99806249 0.9817672 0.94987036] """ diff --git a/python/paddle/nn/layer/common.py b/python/paddle/nn/layer/common.py index ad8263e48356c..6e3910745e157 100644 --- a/python/paddle/nn/layer/common.py +++ b/python/paddle/nn/layer/common.py @@ -744,7 +744,6 @@ class Pad1D(layers.Layer): import paddle import paddle.nn as nn import numpy as np - paddle.disable_static() input_shape = (1, 2, 3) pad = [1, 2] @@ -752,7 +751,7 @@ class Pad1D(layers.Layer): data = paddle.arange(np.prod(input_shape), dtype="float32").reshape(input_shape) + 1 my_pad = nn.Pad1D(padding=pad, mode=mode) result = my_pad(data) - print(result.numpy()) + print(result) # [[[0. 1. 2. 3. 0. 0.] # [0. 4. 5. 6. 0. 0.]]] """ @@ -821,14 +820,13 @@ class Pad2D(layers.Layer): import paddle import paddle.nn as nn import numpy as np - paddle.disable_static() input_shape = (1, 1, 2, 3) pad = [1, 0, 1, 2] mode = "constant" data = paddle.arange(np.prod(input_shape), dtype="float32").reshape(input_shape) + 1 my_pad = nn.Pad2D(padding=pad, mode=mode) result = my_pad(data) - print(result.numpy()) + print(result) # [[[[0. 0. 0. 0.] # [0. 1. 2. 3.] # [0. 4. 5. 6.] @@ -906,7 +904,7 @@ class Pad3D(layers.Layer): data = paddle.arange(np.prod(input_shape), dtype="float32").reshape(input_shape) + 1 my_pad = nn.Pad3D(padding=pad, mode=mode) result = my_pad(data) - print(result.numpy()) + print(result) # [[[[[0. 0. 0. 0.] # [0. 1. 2. 3.] # [0. 4. 5. 6.] 
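As a cross-check of the Pad2D docstring output shown above, the same example in plain numpy; the mapping of pad = [1, 0, 1, 2] onto (left, right, top, bottom) of the last two axes is the assumption being illustrated:

    import numpy as np

    x = np.arange(6, dtype="float32").reshape(1, 1, 2, 3) + 1
    left, right, top, bottom = 1, 0, 1, 2
    y = np.pad(x, ((0, 0), (0, 0), (top, bottom), (left, right)), mode="constant")
    print(y)  # reproduces the padded rows shown in the Pad2D example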
@@ -968,7 +966,6 @@ class CosineSimilarity(layers.Layer): import paddle import paddle.nn as nn import numpy as np - paddle.disable_static() np.random.seed(0) x1 = np.random.rand(2,3) @@ -978,7 +975,7 @@ class CosineSimilarity(layers.Layer): cos_sim_func = nn.CosineSimilarity(axis=0) result = cos_sim_func(x1, x2) - print(result.numpy()) + print(result) # [0.99806249 0.9817672 0.94987036] """ From bd8dfe38caef4d4769fe2a57263fbe90a6891592 Mon Sep 17 00:00:00 2001 From: Aurelius84 Date: Fri, 6 Nov 2020 09:48:31 +0800 Subject: [PATCH 129/185] [Dy2Stat] Refine code example for 2.0 (#28440) --- python/paddle/fluid/dygraph/jit.py | 32 +++++++++++++----------------- 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 4e026dab662c0..3f9d5fb97973f 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -176,24 +176,20 @@ def declarative(function=None, input_spec=None): Examples: .. code-block:: python - import paddle.fluid as fluid - import numpy as np - from paddle.fluid.dygraph.jit import declarative - - fluid.enable_dygraph() - - @declarative - def func(x): - x = fluid.dygraph.to_variable(x) - if fluid.layers.mean(x) < 0: - x_v = x - 1 - else: - x_v = x + 1 - return x_v - - x = np.ones([1, 2]) - x_v = func(x) - print(x_v.numpy()) # [[2. 2.]] + import paddle + from paddle.jit import to_static + + @to_static + def func(x): + if paddle.mean(x) < 0: + x_v = x - 1 + else: + x_v = x + 1 + return x_v + + x = paddle.ones([1, 2], dtype='float32') + x_v = func(x) + print(x_v) # [[2. 2.]] """ From ba0fe0a8128753d4a7c55e1d98cffbf08369e362 Mon Sep 17 00:00:00 2001 From: iducn <45056973+iducn@users.noreply.github.com> Date: Fri, 6 Nov 2020 10:09:52 +0800 Subject: [PATCH 130/185] revert the modified shell script (#28453) --- paddle/.set_port.sh | 6 +- paddle/.set_python_path.sh | 8 +- paddle/fluid/inference/api/demo_ci/clean.sh | 3 +- paddle/fluid/inference/api/demo_ci/run.sh | 175 +++++++++--------- paddle/fluid/inference/check_symbol.sh | 12 +- paddle/fluid/train/demo/clean.sh | 2 +- paddle/fluid/train/demo/run.sh | 12 +- paddle/fluid/train/imdb_demo/run.sh | 2 +- paddle/scripts/paddle_docker_build.sh | 32 ++-- tools/cudaError/start.sh | 4 +- .../dockerfile/build_scripts/install_nccl2.sh | 4 +- tools/gen_alias_mapping.sh | 4 +- .../manylinux1/build_scripts/install_nccl2.sh | 21 +-- 13 files changed, 131 insertions(+), 154 deletions(-) diff --git a/paddle/.set_port.sh b/paddle/.set_port.sh index e71f494aadf2c..617ac79a24889 100755 --- a/paddle/.set_port.sh +++ b/paddle/.set_port.sh @@ -13,6 +13,6 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-DIRNAME="$(dirname "$0")" -sh "$DIRNAME"/.common_test_util.sh -set_port "$@" +DIRNAME=`dirname $0` +source $DIRNAME/.common_test_util.sh +set_port $@ diff --git a/paddle/.set_python_path.sh b/paddle/.set_python_path.sh index 8da4565be617b..8fd58925ee482 100755 --- a/paddle/.set_python_path.sh +++ b/paddle/.set_python_path.sh @@ -24,14 +24,12 @@ PYPATH="" set -x while getopts "d:" opt; do - case "$opt" in + case $opt in d) PYPATH=$OPTARG ;; - *) - ;; esac done -shift $(("$OPTIND" - 1)) +shift $(($OPTIND - 1)) export PYTHONPATH=$PYPATH:$PYTHONPATH -"$@" +$@ diff --git a/paddle/fluid/inference/api/demo_ci/clean.sh b/paddle/fluid/inference/api/demo_ci/clean.sh index 5f603465776f1..0d9f3d2aa237a 100755 --- a/paddle/fluid/inference/api/demo_ci/clean.sh +++ b/paddle/fluid/inference/api/demo_ci/clean.sh @@ -1,5 +1,4 @@ -#!/bin/bash set -x -cd "$(dirname "$0")" || exit +cd `dirname $0` rm -rf build/ data/ set +x diff --git a/paddle/fluid/inference/api/demo_ci/run.sh b/paddle/fluid/inference/api/demo_ci/run.sh index aee013e8f3652..6d283ca56cb65 100755 --- a/paddle/fluid/inference/api/demo_ci/run.sh +++ b/paddle/fluid/inference/api/demo_ci/run.sh @@ -1,29 +1,29 @@ #!/bin/bash set -x -PADDLE_ROOT="$1" -TURN_ON_MKL="$2" # use MKL or Openblas -TEST_GPU_CPU="$3" # test both GPU/CPU mode or only CPU mode -DATA_DIR="$4" # dataset -TENSORRT_INCLUDE_DIR="$5" # TensorRT header file dir, default to /usr/local/TensorRT/include -TENSORRT_LIB_DIR="$6" # TensorRT lib file dir, default to /usr/local/TensorRT/lib -MSVC_STATIC_CRT="$7" -inference_install_dir="${PADDLE_ROOT}"/build/paddle_inference_install_dir +PADDLE_ROOT=$1 +TURN_ON_MKL=$2 # use MKL or Openblas +TEST_GPU_CPU=$3 # test both GPU/CPU mode or only CPU mode +DATA_DIR=$4 # dataset +TENSORRT_INCLUDE_DIR=$5 # TensorRT header file dir, default to /usr/local/TensorRT/include +TENSORRT_LIB_DIR=$6 # TensorRT lib file dir, default to /usr/local/TensorRT/lib +MSVC_STATIC_CRT=$7 +inference_install_dir=${PADDLE_ROOT}/build/paddle_inference_install_dir -cd "$(dirname "$0")" || exit -current_dir=$(pwd) -if [ "$2" == ON ]; then +cd `dirname $0` +current_dir=`pwd` +if [ $2 == ON ]; then # You can export yourself if move the install path - MKL_LIB="${inference_install_dir}"/third_party/install/mklml/lib - export LD_LIBRARY_PATH="$LD_LIBRARY_PATH":"${MKL_LIB}" + MKL_LIB=${inference_install_dir}/third_party/install/mklml/lib + export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:${MKL_LIB} fi -if [ "$3" == ON ]; then +if [ $3 == ON ]; then use_gpu_list='true false' else use_gpu_list='false' fi USE_TENSORRT=OFF -if [ -d "$TENSORRT_INCLUDE_DIR" ] && [ -d "$TENSORRT_LIB_DIR" ]; then +if [ -d "$TENSORRT_INCLUDE_DIR" -a -d "$TENSORRT_LIB_DIR" ]; then USE_TENSORRT=ON fi @@ -32,79 +32,77 @@ URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX} # download vis_demo data function download() { - dir_name="$1" - mkdir -p "$dir_name" - cd "$dir_name" || exit + dir_name=$1 + mkdir -p $dir_name + cd $dir_name if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then echo "${PREFIX}${dir_name}.tar.gz has been downloaded." else - wget -q "${URL_ROOT}""$dir_name".tar.gz - tar xzf ./*.tar.gz + wget -q ${URL_ROOT}$dir_name.tar.gz + tar xzf *.tar.gz fi - cd .. || exit + cd .. 
} -mkdir -p "$DATA_DIR" -cd "$DATA_DIR" || exit +mkdir -p $DATA_DIR +cd $DATA_DIR vis_demo_list='se_resnext50 ocr mobilenet' for vis_demo_name in $vis_demo_list; do - download "$vis_demo_name" + download $vis_demo_name done # download word2vec data mkdir -p word2vec -cd word2vec || exit +cd word2vec if [[ -e "word2vec.inference.model.tar.gz" ]]; then echo "word2vec.inference.model.tar.gz has been downloaded." else wget -q http://paddle-inference-dist.bj.bcebos.com/word2vec.inference.model.tar.gz - tar xzf ./*.tar.gz + tar xzf *.tar.gz fi # compile and test the demo -cd "$current_dir" || exit +cd $current_dir mkdir -p build -cd build || exit -rm -rf ./* +cd build +rm -rf * for WITH_STATIC_LIB in ON OFF; do - if [ "$(uname | grep Win)" != "" ]; then + if [ $(echo `uname` | grep "Win") != "" ]; then # -----simple_on_word2vec on windows----- - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB="${inference_install_dir}" \ - -DWITH_MKL="$TURN_ON_MKL" \ + cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ - -DWITH_GPU="$TEST_GPU_CPU" \ - -DWITH_STATIC_LIB="$WITH_STATIC_LIB" \ - -DMSVC_STATIC_CRT="$MSVC_STATIC_CRT" + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do Release/simple_on_word2vec.exe \ - --dirname="$DATA_DIR"/word2vec/word2vec.inference.model \ - --use_gpu="$use_gpu" - EXCODE="$?" - if [ "$EXCODE" -ne 0 ]; then + --dirname=$DATA_DIR/word2vec/word2vec.inference.model \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then echo "simple_on_word2vec demo runs fail." exit 1 fi done # -----vis_demo on windows----- - rm -rf ./* - cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB="${inference_install_dir}" \ - -DWITH_MKL="$TURN_ON_MKL" \ + rm -rf * + cmake .. -G "Visual Studio 14 2015" -A x64 -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ - -DWITH_GPU="$TEST_GPU_CPU" \ - -DWITH_STATIC_LIB="$WITH_STATIC_LIB" \ - -DMSVC_STATIC_CRT="$MSVC_STATIC_CRT" + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DMSVC_STATIC_CRT=$MSVC_STATIC_CRT msbuild /maxcpucount /property:Configuration=Release cpp_inference_demo.sln for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do Release/vis_demo.exe \ - --modeldir="$DATA_DIR"/"$vis_demo_name"/model \ - --data="$DATA_DIR"/"$vis_demo_name"/data.txt \ - --refer="$DATA_DIR"/"$vis_demo_name"/result.txt \ - --use_gpu="$use_gpu" - EXCODE="$?" - if [ "$EXCODE" -ne 0 ]; then + --modeldir=$DATA_DIR/$vis_demo_name/model \ + --data=$DATA_DIR/$vis_demo_name/data.txt \ + --refer=$DATA_DIR/$vis_demo_name/result.txt \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then echo "vis demo $vis_demo_name runs fail." exit 1 fi @@ -112,66 +110,63 @@ for WITH_STATIC_LIB in ON OFF; do done else # -----simple_on_word2vec on linux/mac----- - rm -rf ./* - cmake .. -DPADDLE_LIB="${inference_install_dir}" \ - -DWITH_MKL="$TURN_ON_MKL" \ + rm -rf * + cmake .. 
-DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=simple_on_word2vec \ - -DWITH_GPU="$TEST_GPU_CPU" \ - -DWITH_STATIC_LIB="$WITH_STATIC_LIB" - make -j"$(nproc)" - word2vec_model="$DATA_DIR"'/word2vec/word2vec.inference.model' - if [ -d "$word2vec_model" ]; then + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB + make -j$(nproc) + word2vec_model=$DATA_DIR'/word2vec/word2vec.inference.model' + if [ -d $word2vec_model ]; then for use_gpu in $use_gpu_list; do ./simple_on_word2vec \ - --dirname="$DATA_DIR"/word2vec/word2vec.inference.model \ - --use_gpu="$use_gpu" - EXCODE="$?" - if [ "$EXCODE" -ne 0 ]; then + --dirname=$DATA_DIR/word2vec/word2vec.inference.model \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then echo "simple_on_word2vec demo runs fail." exit 1 fi done fi # ---------vis_demo on linux/mac--------- - rm -rf ./* - cmake .. -DPADDLE_LIB="${inference_install_dir}" \ - -DWITH_MKL="$TURN_ON_MKL" \ + rm -rf * + cmake .. -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=vis_demo \ - -DWITH_GPU="$TEST_GPU_CPU" \ - -DWITH_STATIC_LIB="$WITH_STATIC_LIB" - make -j"$(nproc)" + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB + make -j$(nproc) for use_gpu in $use_gpu_list; do for vis_demo_name in $vis_demo_list; do ./vis_demo \ - --modeldir="$DATA_DIR"/"$vis_demo_name"/model \ - --data="$DATA_DIR"/"$vis_demo_name"/data.txt \ - --refer="$DATA_DIR"/"$vis_demo_name"/result.txt \ - --use_gpu="$use_gpu" - EXCODE="$?" - if [ "$EXCODE" -ne 0 ]; then + --modeldir=$DATA_DIR/$vis_demo_name/model \ + --data=$DATA_DIR/$vis_demo_name/data.txt \ + --refer=$DATA_DIR/$vis_demo_name/result.txt \ + --use_gpu=$use_gpu + if [ $? -ne 0 ]; then echo "vis demo $vis_demo_name runs fail." exit 1 fi done done # --------tensorrt mobilenet on linux/mac------ - if [ "$USE_TENSORRT" == ON ] && [ "$TEST_GPU_CPU" == ON ]; then - rm -rf ./* - cmake .. -DPADDLE_LIB="${inference_install_dir}" \ - -DWITH_MKL="$TURN_ON_MKL" \ + if [ $USE_TENSORRT == ON -a $TEST_GPU_CPU == ON ]; then + rm -rf * + cmake .. -DPADDLE_LIB=${inference_install_dir} \ + -DWITH_MKL=$TURN_ON_MKL \ -DDEMO_NAME=trt_mobilenet_demo \ - -DWITH_GPU="$TEST_GPU_CPU" \ - -DWITH_STATIC_LIB="$WITH_STATIC_LIB" \ - -DUSE_TENSORRT="$USE_TENSORRT" \ - -DTENSORRT_INCLUDE_DIR="$TENSORRT_INCLUDE_DIR" \ - -DTENSORRT_LIB_DIR="$TENSORRT_LIB_DIR" - make -j"$(nproc)" + -DWITH_GPU=$TEST_GPU_CPU \ + -DWITH_STATIC_LIB=$WITH_STATIC_LIB \ + -DUSE_TENSORRT=$USE_TENSORRT \ + -DTENSORRT_INCLUDE_DIR=$TENSORRT_INCLUDE_DIR \ + -DTENSORRT_LIB_DIR=$TENSORRT_LIB_DIR + make -j$(nproc) ./trt_mobilenet_demo \ - --modeldir="$DATA_DIR"/mobilenet/model \ - --data="$DATA_DIR"/mobilenet/data.txt \ - --refer="$DATA_DIR"/mobilenet/result.txt - EXCODE="$?" - if [ "$EXCODE" != 0 ]; then + --modeldir=$DATA_DIR/mobilenet/model \ + --data=$DATA_DIR/mobilenet/data.txt \ + --refer=$DATA_DIR/mobilenet/result.txt + if [ $? -ne 0 ]; then echo "trt demo trt_mobilenet_demo runs fail." 
exit 1 fi diff --git a/paddle/fluid/inference/check_symbol.sh b/paddle/fluid/inference/check_symbol.sh index 0c66946c4b8a1..a0f64796576c8 100755 --- a/paddle/fluid/inference/check_symbol.sh +++ b/paddle/fluid/inference/check_symbol.sh @@ -1,12 +1,12 @@ #!/bin/sh -lib="$1" -if [ "$#" -ne 1 ]; then echo "No input library"; exit 1 ; fi +lib=$1 +if [ $# -ne 1 ]; then echo "No input library"; exit -1 ; fi -num_paddle_syms=$(nm -D "${lib}" | grep -c paddle ) -num_google_syms=$(nm -D "${lib}" | grep google | grep -v paddle | grep -c "T " ) +num_paddle_syms=$(nm -D ${lib} | grep paddle | wc -l) +num_google_syms=$(nm -D ${lib} | grep google | grep -v paddle | grep "T " | wc -l) -if [ "$num_paddle_syms" -le 0 ]; then echo "Have no paddle symbols"; exit 1 ; fi -if [ "$num_google_syms" -ge 1 ]; then echo "Have some google symbols"; exit 1 ; fi +if [ $num_paddle_syms -le 0 ]; then echo "Have no paddle symbols"; exit -1 ; fi +if [ $num_google_syms -ge 1 ]; then echo "Have some google symbols"; exit -1 ; fi exit 0 diff --git a/paddle/fluid/train/demo/clean.sh b/paddle/fluid/train/demo/clean.sh index 192bdf8752c15..a2064492c08b8 100755 --- a/paddle/fluid/train/demo/clean.sh +++ b/paddle/fluid/train/demo/clean.sh @@ -15,6 +15,6 @@ # limitations under the License. set -x -cd "$(dirname "$0")" || exit +cd "$(dirname "$0")" rm -rf build/ set +x diff --git a/paddle/fluid/train/demo/run.sh b/paddle/fluid/train/demo/run.sh index a9c0ed4ac68a2..2955e7574daa2 100755 --- a/paddle/fluid/train/demo/run.sh +++ b/paddle/fluid/train/demo/run.sh @@ -14,14 +14,14 @@ function download() { download # build demo trainer -paddle_install_dir="${PADDLE_ROOT}"/build/paddle_install_dir +paddle_install_dir=${PADDLE_ROOT}/build/paddle_install_dir mkdir -p build -cd build || exit -rm -rf ./* -cmake .. -DPADDLE_LIB="$paddle_install_dir" \ - -DWITH_MKLDNN="$TURN_ON_MKL" \ - -DWITH_MKL="$TURN_ON_MKL" +cd build +rm -rf * +cmake .. -DPADDLE_LIB=$paddle_install_dir \ + -DWITH_MKLDNN=$TURN_ON_MKL \ + -DWITH_MKL=$TURN_ON_MKL make cd .. diff --git a/paddle/fluid/train/imdb_demo/run.sh b/paddle/fluid/train/imdb_demo/run.sh index 8a585c614e53f..f71b4bac602a9 100644 --- a/paddle/fluid/train/imdb_demo/run.sh +++ b/paddle/fluid/train/imdb_demo/run.sh @@ -1,3 +1,3 @@ -#!/bin/bash + set -exu build/demo_trainer --flagfile="train.cfg" diff --git a/paddle/scripts/paddle_docker_build.sh b/paddle/scripts/paddle_docker_build.sh index fdd0d490a6fdb..d6b639d0da2a5 100755 --- a/paddle/scripts/paddle_docker_build.sh +++ b/paddle/scripts/paddle_docker_build.sh @@ -15,14 +15,14 @@ # limitations under the License. function start_build_docker() { - docker pull "$IMG" + docker pull $IMG apt_mirror='s#http://archive.ubuntu.com/ubuntu#mirror://mirrors.ubuntu.com/mirrors.txt#g' DOCKER_ENV=$(cat <\t,,,... -PADDLE_ROOT="$(dirname "$(readlink -f "${BASH_SOURCE[0]}")")/.." +PADDLE_ROOT="$(dirname $(readlink -f ${BASH_SOURCE[0]}))/.." -find "${PADDLE_ROOT}"/python/ -name '*.py' \ +find ${PADDLE_ROOT}/python/ -name '*.py' \ | xargs grep -v '^#' \ | grep 'DEFINE_ALIAS' \ | perl -ne ' diff --git a/tools/manylinux1/build_scripts/install_nccl2.sh b/tools/manylinux1/build_scripts/install_nccl2.sh index c2adf6a79de4b..0c9bf1409d90d 100644 --- a/tools/manylinux1/build_scripts/install_nccl2.sh +++ b/tools/manylinux1/build_scripts/install_nccl2.sh @@ -1,19 +1,4 @@ #!/bin/bash - -# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - VERSION=$(nvcc --version | grep release | grep -oEi "release ([0-9]+)\.([0-9])"| sed "s/release //") if [ "$VERSION" == "10.0" ]; then DEB="nccl-repo-ubuntu1604-2.4.7-ga-cuda10.0_1-1_amd64.deb" @@ -39,10 +24,10 @@ wget -q -O $DIR/$DEB $URL cd $DIR && ar x $DEB && tar xf data.tar.xz DEBS=$(find ./var/ -name "*.deb") for sub_deb in $DEBS; do - echo "$sub_deb" - ar x "$sub_deb" && tar xf data.tar.xz + echo $sub_deb + ar x $sub_deb && tar xf data.tar.xz done mv -f usr/include/nccl.h /usr/local/include/ mv -f usr/lib/x86_64-linux-gnu/libnccl* /usr/local/lib/ rm /usr/include/nccl.h -rm -rf "$DIR" +rm -rf $DIR From ba036b88519ffd0ab78eddd68728d772891e25f2 Mon Sep 17 00:00:00 2001 From: gongweibao Date: Fri, 6 Nov 2020 10:10:32 +0800 Subject: [PATCH 131/185] remove shellcheck test=develop (#28457) --- .pre-commit-config.yaml | 8 -------- 1 file changed, 8 deletions(-) diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c8af603d53363..fc7e70619e3be 100755 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -50,11 +50,3 @@ repos: language: system files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|proto|py|sh)$ exclude: (?!.*third_party)^.*$ | (?!.*book)^.*$ -- repo: local - hooks: - - id: shellcheck - name: shellcheck - entry: shellcheck - language: system - files: .sh$ - exclude: (paddle_build.sh|fast_install.sh|check_file_diff_approvals.sh) From 7821759d48f7b09b33303478d3829b124768afe3 Mon Sep 17 00:00:00 2001 From: "joanna.wozna.intel" Date: Fri, 6 Nov 2020 03:11:26 +0100 Subject: [PATCH 132/185] Add bfloat16 softmax and gelu (#28394) * Add bfloat16 softmax and gelu * Add pass attr bfloat16_enabled_op_types * Changes from review --- .../framework/ir/graph_pattern_detector.cc | 5 +- .../cpu_bfloat16_placement_pass_tester.cc | 8 +- .../inference/analysis/ir_pass_manager.cc | 4 + paddle/fluid/operators/gelu_op.cc | 5 + .../operators/mkldnn/activation_mkldnn_op.cc | 25 +++-- .../operators/mkldnn/softmax_mkldnn_op.cc | 3 +- paddle/fluid/operators/softmax_op.cc | 5 + .../mkldnn/test_activation_mkldnn_op.py | 78 +++++++++++++++- .../mkldnn/test_softmax_bf16_mkldnn_op.py | 92 +++++++++++++++++++ tools/static_mode_white_list.py | 1 + 10 files changed, 211 insertions(+), 15 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 4f1080952a11e..5704dd09cf287 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -2101,8 +2101,9 @@ PDNode *patterns::QuantizePlacement::operator()( PDNode *patterns::Bfloat16Placement::operator()( const std::unordered_set &bfloat16_enabled_op_types) { std::unordered_set supported_op_types = - std::unordered_set( - {"concat", "conv2d", "fusion_gru", "reshape2", "transpose2", "sum"}); + std::unordered_set({"concat", "conv2d", "fusion_gru", "gelu", + "reshape2", "softmax", "sum", + "transpose2"}); if (!bfloat16_enabled_op_types.empty()) { supported_op_types = bfloat16_enabled_op_types; } diff --git 
a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc index 4ca9724026a9c..4e3704e510c87 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_bfloat16_placement_pass_tester.cc @@ -33,7 +33,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, if (type == "conv2d") { op->SetAttr("name", name); op->SetInput("Input", {inputs[0]}); - } else if (type == "relu") { + } else if (type == "gelu") { op->SetInput("X", inputs); } else if (type == "concat") { op->SetAttr("axis", 1); @@ -71,7 +71,7 @@ ProgramDesc BuildProgramDesc() { SetOp(&prog, "concat", "concat1", {"a", "b"}, {"c"}); SetOp(&prog, "conv2d", "conv1", {"c"}, {"f"}); - SetOp(&prog, "relu", "relu1", {"f"}, {"g"}); + SetOp(&prog, "gelu", "gelu1", {"f"}, {"g"}); SetOp(&prog, "pool2d", "pool1", {"g"}, {"h"}); SetOp(&prog, "conv2d", "conv2", {"h"}, {"k"}); SetOp(&prog, "pool2d", "pool2", {"k"}, {"l"}); @@ -126,7 +126,7 @@ void DefaultAttrTest(unsigned expected_bfloat16_data_type_count) { } TEST(Bfloat16PlacementPass, enable_all) { - MainTest({"conv2d", "pool2d", "relu", "concat", "sum"}, 8); + MainTest({"conv2d", "pool2d", "gelu", "concat", "sum"}, 8); } TEST(Bfloat16PlacementPass, enabled_conv_and_pool) { @@ -134,7 +134,7 @@ TEST(Bfloat16PlacementPass, enabled_conv_and_pool) { MainTest({"conv2d", "pool2d"}, 3); } -TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(6); } +TEST(Bfloat16PlacementPass, default_attr_value) { DefaultAttrTest(7); } } // namespace ir } // namespace framework diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index e94590e847cd5..3566b856912da 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -79,6 +79,10 @@ void IRPassManager::CreatePasses(Argument *argument, } else if (pass_name == "cpu_quantize_pass") { pass->Set("quant_var_scales", new VarQuantScale(argument->quant_var_scales())); + } else if (pass_name == "cpu_bfloat16_placement_pass") { + pass->Set("bfloat16_enabled_op_types", + new std::unordered_set( + argument->bfloat16_enabled_op_types())); #endif } else if (pass_name == "tensorrt_subgraph_pass") { pass->Set("workspace_size", new int(argument->tensorrt_workspace_size())); diff --git a/paddle/fluid/operators/gelu_op.cc b/paddle/fluid/operators/gelu_op.cc index c72cabad89180..9ca0d30362c5a 100644 --- a/paddle/fluid/operators/gelu_op.cc +++ b/paddle/fluid/operators/gelu_op.cc @@ -111,6 +111,11 @@ class GeluOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "int8", "bfloat16"}); AddAttr("use_cudnn", "(bool, default false) Only used in cudnn kernel, need " "install cudnn") diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index aecf67fc3bb1d..22954203d6b41 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -83,14 +83,14 @@ void eltwise_forward(const framework::ExecutionContext &ctx, const auto *x = ctx.Input("X"); auto *y = ctx.Output("Out"); - T alpha = ctx.HasAttr("alpha") ? 
ctx.Attr("alpha") : 0; - T beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; // paddle uses beta but mkldnn uses alpha for swish if (algorithm == mkldnn::algorithm::eltwise_swish) { std::swap(alpha, beta); } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); + alpha = ctx.Attr("threshold"); } PADDLE_ENFORCE( @@ -128,14 +128,14 @@ void eltwise_grad(const framework::ExecutionContext &ctx, const auto *diff_y = ctx.Input(framework::GradVarName("Out")); auto *diff_x = ctx.Output(framework::GradVarName("X")); - T alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; - T beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; + float alpha = ctx.HasAttr("alpha") ? ctx.Attr("alpha") : 0; + float beta = ctx.HasAttr("beta") ? ctx.Attr("beta") : 0; // paddle uses beta but mkldnn uses alpha for swish if (algorithm == mkldnn::algorithm::eltwise_swish) { std::swap(alpha, beta); } else if (algorithm == dnnl::algorithm::eltwise_bounded_relu) { - alpha = ctx.Attr("threshold"); + alpha = ctx.Attr("threshold"); } auto diff_dst_tz = framework::vectorize(diff_y->dims()); @@ -272,11 +272,20 @@ namespace ops = paddle::operators; act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \ ops::MKLDNNActivationGradKernel>); +#define REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(act_type, functor, \ + grad_functor) \ + REGISTER_OP_KERNEL( \ + act_type, MKLDNN, ::paddle::platform::CPUPlace, \ + ops::MKLDNNActivationKernel>, \ + ops::MKLDNNActivationKernel>); \ + REGISTER_OP_KERNEL( \ + act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace, \ + ops::MKLDNNActivationGradKernel>); + #define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro) \ __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor); \ __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor); \ - __macro(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor); \ __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor); \ __macro(sigmoid, SigmoidMKLDNNFunctor, SigmoidMKLDNNGradFunctor); \ __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradFunctor); \ @@ -284,3 +293,5 @@ namespace ops = paddle::operators; __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor); FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL); +REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, + GeluMKLDNNGradFunctor); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 0b159f9dcfaaf..9d9e1e2d8ded5 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -181,6 +181,7 @@ class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; REGISTER_OP_KERNEL(softmax, MKLDNN, ::paddle::platform::CPUPlace, - ops::SoftmaxMKLDNNKernel); + ops::SoftmaxMKLDNNKernel, + ops::SoftmaxMKLDNNKernel); REGISTER_OP_KERNEL(softmax_grad, MKLDNN, ::paddle::platform::CPUPlace, ops::SoftmaxMKLDNNGradKernel); diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index cf46b4fc3bdad..63a27a8ccbfca 100644 --- a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -115,6 +115,11 @@ class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr( + 
"mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "bfloat16"}); AddAttr("is_test", "(bool, default false) Set to true for inference only, false " "for training. Some layers may run faster when this is true.") diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py index d904bdbfa96ae..63db1b1475d40 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_activation_mkldnn_op.py @@ -18,7 +18,7 @@ import numpy as np from scipy.special import expit import paddle.fluid.core as core -from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16 from paddle.fluid.tests.unittests.test_activation_op import TestActivation, TestRelu, TestTanh, TestSqrt, TestAbs, TestLeakyRelu, TestSwish, TestRelu6, TestSigmoid from paddle.fluid.tests.unittests.test_gelu_op import gelu from mkldnn_op_test import check_if_mkldnn_primitives_exist_in_bwd @@ -79,6 +79,44 @@ def setUp(self): self.attrs = {"use_mkldnn": True, "approximate": True} +class TestMKLDNNGeluBf16Dim2(TestActivation): + def setUp(self): + self.op_type = "gelu" + self.dtype = np.uint16 + + x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) + out = convert_float_to_uint16(gelu(x, False)) + + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': out} + self.attrs = {"use_mkldnn": True} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + pass + + +class TestMKLDNNGeluBf16Dim2Approx(TestActivation): + def setUp(self): + self.op_type = "gelu" + self.dtype = np.uint16 + + x = np.random.uniform(-1, 1, [11, 17]).astype(np.float32) + out = convert_float_to_uint16(gelu(x, True)) + + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': out} + self.attrs = {"use_mkldnn": True, "approximate": True} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + pass + + class TestMKLDNNTanhDim2(TestTanh): def setUp(self): super(TestMKLDNNTanhDim2, self).setUp() @@ -187,6 +225,44 @@ def setUp(self): self.attrs = {"use_mkldnn": True, "approximate": True} +class TestMKLDNNGeluBf16Dim4(TestActivation): + def setUp(self): + self.op_type = "gelu" + self.dtype = np.uint16 + + x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype(np.float32) + out = convert_float_to_uint16(gelu(x, False)) + + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': out} + self.attrs = {"use_mkldnn": True} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + pass + + +class TestMKLDNNGeluBf16Dim4Approx(TestActivation): + def setUp(self): + self.op_type = "gelu" + self.dtype = np.uint16 + + x = np.random.uniform(-1, 1, [2, 4, 3, 5]).astype(np.float32) + out = convert_float_to_uint16(gelu(x, True)) + + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': out} + self.attrs = {"use_mkldnn": True, "approximate": True} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + pass + + class TestMKLDNNTanhDim4(TestTanh): def setUp(self): super(TestMKLDNNTanhDim4, self).setUp() diff --git 
a/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py new file mode 100644 index 0000000000000..5ba944c3b98f4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_softmax_bf16_mkldnn_op.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import convert_float_to_uint16 +import paddle.fluid.core as core +from paddle.fluid.tests.unittests.test_softmax_op import TestSoftmaxOp, TestSoftmaxOp2, TestSoftmaxOp3, TestSoftmaxOp4, TestSoftmaxOp5, TestSoftmaxOp6 +from paddle import enable_static + + +def stable_softmax(x): + """Compute the softmax of vector x in a numerically stable way.""" + shiftx = x - np.max(x).clip(-64.) + exps = np.exp(shiftx) + return exps / np.sum(exps) + + +class TestSoftmaxMKLDNNOp(TestSoftmaxOp): + def get_x_shape(self): + return [10, 10] + + def get_axis(self): + return -1 + + def setUp(self): + self.op_type = "softmax" + self.use_mkldnn = True + self.dtype = np.uint16 + self.init_kernel_type() + self.shape = self.get_x_shape() + self.axis = self.get_axis() + + x = np.random.uniform(0.1, 1, self.shape).astype(np.float) + out = convert_float_to_uint16( + np.apply_along_axis(stable_softmax, self.axis, x)) + + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': out} + self.attrs = {'axis': self.axis, 'use_mkldnn': self.use_mkldnn} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace()) + + def test_check_grad(self): + pass + + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestSoftmaxMKLDNNOp2(TestSoftmaxOp2): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestSoftmaxMKLDNNOp3(TestSoftmaxOp3): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestSoftmaxMKLDNNOp4(TestSoftmaxOp4): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestSoftmaxMKLDNNOp5(TestSoftmaxOp5): + def init_kernel_type(self): + self.use_mkldnn = True + + +class TestSoftmaxMKLDNNOp6(TestSoftmaxOp6): + def init_kernel_type(self): + self.use_mkldnn = True + + +if __name__ == '__main__': + enable_static() + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 77e7372290d9c..6a2a121cd616f 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -601,6 +601,7 @@ 'test_quantize_mkldnn_op', 'test_requantize_mkldnn_op', 'test_softmax_mkldnn_op', + 'test_softmax_bf16_mkldnn_op', 'test_sum_mkldnn_op', 'test_sum_bf16_mkldnn_op', 'test_transpose_int8_mkldnn_op', From ced5c40c41fbe2c7cb6ccd1b6052a9dbbe4d81b8 Mon Sep 17 00:00:00 2001 From: Wilber Date: Thu, 5 Nov 2020 20:23:44 -0600 Subject: [PATCH 133/185] Update memory release interface. 
(#28456) --- paddle/fluid/memory/allocation/allocator.h | 6 +++-- .../memory/allocation/allocator_facade.cc | 4 +-- .../memory/allocation/allocator_facade.h | 2 +- .../auto_growth_best_fit_allocator.cc | 5 +++- .../auto_growth_best_fit_allocator.h | 6 +++-- .../allocation/naive_best_fit_allocator.cc | 26 +++++++++---------- .../allocation/naive_best_fit_allocator.h | 2 +- .../fluid/memory/allocation/retry_allocator.h | 4 +-- .../allocation/thread_local_allocator.cc | 4 ++- .../allocation/thread_local_allocator.h | 4 +-- paddle/fluid/memory/detail/buddy_allocator.cc | 3 ++- paddle/fluid/memory/detail/buddy_allocator.h | 2 +- paddle/fluid/memory/malloc.cc | 2 +- paddle/fluid/memory/malloc.h | 2 +- 14 files changed, 41 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/memory/allocation/allocator.h b/paddle/fluid/memory/allocation/allocator.h index b83d3efb72b71..b11c657b96b74 100644 --- a/paddle/fluid/memory/allocation/allocator.h +++ b/paddle/fluid/memory/allocation/allocator.h @@ -178,7 +178,9 @@ class Allocator { FreeImpl(allocation); } - inline void Release(const platform::Place& place) { ReleaseImpl(place); } + inline uint64_t Release(const platform::Place& place) { + return ReleaseImpl(place); + } // True if the `Allocate` is thread safe. virtual bool IsAllocThreadSafe() const; @@ -186,7 +188,7 @@ class Allocator { protected: virtual Allocation* AllocateImpl(size_t size) = 0; virtual void FreeImpl(Allocation* allocation); - virtual void ReleaseImpl(const platform::Place& place) {} + virtual uint64_t ReleaseImpl(const platform::Place& place) { return 0; } }; using AllocationDeleter = Allocator::AllocationDeleter; diff --git a/paddle/fluid/memory/allocation/allocator_facade.cc b/paddle/fluid/memory/allocation/allocator_facade.cc index 59b06d082872c..4515dba4363ba 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.cc +++ b/paddle/fluid/memory/allocation/allocator_facade.cc @@ -287,8 +287,8 @@ AllocationPtr AllocatorFacade::Alloc(const platform::Place& place, return m_->GetAllocator(place, size)->Allocate(size); } -void AllocatorFacade::Release(const platform::Place& place) { - m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) +uint64_t AllocatorFacade::Release(const platform::Place& place) { + return m_->GetAllocator(place, /* A non-zero num to choose allocator_ */ 1) ->Release(place); } diff --git a/paddle/fluid/memory/allocation/allocator_facade.h b/paddle/fluid/memory/allocation/allocator_facade.h index 2f2f222f6c74a..fa906fbf5ce8f 100644 --- a/paddle/fluid/memory/allocation/allocator_facade.h +++ b/paddle/fluid/memory/allocation/allocator_facade.h @@ -45,7 +45,7 @@ class AllocatorFacade { AllocationPtr Alloc(const platform::Place& place, size_t size); // Release unused memory pool. - void Release(const platform::Place& place); + uint64_t Release(const platform::Place& place); // TODO(yy): Allocate a Copy-On-Write allocation? 
 private:
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
index 8cd7335cf4fa6..7e6cce59eeb01 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.cc
@@ -138,18 +138,21 @@ void AutoGrowthBestFitAllocator::FreeImpl(Allocation *allocation) {
   }
 }
 
-void AutoGrowthBestFitAllocator::FreeIdleChunks() {
+uint64_t AutoGrowthBestFitAllocator::FreeIdleChunks() {
+  uint64_t bytes = 0;
   for (auto chunk_it = chunks_.begin(); chunk_it != chunks_.end();) {
     auto &blocks = chunk_it->blocks_;
     if (blocks.size() == 1 && blocks.begin()->is_free_) {
       auto &block = *blocks.begin();
       VLOG(2) << "Free chunk with size " << block.size_;
+      bytes += block.size_;
       free_blocks_.erase(std::make_pair(block.size_, block.ptr_));
       chunk_it = chunks_.erase(chunk_it);
     } else {
       ++chunk_it;
     }
   }
+  return bytes;
 }
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
index b55ebf18934f2..eb52cab2594df 100644
--- a/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/auto_growth_best_fit_allocator.h
@@ -40,10 +40,12 @@ class AutoGrowthBestFitAllocator : public Allocator {
   void FreeImpl(Allocation *allocation) override;
 
   // Release the memory block which is not used in pool.
-  void ReleaseImpl(const platform::Place &place) override { FreeIdleChunks(); }
+  uint64_t ReleaseImpl(const platform::Place &place) override {
+    return FreeIdleChunks();
+  }
 
  private:
-  void FreeIdleChunks();
+  uint64_t FreeIdleChunks();
 
   template <typename T>
   using List = std::list<T>;
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
index 842ebd16cf8af..fcde4cbab4268 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
@@ -54,7 +54,7 @@ template <typename Place>
 void Free(const Place &place, void *p, size_t size);
 
 template <typename Place>
-void Release(const Place &place);
+uint64_t Release(const Place &place);
 
 template <typename Place>
 size_t Used(const Place &place);
@@ -103,8 +103,8 @@ void Free<platform::CPUPlace>(const platform::CPUPlace &place, void *p,
 }
 
 template <>
-void Release<platform::CPUPlace>(const platform::CPUPlace &place) {
-  GetCPUBuddyAllocator()->Release();
+uint64_t Release<platform::CPUPlace>(const platform::CPUPlace &place) {
+  return GetCPUBuddyAllocator()->Release();
 }
 
 template <>
@@ -195,7 +195,7 @@ void Free<platform::XPUPlace>(const platform::XPUPlace &place, void *p,
 }
 
 template <>
-void Release<platform::XPUPlace>(const platform::XPUPlace &place) {
+uint64_t Release<platform::XPUPlace>(const platform::XPUPlace &place) {
 #ifdef PADDLE_WITH_XPU
   PADDLE_THROW(
       platform::errors::PermissionDenied("Release XPU pool is not supported."));
@@ -333,9 +333,9 @@ void Free<platform::CUDAPlace>(const platform::CUDAPlace &place, void *p,
 }
 
 template <>
-void Release<platform::CUDAPlace>(const platform::CUDAPlace &place) {
+uint64_t Release<platform::CUDAPlace>(const platform::CUDAPlace &place) {
 #ifdef PADDLE_WITH_CUDA
-  GetGPUBuddyAllocator(place.device)->Release();
+  return GetGPUBuddyAllocator(place.device)->Release();
 #else
   PADDLE_THROW(platform::errors::PermissionDenied(
       "'CUDAPlace' is not supported in CPU only device."));
@@ -401,10 +401,10 @@ void Free<platform::CUDAPinnedPlace>(const platform::CUDAPinnedPlace &place,
 }
 
 template <>
-void Release<platform::CUDAPinnedPlace>(
+uint64_t Release<platform::CUDAPinnedPlace>(
     const platform::CUDAPinnedPlace &place) {
 #ifdef PADDLE_WITH_CUDA
-  GetCUDAPinnedBuddyAllocator()->Release();
+  return
      GetCUDAPinnedBuddyAllocator()->Release();
#else
  PADDLE_THROW(platform::errors::PermissionDenied(
      "'CUDAPinnedPlace' is not supported in CPU only device."));
@@ -437,10 +437,10 @@ struct FreeVisitor : public boost::static_visitor<void> {
   size_t size_;
 };
 
-struct ReleaseVisitor : public boost::static_visitor<void> {
+struct ReleaseVisitor : public boost::static_visitor<uint64_t> {
   template <typename Place>
-  inline void operator()(const Place &place) const {
-    Release<Place>(place);
+  inline uint64_t operator()(const Place &place) const {
+    return Release<Place>(place);
   }
 };
 
@@ -486,8 +486,8 @@ void NaiveBestFitAllocator::FreeImpl(Allocation *allocation) {
   delete allocation;
 }
 
-void NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) {
-  boost::apply_visitor(legacy::ReleaseVisitor(), place);
+uint64_t NaiveBestFitAllocator::ReleaseImpl(const platform::Place &place) {
+  return boost::apply_visitor(legacy::ReleaseVisitor(), place);
 }
 
 }  // namespace allocation
diff --git a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
index ba4c4ca226b1e..b7d211482152f 100644
--- a/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
+++ b/paddle/fluid/memory/allocation/naive_best_fit_allocator.h
@@ -35,7 +35,7 @@ class NaiveBestFitAllocator : public Allocator {
 protected:
  Allocation *AllocateImpl(size_t size) override;
  void FreeImpl(Allocation *allocation) override;
-  void ReleaseImpl(const platform::Place &place) override;
+  uint64_t ReleaseImpl(const platform::Place &place) override;
 
 private:
  platform::Place place_;
diff --git a/paddle/fluid/memory/allocation/retry_allocator.h b/paddle/fluid/memory/allocation/retry_allocator.h
index 74828a0ede3f4..031a5e2b97f17 100644
--- a/paddle/fluid/memory/allocation/retry_allocator.h
+++ b/paddle/fluid/memory/allocation/retry_allocator.h
@@ -47,8 +47,8 @@ class RetryAllocator : public Allocator {
 protected:
  void FreeImpl(Allocation* allocation) override;
  Allocation* AllocateImpl(size_t size) override;
-  void ReleaseImpl(const platform::Place& place) override {
-    underlying_allocator_->Release(place);
+  uint64_t ReleaseImpl(const platform::Place& place) override {
+    return underlying_allocator_->Release(place);
   }
 
 private:
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.cc b/paddle/fluid/memory/allocation/thread_local_allocator.cc
index d2a8250d3db58..98af151007594 100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator.cc
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.cc
@@ -72,7 +72,9 @@ void ThreadLocalAllocatorImpl::FreeImpl(ThreadLocalAllocation* allocation) {
   delete allocation;
 }
 
-void ThreadLocalAllocatorImpl::ReleaseImpl() { buddy_allocator_->Release(); }
+uint64_t ThreadLocalAllocatorImpl::ReleaseImpl() {
+  return buddy_allocator_->Release();
+}
 
 }  // namespace allocation
 }  // namespace memory
diff --git a/paddle/fluid/memory/allocation/thread_local_allocator.h b/paddle/fluid/memory/allocation/thread_local_allocator.h
index 764509e75ba23..654fb3fe7bc04 100644
--- a/paddle/fluid/memory/allocation/thread_local_allocator.h
+++ b/paddle/fluid/memory/allocation/thread_local_allocator.h
@@ -52,7 +52,7 @@ class ThreadLocalAllocatorImpl
   explicit ThreadLocalAllocatorImpl(const platform::Place& p);
   ThreadLocalAllocation* AllocateImpl(size_t size);
   void FreeImpl(ThreadLocalAllocation* allocation);
-  void ReleaseImpl();
+  uint64_t ReleaseImpl();
 
  private:
   std::unique_ptr<memory::detail::BuddyAllocator> buddy_allocator_;
@@ -92,7 +92,7 @@ class ThreadLocalCUDAAllocator : public Allocator {
     auto allocator_impl =
tl_allocation->GetAllocator(); allocator_impl->FreeImpl(tl_allocation); } - void ReleaseImpl(const platform::Place& p) override { + uint64_t ReleaseImpl(const platform::Place& p) override { return ThreadLocalCUDAAllocatorPool::Instance().Get(gpu_id_)->ReleaseImpl(); } diff --git a/paddle/fluid/memory/detail/buddy_allocator.cc b/paddle/fluid/memory/detail/buddy_allocator.cc index e7738d0714751..5b521e89680e4 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.cc +++ b/paddle/fluid/memory/detail/buddy_allocator.cc @@ -162,7 +162,7 @@ void BuddyAllocator::Free(void* p) { IndexSizeAddress(desc->get_index(), desc->get_total_size(), block)); } -void BuddyAllocator::Release() { +uint64_t BuddyAllocator::Release() { std::lock_guard lock(mutex_); int num = 0; uint64_t bytes = 0; @@ -193,6 +193,7 @@ void BuddyAllocator::Release() { } } VLOG(10) << "Release " << num << " chunk, Free " << bytes << " bytes."; + return bytes; } size_t BuddyAllocator::Used() { return total_used_; } diff --git a/paddle/fluid/memory/detail/buddy_allocator.h b/paddle/fluid/memory/detail/buddy_allocator.h index 0bfc8918503b9..de77108f3404a 100644 --- a/paddle/fluid/memory/detail/buddy_allocator.h +++ b/paddle/fluid/memory/detail/buddy_allocator.h @@ -41,7 +41,7 @@ class BuddyAllocator { void* Alloc(size_t unaligned_size); void Free(void* ptr); // Release the unused memory pool, a real free operation for the OS. - void Release(); + uint64_t Release(); size_t Used(); size_t GetMinChunkSize(); size_t GetMaxChunkSize(); diff --git a/paddle/fluid/memory/malloc.cc b/paddle/fluid/memory/malloc.cc index 2fbde03b42bcc..8e0a5c6c06dc2 100644 --- a/paddle/fluid/memory/malloc.cc +++ b/paddle/fluid/memory/malloc.cc @@ -31,7 +31,7 @@ AllocationPtr Alloc(const platform::Place &place, size_t size) { return allocation::AllocatorFacade::Instance().Alloc(place, size); } -void Release(const platform::Place &place) { +uint64_t Release(const platform::Place &place) { return allocation::AllocatorFacade::Instance().Release(place); } diff --git a/paddle/fluid/memory/malloc.h b/paddle/fluid/memory/malloc.h index 3d6836e1d255b..3b8d07548ee0c 100644 --- a/paddle/fluid/memory/malloc.h +++ b/paddle/fluid/memory/malloc.h @@ -38,7 +38,7 @@ extern AllocationPtr Alloc(const platform::Place& place, size_t size); extern AllocationPtr Alloc(const platform::DeviceContext& dev_ctx, size_t size); -extern void Release(const platform::Place& place); +extern uint64_t Release(const platform::Place& place); } // namespace memory } // namespace paddle From 6bba8e57b1be28cd0b88714e18d680c07caeb806 Mon Sep 17 00:00:00 2001 From: QingshuChen Date: Fri, 6 Nov 2020 11:14:23 +0800 Subject: [PATCH 134/185] fix batch_norm_xpu bug & remove xpusimulator dependence (#28430) *test=kunlun --- cmake/external/xpu.cmake | 12 ++++-------- paddle/fluid/operators/batch_norm_op_xpu.cc | 9 +++++---- python/setup.py.in | 4 +--- 3 files changed, 10 insertions(+), 15 deletions(-) diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 07fe7d245ef57..eb00b822209c5 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -4,30 +4,26 @@ endif() INCLUDE(ExternalProject) SET(XPU_PROJECT "extern_xpu") -SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu.tar.gz" CACHE STRING "" FORCE) +SET(XPU_URL "https://baidu-kunlun-public.su.bcebos.com/paddle_depence/xpu_2020_09_22_api_2020_11_05.tar.gz" CACHE STRING "" FORCE) SET(XPU_SOURCE_DIR "${THIRD_PARTY_PATH}/xpu") SET(XPU_DOWNLOAD_DIR "${XPU_SOURCE_DIR}/src/${XPU_PROJECT}") SET(XPU_INSTALL_DIR 
"${THIRD_PARTY_PATH}/install/xpu") -SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/api/include") -SET(XPU_RUNTIME_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/runtime/include") +SET(XPU_API_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include") SET(XPU_LIB_DIR "${THIRD_PARTY_PATH}/install/xpu/lib") SET(XPU_API_LIB_NAME "libxpuapi.so") SET(XPU_RT_LIB_NAME "libxpurt.so") -SET(XPU_SIM_LIB_NAME "libxpusim.so") SET(XPU_API_LIB "${XPU_LIB_DIR}/${XPU_API_LIB_NAME}") SET(XPU_RT_LIB "${XPU_LIB_DIR}/${XPU_RT_LIB_NAME}") -SET(XPU_SIM_LIB "${XPU_LIB_DIR}/${XPU_SIM_LIB_NAME}") SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${XPU_INSTALL_DIR}/lib") INCLUDE_DIRECTORIES(${XPU_API_INC_DIR}) -INCLUDE_DIRECTORIES(${XPU_RUNTIME_INC_DIR}) FILE(WRITE ${XPU_DOWNLOAD_DIR}/CMakeLists.txt "PROJECT(XPU)\n" "cmake_minimum_required(VERSION 3.0)\n" - "install(DIRECTORY xpu/api xpu/runtime xpu/lib \n" + "install(DIRECTORY xpu/include xpu/lib \n" " DESTINATION ${XPU_INSTALL_DIR})\n") ExternalProject_Add( @@ -50,5 +46,5 @@ set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}") # for cc_library(xxx SRCS xxx.c DEPS xpulib) generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake") -TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_SIM_LIB}) +TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB}) ADD_DEPENDENCIES(xpulib ${XPU_PROJECT}) diff --git a/paddle/fluid/operators/batch_norm_op_xpu.cc b/paddle/fluid/operators/batch_norm_op_xpu.cc index 624d5fe65ead7..c9208362bc8d2 100644 --- a/paddle/fluid/operators/batch_norm_op_xpu.cc +++ b/paddle/fluid/operators/batch_norm_op_xpu.cc @@ -15,6 +15,7 @@ limitations under the License. */ #ifdef PADDLE_WITH_XPU #include "paddle/fluid/operators/batch_norm_op.h" +#include "xpu/refactor/nn.h" namespace paddle { namespace operators { @@ -72,10 +73,10 @@ class BatchNormXPUKernel : public framework::OpKernel { auto* variance_out_data = variance_out->data(); auto* saved_mean_data = saved_mean->data(); auto* saved_variance_data = saved_variance->data(); - int r = xpu::batch_norm_train_forward( - dev_ctx.x_context(), epsilon, momentum, N, C, H, W, x_data, y_data, - scale_data, bias_data, mean_out_data, variance_out_data, - saved_mean_data, saved_variance_data); + int r = xpu::batch_norm(dev_ctx.x_context(), x_data, y_data, N, C, H, + W, epsilon, momentum, scale_data, bias_data, + saved_mean_data, saved_variance_data, + mean_out_data, variance_out_data, true); PADDLE_ENFORCE_EQ( r, XPU_SUCCESS, platform::errors::External("XPU API(batch_norm_train_forward) return " diff --git a/python/setup.py.in b/python/setup.py.in index b7a6289d38f17..a4570c9d19563 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -319,10 +319,8 @@ if '${WITH_XPU}' == 'ON': raise Exception("patch ${XPU_API_LIB} failed, command: %s" % command) shutil.copy('${XPU_API_LIB}', libs_path) shutil.copy('${XPU_RT_LIB}', libs_path) - shutil.copy('${XPU_SIM_LIB}', libs_path) package_data['paddle.libs']+=['${XPU_API_LIB_NAME}', - '${XPU_RT_LIB_NAME}', - '${XPU_SIM_LIB_NAME}'] + '${XPU_RT_LIB_NAME}'] # copy libfuild_framework.so to libs if os.name != 'nt' and sys.platform != 'darwin': From fad4744aa49a77dfeb381542649a0ed6782754a5 Mon Sep 17 00:00:00 2001 From: taixiurong Date: Fri, 6 Nov 2020 13:45:35 +0800 Subject: [PATCH 135/185] fix crash in adam in xpu, *test=kunlun (#28433) --- .../fluid/operators/optimizers/adam_op_xpu.cc | 59 +++++++++++++------ 1 file changed, 41 insertions(+), 18 deletions(-) diff --git a/paddle/fluid/operators/optimizers/adam_op_xpu.cc 
b/paddle/fluid/operators/optimizers/adam_op_xpu.cc index 05b4544c02a12..2abc690fc51b2 100644 --- a/paddle/fluid/operators/optimizers/adam_op_xpu.cc +++ b/paddle/fluid/operators/optimizers/adam_op_xpu.cc @@ -74,7 +74,7 @@ class AdamOpXPUKernel : public framework::OpKernel { "output size is 1, but received " "value is:%d.", beta2_pow_out->numel())); - + T beta1 = static_cast(ctx.Attr("beta1")); if (ctx.HasInput("Beta1Tensor")) { auto* beta1_tensor = ctx.Input("Beta1Tensor"); @@ -88,30 +88,53 @@ class AdamOpXPUKernel : public framework::OpKernel { if (grad_var->IsType()) { auto& grad = GET_DATA_SAFELY(ctx.Input("Grad"), "Input", "Grad", "Adam"); - auto& dev_ctx = ctx.template device_context(); + const T* beta1_pow_ptr = beta1_pow.template data(); + const T* beta2_pow_ptr = beta2_pow.template data(); + Tensor xpu_beta1_pow; + Tensor xpu_beta2_pow; + if (beta1_pow.place() == platform::CPUPlace() && + beta2_pow.place() == platform::CPUPlace()) { + TensorCopy(beta1_pow, ctx.GetPlace(), dev_ctx, &xpu_beta1_pow); + TensorCopy(beta2_pow, ctx.GetPlace(), dev_ctx, &xpu_beta2_pow); + dev_ctx.Wait(); + beta1_pow_ptr = xpu_beta1_pow.template data(); + beta2_pow_ptr = xpu_beta2_pow.template data(); + } int r = xpu::adam( dev_ctx.x_context(), grad.template data(), mom1.template data(), - mom2.template data(), param.template data(), - beta1_pow.template data(), beta2_pow.template data(), beta1, - beta2, epsilon, lr.template data(), + mom2.template data(), param.template data(), beta1_pow_ptr, + beta2_pow_ptr, beta1, beta2, epsilon, lr.template data(), mom1_out.template mutable_data(ctx.GetPlace()), mom2_out.template mutable_data(ctx.GetPlace()), param_out.template mutable_data(ctx.GetPlace()), param.numel()); - const float* ptr0 = beta1_pow.template data(); - float* ptr1 = beta1_pow_out->mutable_data(ctx.GetPlace()); - float cpudata; - xpu_memcpy(&cpudata, ptr0, sizeof(float), XPU_DEVICE_TO_HOST); - cpudata = cpudata * beta1; - xpu_memcpy(ptr1, &cpudata, sizeof(float), XPU_HOST_TO_DEVICE); - - const float* ptr2 = beta2_pow.template data(); - float* ptr3 = beta2_pow_out->mutable_data(ctx.GetPlace()); - float cpudata1; - xpu_memcpy(&cpudata1, ptr2, sizeof(float), XPU_DEVICE_TO_HOST); - cpudata1 = cpudata1 * beta2; - xpu_memcpy(ptr3, &cpudata1, sizeof(float), XPU_HOST_TO_DEVICE); + //update in cpu and then copy to xpu + if (beta1_pow.place() == platform::CPUPlace() && + beta2_pow.place() == platform::CPUPlace()) { + const T* beta1_pow_p = beta1_pow.template data(); + beta1_pow_out->mutable_data(platform::CPUPlace())[0] = + beta1 * beta1_pow_p[0]; + const T* beta2_pow_p = beta2_pow.template data(); + beta2_pow_out->mutable_data(platform::CPUPlace())[0] = + beta2 * beta2_pow_p[0]; + } else { + T cpu_beta1_pow_out_data; + T cpu_beta2_pow_out_data; + xpu_memcpy(&cpu_beta1_pow_out_data, beta1_pow_ptr, sizeof(T), + XPU_DEVICE_TO_HOST); + cpu_beta1_pow_out_data = cpu_beta1_pow_out_data * beta1; + xpu_memcpy(&cpu_beta2_pow_out_data, beta2_pow_ptr, sizeof(T), + XPU_DEVICE_TO_HOST); + cpu_beta2_pow_out_data = cpu_beta2_pow_out_data * beta2; + + T* beta1_pow_out_p = beta1_pow_out->mutable_data(ctx.GetPlace()); + T* beta2_pow_out_p = beta2_pow_out->mutable_data(ctx.GetPlace()); + xpu_memcpy(beta1_pow_out_p, &cpu_beta1_pow_out_data, sizeof(T), + XPU_HOST_TO_DEVICE); + xpu_memcpy(beta2_pow_out_p, &cpu_beta2_pow_out_data, sizeof(T), + XPU_HOST_TO_DEVICE); + } PADDLE_ENFORCE_EQ(r == xpu::Error_t::SUCCESS, true, platform::errors::External( From 7fe5f9ccad399fb9548eab816f9c0eb486706e1b Mon Sep 17 00:00:00 2001 From: Zhang Ting 
Date: Fri, 6 Nov 2020 19:14:15 +0800 Subject: [PATCH 136/185] Fix unittest random failure (#28363) * fix random failure * use two input(x) * fix model --- .../unittests/test_fuse_bn_add_act_pass.py | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py index 316c40971aaac..f4cb53b31c574 100644 --- a/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py +++ b/python/paddle/fluid/tests/unittests/test_fuse_bn_add_act_pass.py @@ -122,24 +122,23 @@ def build_origin_program(self, param_attr=self.conv_param_attr1, bias_attr=False, data_format='NHWC') + bn1 = fluid.layers.batch_norm( + input=conv1_1, + param_attr=self.bn_param_attr1, + bias_attr=self.bn_bias_attr1, + act=None, + data_layout='NHWC') conv1_2 = fluid.layers.conv2d( - input=x, - filter_size=3, + input=conv1_1, + filter_size=1, num_filters=32, stride=1, - padding=1, act=None, param_attr=self.conv_param_attr2, bias_attr=False, data_format='NHWC') - bn1 = fluid.layers.batch_norm( - input=conv1_1, - param_attr=self.bn_param_attr1, - bias_attr=self.bn_bias_attr1, - act=None, - data_layout='NHWC') bn2 = fluid.layers.batch_norm( - input=conv1_2, + input=conv1_1, param_attr=self.bn_param_attr2, bias_attr=self.bn_bias_attr2, act=None, @@ -157,7 +156,7 @@ def build_origin_program(self, sgd, use_dynamic_loss_scaling=True, init_loss_scaling=128.0) sgd.minimize(loss) - return x, y, loss + return loss def check(self, place, use_cuda): paddle.seed(1) @@ -168,24 +167,27 @@ def check(self, place, use_cuda): # build_fused_program: turn on fuse_bn_add_act_ops main_program = fluid.Program() startup_program = fluid.Program() - x, y, loss = self.build_origin_program(main_program, startup_program, - use_cuda) - feeder = fluid.DataFeeder(feed_list=[x, y], place=place) + loss = self.build_origin_program(main_program, startup_program, + use_cuda) build_strategy_fused = fluid.BuildStrategy() build_strategy_fused.fuse_bn_add_act_ops = True binary_fused = fluid.CompiledProgram(main_program).with_data_parallel( loss_name=loss.name, build_strategy=build_strategy_fused) - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=batch_size) exe = fluid.Executor(place) loss_vals_fused = [] + x_data = [] + y_data = [] scope = fluid.Scope() with fluid.scope_guard(scope): exe.run(startup_program) for _ in range(iters): - data = next(train_reader()) + x = np.random.random((batch_size, 1, 28, 28)).astype("float32") + y = np.random.random((batch_size, 1)).astype("int64") + x_data.append(x) + y_data.append(y) loss_v = exe.run(binary_fused, - feed=feeder.feed(data), + feed={"x": x, + "y": y}, fetch_list=[loss]) loss_vals_fused.append(loss_v[0][0]) @@ -193,17 +195,15 @@ def check(self, place, use_cuda): build_strategy = fluid.BuildStrategy() build_strategy.fuse_bn_add_act_ops = False binary = fluid.CompiledProgram(main_program).with_data_parallel( - loss_name=loss.name, build_strategy=build_strategy) - train_reader = paddle.batch( - paddle.dataset.mnist.train(), batch_size=batch_size) + loss_name=loss.name, build_strategy=build_strategy_fused) loss_vals = [] scope = fluid.Scope() with fluid.scope_guard(scope): exe.run(startup_program) - for _ in range(iters): - data = next(train_reader()) + for i in range(iters): loss_v = exe.run(binary, - feed=feeder.feed(data), + feed={"x": x_data[i], + "y": y_data[i]}, fetch_list=[loss]) loss_vals.append(loss_v[0][0]) @@ -222,16 +222,16 @@ def 
test_fuse_bn_add_act_API(self): place = fluid.CUDAPlace(0) x, y, loss = self.build_fused_program( main_program, startup_program, use_cuda=True) - feeder = fluid.DataFeeder(feed_list=[x, y], place=place) - train_reader = paddle.batch(paddle.dataset.mnist.train(), batch_size=16) exe = fluid.Executor(place) scope = fluid.Scope() with fluid.scope_guard(scope): exe.run(startup_program) for _ in range(5): - data = next(train_reader()) + x = np.random.random((4, 1, 28, 28)).astype("float32") + y = np.random.random((4, 1)).astype("int64") loss_v = exe.run(main_program, - feed=feeder.feed(data), + feed={"x": x, + "y": y}, fetch_list=[loss]) From b5e662f8372069ebf4d6ce9671de407fa84f331e Mon Sep 17 00:00:00 2001 From: WeiXin Date: Fri, 6 Nov 2020 19:41:28 +0800 Subject: [PATCH 137/185] refine jit.save/load to add support for other method, not only forward (#28376) * refine jit.save/load to add support for other method, not only forward * refine the code based on unit tests * Add unit test for the code * Add unit test for the code * Modify the code according to the unit test * Delete useless comments, save only one info file, etc. * remove static_mode_white_list.pyc * edit the code that generate 'extra_var_info' --- python/paddle/fluid/dygraph/io.py | 31 ++- python/paddle/fluid/dygraph/jit.py | 200 ++++++++++-------- .../dygraph_to_static/test_declarative.py | 1 + .../tests/unittests/test_jit_save_load.py | 49 +++++ 4 files changed, 187 insertions(+), 94 deletions(-) diff --git a/python/paddle/fluid/dygraph/io.py b/python/paddle/fluid/dygraph/io.py index a10adeb14aa7d..c84e855d17290 100644 --- a/python/paddle/fluid/dygraph/io.py +++ b/python/paddle/fluid/dygraph/io.py @@ -500,8 +500,21 @@ def _construct_program_holders(model_path, model_filename=None): # [compatible] if assign model_filename, only can load one program as Layer.forward model_filename = os.path.basename(model_filename) model_file_path = os.path.join(model_path, model_filename) - program_holder_dict['forward'] = _ProgramHolder( - _load_program_desc(model_file_path)) + model_name = model_filename[:-len(INFER_MODEL_SUFFIX)] + #Load every file that meets the requirements in the directory model_path. + for filename in os.listdir(model_path): + if model_filename == filename: + func_name = 'forward' + model_file_path = os.path.join(model_path, model_filename) + elif filename.endswith(INFER_MODEL_SUFFIX) and filename.startswith( + model_name): + func_name = filename[len(model_name) + 1:-len( + INFER_MODEL_SUFFIX)] + model_file_path = os.path.join(model_path, filename) + else: + continue + program_holder_dict[func_name] = _ProgramHolder( + _load_program_desc(model_file_path)) else: for _, _, file_names in os.walk(model_path): for name in file_names: @@ -524,9 +537,23 @@ def _construct_params_and_buffers(model_path, append_suffix=True): var_info_filename = str(params_filename) + ".info" var_info_path = os.path.join(model_path, var_info_filename) + if os.path.exists(var_info_path): var_dict = _load_persistable_vars(model_path, var_info_path, programs['forward'], params_filename) + model_name = params_filename[:-len(INFER_PARAMS_SUFFIX)] + #Load every file that meets the requirements in the directory model_path. 
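+        # Illustrative example of the naming this loop expects (assumed
+        # values, mirroring what jit.save emits below): with prefix "net"
+        # and a method "infer", the files are net.infer.pdmodel and
+        # net.infer.pdiparams, while 'forward' keeps the plain
+        # net.pdmodel / net.pdiparams pair for compatibility.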
+ for file_name in os.listdir(model_path): + if file_name.endswith(INFER_PARAMS_SUFFIX) and file_name.startswith( + model_name) and file_name != params_filename: + func_name = file_name[len(model_name) + 1:-len( + INFER_PARAMS_SUFFIX)] + else: + continue + var_info_path = os.path.join(model_path, var_info_filename) + var_dict.update( + _load_persistable_vars(model_path, var_info_path, programs[ + func_name], file_name)) else: var_dict = _load_persistable_vars_by_program( model_path, programs['forward'], params_filename) diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index 3f9d5fb97973f..d4bfb8b112637 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -594,6 +594,13 @@ def train(layer, loader, loss_fn, opt): # avoid change user given input_spec inner_input_spec = None if input_spec is not None: + for attr_func in dir(layer): + static_func = getattr(layer, attr_func, None) + if isinstance(static_func, + StaticFunction) and 'forward' != attr_func: + raise ValueError( + "If there are static functions other than 'forward' that need to be saved, the input 'input_spec' should be None, but received the type of 'input_spec' is %s." + % type(input_spec)) if not isinstance(input_spec, list): raise TypeError( "The input input_spec should be 'list', but received input_spec's type is %s." @@ -612,102 +619,111 @@ def train(layer, loader, loss_fn, opt): # parse configs configs = _parse_save_configs(configs) - - # 2. get program from Layer - # TODO(chenweihang): add support for other method, not only forward - if isinstance(layer.forward, StaticFunction): - concrete_program = layer.forward.concrete_program - else: - # transform in jit.save, if input_spec is incomplete, declarative will throw error - static_forward = declarative(layer.forward, input_spec=inner_input_spec) - concrete_program = static_forward.concrete_program - # the input_spec has been used in declarative, which is equal to - # @declarative with input_spec and jit.save without input_spec, - # avoid needless warning - inner_input_spec = None - - # 3. build input & output of save_infernece_model - # NOTE(chenweihang): [ Get input variables name ] - # There are two cases, whether to prune the inputs or not - # - not prune inputs (recommend): - # - the len(input_spec) == len((concrete_program.inputs) - 1 - # - here can use concrete_program.inputs directly - # - prune inputs: - # - the input_spec length < len((concrete_program.inputs) - 1 - # - the input_spec's name should be in concrete_program.inputs - input_var_names = _get_input_var_names(concrete_program.inputs, - inner_input_spec) - - # NOTE(chenweihang): [ Get output variables ] - # the rule is like [ Get input variables name ]. For output var, - # we only support VarBase spec, and actually, we only need the - # var name of output, and we don't recommended to use output_spec - output_vars = _get_output_vars(concrete_program.outputs, - configs.output_spec) - - # NOTE(chenweihang): we maintain the mapping of variable name to - # structured name, the buffer variable (non-persistable) - # saved to inference program may not need by dygraph Layer, - # we only record the state_dict variable's structured name - state_names_dict = dict() - for structured_name, var in six.iteritems(layer.state_dict()): - state_names_dict[var.name] = structured_name - - # 4. 
share parameters from Layer to scope & record var info scope = core.Scope() extra_var_info = dict() - for param_or_buffer in concrete_program.parameters: - # share to scope - param_or_buffer_tensor = scope.var(param_or_buffer.name).get_tensor() - src_tensor = param_or_buffer.value().get_tensor() - param_or_buffer_tensor._share_data_with(src_tensor) - # record var info - extra_info_dict = dict() - if param_or_buffer.name in state_names_dict: - extra_info_dict['structured_name'] = state_names_dict[ - param_or_buffer.name] - extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient - if isinstance(param_or_buffer, ParamBase): - extra_info_dict['trainable'] = param_or_buffer.trainable - extra_var_info[param_or_buffer.name] = extra_info_dict - - # 5. save inference model - from paddle.fluid.io import save_inference_model - - # construct new save_inference_model arguments - model_path = dirname - # NOTE(chenweihang): because prefix contains model and params filename, - # so we don't support set model_filename & params_filename - model_filename = file_prefix + INFER_MODEL_SUFFIX - params_filename = file_prefix + INFER_PARAMS_SUFFIX + for attr_func in dir(layer): + static_func = getattr(layer, attr_func, None) + if isinstance(static_func, StaticFunction): + concrete_program = static_func.concrete_program + elif 'forward' == attr_func: + # transform in jit.save, if input_spec is incomplete, declarative will throw error + static_forward = declarative( + layer.forward, input_spec=inner_input_spec) + concrete_program = static_forward.concrete_program + # the input_spec has been used in declarative, which is equal to + # @declarative with input_spec and jit.save without input_spec, + # avoid needless warning + inner_input_spec = None + else: + continue + + # 3. build input & output of save_infernece_model + # NOTE(chenweihang): [ Get input variables name ] + # There are two cases, whether to prune the inputs or not + # - not prune inputs (recommend): + # - the len(input_spec) == len((concrete_program.inputs) - 1 + # - here can use concrete_program.inputs directly + # - prune inputs: + # - the input_spec length < len((concrete_program.inputs) - 1 + # - the input_spec's name should be in concrete_program.inputs + input_var_names = _get_input_var_names(concrete_program.inputs, + inner_input_spec) + + # NOTE(chenweihang): [ Get output variables ] + # the rule is like [ Get input variables name ]. For output var, + # we only support VarBase spec, and actually, we only need the + # var name of output, and we don't recommended to use output_spec + output_vars = _get_output_vars(concrete_program.outputs, + configs.output_spec) + + # NOTE(chenweihang): we maintain the mapping of variable name to + # structured name, the buffer variable (non-persistable) + # saved to inference program may not need by dygraph Layer, + # we only record the state_dict variable's structured name + state_names_dict = dict() + for structured_name, var in six.iteritems(layer.state_dict()): + state_names_dict[var.name] = structured_name + + # 4. 
share parameters from Layer to scope & record var info + for param_or_buffer in concrete_program.parameters: + # share to scope + param_or_buffer_tensor = scope.var(param_or_buffer.name).get_tensor( + ) + src_tensor = param_or_buffer.value().get_tensor() + param_or_buffer_tensor._share_data_with(src_tensor) + # record var info + if param_or_buffer.name not in extra_var_info: + extra_info_dict = dict() + if param_or_buffer.name in state_names_dict: + extra_info_dict['structured_name'] = state_names_dict[ + param_or_buffer.name] + extra_info_dict['stop_gradient'] = param_or_buffer.stop_gradient + if isinstance(param_or_buffer, ParamBase): + extra_info_dict['trainable'] = param_or_buffer.trainable + extra_var_info[param_or_buffer.name] = extra_info_dict + + # 5. save inference model + from paddle.fluid.io import save_inference_model + # construct new save_inference_model arguments + model_path = dirname + # NOTE(chenweihang): because prefix contains model and params filename, + # so we don't support set model_filename & params_filename + if 'forward' == attr_func: + model_filename = file_prefix + INFER_MODEL_SUFFIX + params_filename = file_prefix + INFER_PARAMS_SUFFIX + else: + model_filename = file_prefix + '.' + attr_func + INFER_MODEL_SUFFIX + params_filename = file_prefix + '.' + attr_func + INFER_PARAMS_SUFFIX + + with scope_guard(scope): + save_inference_model( + dirname=model_path, + feeded_var_names=input_var_names, + target_vars=output_vars, + executor=Executor(_current_expected_place()), + main_program=concrete_program.main_program.clone(), + model_filename=model_filename, + params_filename=params_filename, + export_for_deployment=configs._export_for_deployment, + program_only=configs._program_only) + + # NOTE(chenweihang): [ Save extra variable info ] + # save_inference_model will lose some important variable information, including: + # - Variable name and correspondence (when saved variables as one file) + # - Variable.stop_gradient information + # - Which persistent variable are parameter and which are not + # - Parameter.trainable information + # + # The lost information cannot be recovered when it is loaded again, + # so if we want to perform fine-tune after loading, we may need to + # configure redundant information to proceed. + # + # Due to compatibility issues, we cannot change the original storage structure, + # but we can save these information in `jit.save` without changing the original + # storage to improve user experience. So we save extra information into + # file `***.pdiparams.info` with scope_guard(scope): - save_inference_model( - dirname=model_path, - feeded_var_names=input_var_names, - target_vars=output_vars, - executor=Executor(_current_expected_place()), - main_program=concrete_program.main_program.clone(), - model_filename=model_filename, - params_filename=params_filename, - export_for_deployment=configs._export_for_deployment, - program_only=configs._program_only) - - # NOTE(chenweihang): [ Save extra variable info ] - # save_inference_model will lose some important variable information, including: - # - Variable name and correspondence (when saved variables as one file) - # - Variable.stop_gradient information - # - Which persistent variable are parameter and which are not - # - Parameter.trainable information - # - # The lost information cannot be recovered when it is loaded again, - # so if we want to perform fine-tune after loading, we may need to - # configure redundant information to proceed. 
- # - # Due to compatibility issues, we cannot change the original storage structure, - # but we can save these information in `jit.save` without changing the original - # storage to improve user experience. So we save extra information into - # file `***.pdiparams.info` extra_var_info_path = path + INFER_PARAMS_INFO_SUFFIX with open(extra_var_info_path, 'wb') as f: pickle.dump(extra_var_info, f, protocol=2) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py index 095eda2a5cba7..a5c49e4d7d931 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_declarative.py @@ -115,6 +115,7 @@ def test_with_input_spec(self): self.assertTrue(len(net.forward.program_cache) == 1) # 2. test save load + net.inner_function(x) jit.save(net, './simple_net') infer_net = fluid.dygraph.jit.load('./simple_net') pred = infer_net(x) diff --git a/python/paddle/fluid/tests/unittests/test_jit_save_load.py b/python/paddle/fluid/tests/unittests/test_jit_save_load.py index b954f5c829aa6..5973199125716 100644 --- a/python/paddle/fluid/tests/unittests/test_jit_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_jit_save_load.py @@ -187,6 +187,26 @@ def forward(self, x, y): return x + y +class LinearNetWithMultiStaticFunc(fluid.dygraph.Layer): + def __init__(self, in_size, out_size): + super(LinearNetWithMultiStaticFunc, self).__init__() + self._linear_0 = Linear(in_size, out_size) + self._linear_1 = Linear(in_size, out_size) + self._scale = paddle.to_tensor(9.9) + + @paddle.jit.to_static + def forward(self, x): + return self._linear_0(x) + + @paddle.jit.to_static + def forward_no_param(self, x): + return x + + @paddle.jit.to_static + def forward_general(self, x): + return self._linear_0(x) + self._linear_1(x) * self._scale + + def train(layer, input_size=784, label_size=1): # create optimizer sgd = fluid.optimizer.SGDOptimizer( @@ -764,5 +784,34 @@ def test_save_load_no_param_layer(self): self.assertTrue(np.array_equal(out, load_out)) +class TestJitSaveLoadMultiMethods(unittest.TestCase): + def setUp(self): + # enable dygraph mode + paddle.disable_static() + + def test_jit_save_load_inference(self): + model_path_inference = "jit_save_load_multi_methods/model" + IMAGE_SIZE = 224 + layer = LinearNetWithMultiStaticFunc(IMAGE_SIZE, 10) + inps = paddle.randn([1, IMAGE_SIZE]) + result_origin = {} + for func in dir(layer): + if func.startswith('forward'): + result_origin[func] = getattr(layer, func, None)(inps) + paddle.jit.save(layer, model_path_inference) + load_net = paddle.jit.load(model_path_inference) + for func, result in result_origin.items(): + self.assertTrue( + float((result - getattr(load_net, func, None)(inps)).abs().max( + )) < 1e-5) + + def test_jit_save_load_multi_methods_inputspec(self): + model_path = 'jit_save_load_multi_methods/model' + layer = LinearNetWithMultiStaticFunc(784, 1) + with self.assertRaises(ValueError): + paddle.jit.save( + layer, model_path, input_spec=[InputSpec(shape=[None, 784])]) + + if __name__ == '__main__': unittest.main() From 155b4f9b6c82c8f0516e7c89fae9eb925be9bc1e Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Fri, 6 Nov 2020 19:59:30 +0800 Subject: [PATCH 138/185] Remove selected rows all reduce over height check (#28460) * remove slelected rows all reduce over height check * polish unittest --- paddle/fluid/imperative/all_reduce.cc | 16 ++---- 
.../fluid/tests/unittests/CMakeLists.txt | 3 ++ ...el_dygraph_sparse_embedding_over_height.py | 52 +++++++++++++++++++ ...el_dygraph_sparse_embedding_over_height.py | 52 +++++++++++++++++++ 4 files changed, 111 insertions(+), 12 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_over_height.py create mode 100644 python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 42922aa6f3a75..0a601417de147 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -53,7 +53,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, static void AllReduce(const framework::SelectedRows &src, framework::SelectedRows *dst, const ParallelStrategy &strategy, cudaStream_t stream) { - VLOG(0) << "SelectedRows AllReduce start"; + VLOG(3) << "SelectedRows AllReduce start"; const auto &src_tensor = src.value(); const auto &place = src_tensor.place(); PADDLE_ENFORCE_EQ( @@ -87,18 +87,10 @@ static void AllReduce(const framework::SelectedRows &src, static_cast(0)); dst->set_height(src.height()); - VLOG(0) << "Gather rows: " << string::join_strings(rows_num_vector, ',') + VLOG(3) << "Gather rows: " << string::join_strings(rows_num_vector, ',') << ", total rows number: " << rows_num << ", height: " << src.height(); - PADDLE_ENFORCE_LE( - rows_num, src.height(), - platform::errors::Unimplemented( - "The gathered SelectedRows's rows number should less than or equal " - "to the SelectedRows's height, but the actual rows number is %d, the " - "SelectedRows's height is %d.", - rows_num, src.height())); - auto *dst_rows = dst->mutable_rows(); dst_rows->resize(rows_num); auto *dst_rows_ptr = dst_rows->CUDAMutableData(place); @@ -130,9 +122,9 @@ static void AllReduce(const framework::SelectedRows &src, } } - VLOG(0) << "Original SelectedRows rows: " + VLOG(3) << "Original SelectedRows rows: " << string::join_strings(src_rows, ','); - VLOG(0) << "Result SelectedRows rows: " + VLOG(3) << "Result SelectedRows rows: " << string::join_strings(*dst_rows, ','); } #endif diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 52950a4d92a71..de6912e76ddaf 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -12,6 +12,7 @@ string(REPLACE ".py" "" DIST_TEST_OPS "${DIST_TEST_OPS}") list(APPEND DIST_TEST_OPS test_parallel_dygraph_mnist) list(APPEND DIST_TEST_OPS test_parallel_dygraph_se_resnext) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) +list(APPEND DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) list(APPEND DIST_TEST_OPS test_parallel_dygraph_transformer) list(APPEND DIST_TEST_OPS test_listen_and_serv_op) list(APPEND DIST_TEST_OPS test_fleet_graph_execution_meta_optimizer) @@ -127,6 +128,7 @@ if (NOT ${WITH_GPU}) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_mnist) # TODO(Yancey1989): parallel dygraph support CPU device in future list(REMOVE_ITEM TEST_OPS test_parallel_dygraph_se_resnext) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding) + LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_transformer) LIST(REMOVE_ITEM TEST_OPS test_parallel_dygraph_sync_batch_norm) LIST(REMOVE_ITEM TEST_OPS test_imperative_auto_mixed_precision) 
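# Note on the hunk below (illustrative reading, not part of the diff): NCCL
# encodes version 2.2.12 as the integer 2212, so the new over-height test is
# skipped, like the existing sparse-embedding test, on NCCL older than 2.2.12.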
@@ -139,6 +141,7 @@ endif() if (WITH_NCCL) if (${NCCL_VERSION} VERSION_LESS 2212) LIST(REMOVE_ITEM DIST_TEST_OPS test_parallel_dygraph_sparse_embedding) + LIST(REMOVE_ITEM DIST_TEST_OPS test_parallel_dygraph_sparse_embedding_over_height) LIST(REMOVE_ITEM DIST_TEST_OPS test_parallel_dygraph_transformer) endif() endif() diff --git a/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_over_height.py b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_over_height.py new file mode 100644 index 0000000000000..61749a24c9821 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_dygraph_sparse_embedding_over_height.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import paddle +import paddle.fluid as fluid +from parallel_dygraph_sparse_embedding import SimpleNet, fake_sample_reader, TestSparseEmbedding + +from test_dist_base import runtime_main, TestParallelDyGraphRunnerBase + +# global configs +# using small `vocab_size` to test rows number over height +batch_size = 4 +batch_num = 200 +hidden_size = 10 +vocab_size = 10 +num_steps = 3 +init_scale = 0.1 + + +class TestSparseEmbeddingOverHeight(TestSparseEmbedding): + def get_model(self): + model = SimpleNet( + hidden_size=hidden_size, + vocab_size=vocab_size, + num_steps=num_steps, + init_scale=init_scale, + is_sparse=True) + + train_reader = paddle.batch( + fake_sample_reader(), batch_size=batch_size, drop_last=True) + + optimizer = fluid.optimizer.SGD(learning_rate=0.001, + parameter_list=model.parameters()) + + return model, train_reader, optimizer + + +if __name__ == "__main__": + runtime_main(TestSparseEmbeddingOverHeight) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py new file mode 100644 index 0000000000000..9aca448f16121 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_dygraph_sparse_embedding_over_height.py @@ -0,0 +1,52 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import os +import sys +import unittest + +import paddle.fluid as fluid +from test_dist_base import TestDistBase +from spawn_runner_base import TestDistSpawnRunner +from parallel_dygraph_sparse_embedding_over_height import TestSparseEmbeddingOverHeight + +flag_name = os.path.splitext(__file__)[0] + + +class TestParallelDygraphSparseEmdeddingOverHeight(TestDistBase): + def _setup_config(self): + self._sync_mode = False + self._nccl2_mode = True + self._dygraph = True + + def test_sparse_embedding(self): + if fluid.core.is_compiled_with_cuda(): + self.check_with_place( + "parallel_dygraph_sparse_embedding_over_height.py", + delta=1e-5, + check_error_log=True, + log_name=flag_name) + + +class TestParallelDygraphSparseEmdeddingOverHeightSpawn(TestDistSpawnRunner): + def test_sparse_embedding_with_spawn(self): + if fluid.core.is_compiled_with_cuda() and sys.version_info >= (3, 4): + self.check_dist_result_with_spawn( + test_class=TestSparseEmbeddingOverHeight, delta=1e-5) + + +if __name__ == "__main__": + unittest.main() From ba0756325a8a64eedc5586cace20d9e2768d1f06 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Sun, 8 Nov 2020 20:46:22 +0800 Subject: [PATCH 139/185] exec ut no more than 15s 1 (#28439) * disable ut test_parallel_executor_fetch_isolated_var,test=document_fix * test for limiting ut exec time as 15S * fix an error caused by cannot find ut * fix some error * can not find test_transformer * fix error caused by ut not run in windows * fix error caused by Compiler Options * fix error caused by setting timeout value as 15 in python/paddle/tests/CMakeLists.txt * setting timeout value to 120s for old ut * add the timeout value setting * fix error caused by ut only run in coverage_ci * add analyzer_transformer_profile_tester * fix some error * fix some error * fix error with inference option * fix error with inference option setting as ON_INFER * add some ut to set timeout * modified some option * fix error * fix some timeout error * fix error * fix error * fix timeout for test_analyzer_bfloat16_resnet50 * fix error * setting timeout properity for some ut * first pr for new ut timeout as 15S --- paddle/fluid/framework/CMakeLists.txt | 3 + .../framework/ir/fusion_group/CMakeLists.txt | 3 + paddle/fluid/inference/api/CMakeLists.txt | 5 + .../fluid/inference/tests/api/CMakeLists.txt | 35 ++++ .../operators/distributed/CMakeLists.txt | 4 + paddle/fluid/operators/jit/CMakeLists.txt | 3 + paddle/fluid/operators/math/CMakeLists.txt | 3 + .../fluid/contrib/slim/tests/CMakeLists.txt | 22 +++ .../paddle/fluid/contrib/tests/CMakeLists.txt | 2 + python/paddle/fluid/tests/CMakeLists.txt | 2 + python/paddle/fluid/tests/book/CMakeLists.txt | 6 + .../fluid/tests/unittests/CMakeLists.txt | 165 +++++++++++++++++- .../dygraph_to_static/CMakeLists.txt | 12 ++ .../unittests/ir/inference/CMakeLists.txt | 3 + .../tests/unittests/mkldnn/CMakeLists.txt | 1 + .../fluid/tests/unittests/rnn/CMakeLists.txt | 2 + .../tests/unittests/sequence/CMakeLists.txt | 3 + python/paddle/reader/tests/CMakeLists.txt | 1 + python/paddle/tests/CMakeLists.txt | 13 +- 19 files changed, 285 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b40cbdcc1b1bf..6b724b656ddad 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -327,3 +327,6 @@ if(APPLE) ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dylib CACHE INTERNAL "Fluid framework lib") 
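 # (Illustrative note, not part of the patch) TIMEOUT is CTest's per-test
 # wall-clock limit:
 #   set_tests_properties(<target> PROPERTIES TIMEOUT <seconds>)
 # a test still running after <seconds> is killed and reported as a timeout
 # failure; the hunks below grant known-slow tests 120s while the patch
 # tightens the general budget toward 15s.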
endif() +if(WITH_TESTING) +set_tests_properties(selected_rows_test PROPERTIES TIMEOUT 120) +endif() diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt index fe2bd27524fbf..f32a5eafefcef 100644 --- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt +++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt @@ -9,3 +9,6 @@ cc_library(fusion_group_pass SRCS fusion_group_pass.cc elementwise_group_detector.cc DEPS subgraph_detector fuse_pass_base code_generator device_code) cc_test(test_fusion_group_pass SRCS fusion_group_pass_tester.cc DEPS fusion_group_pass graph_viz_pass) +if(NOT ON_INFER) +set_tests_properties(test_code_generator PROPERTIES TIMEOUT 120) +endif() diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index c0d3b14e0e43e..5a1b6678074a2 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -68,3 +68,8 @@ elseif (WIN32) cc_test(test_analysis_predictor SRCS analysis_predictor_tester.cc DEPS analysis_predictor benchmark ${inference_deps} ARGS --dirname=${WORD2VEC_MODEL_DIR}) endif() +if(WITH_TESTING) + if(NOT APPLE) + set_tests_properties(test_api_impl PROPERTIES TIMEOUT 120) + endif() +endif() diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index 17d2c0c0eef8b..bfc2984dc65c6 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -581,3 +581,38 @@ if(WITH_GPU) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${RESNET50_MODEL_DIR}) endif() + +if(WITH_GPU AND TENSORRT_FOUND) + set_tests_properties(trt_resnext_test PROPERTIES TIMEOUT 120) + set_tests_properties(trt_quant_int8_yolov3_r50_test PROPERTIES TIMEOUT 120) + set_tests_properties(trt_resnet50_test PROPERTIES TIMEOUT 120) + set_tests_properties(trt_cascade_rcnn_test PROPERTIES TIMEOUT 120) + set_tests_properties(test_trt_dynamic_shape_ernie_ser_deser PROPERTIES TIMEOUT 120) + set_tests_properties(test_trt_dynamic_shape_ernie PROPERTIES TIMEOUT 120) +endif() + +if(WITH_MKLDNN) + set_tests_properties(test_analyzer_int8_resnet50 PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_mobilenet_ssd PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_quant_performance_benchmark PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_mobilenetv2 PROPERTIES TIMEOUT 120) + set_tests_properties(test_analyzer_int8_mobilenetv1 PROPERTIES TIMEOUT 120) +endif() + +set_tests_properties(lite_resnet50_test PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_mobilenet_transpose PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_resnet50 PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_ner PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_ernie PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_googlenet PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_small_dam PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_transformer PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_mobilenet_depthwise_conv PROPERTIES TIMEOUT 120) +if(WITH_GPU AND TENSORRT_FOUND) +set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120) +set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120) +endif() +if(ON_INFER OR WITH_GPU) 
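+# (assumed rationale) profiler instrumentation slows the transformer run,
+# so this tester keeps an explicit 120s budget rather than the 15s default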
+set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120) +endif() diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 47fbb42fd6a81..a4c9caf6f69ac 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -68,3 +68,7 @@ if(WITH_GPU) DEPS sendrecvop_rpc executor ${RPC_DEPS} selected_rows_functor scope math_function) endif() +if(WITH_TESTING) +set_tests_properties(rpc_server_test PROPERTIES TIMEOUT 120) +set_tests_properties(heart_beat_monitor_test PROPERTIES TIMEOUT 120) +endif() diff --git a/paddle/fluid/operators/jit/CMakeLists.txt b/paddle/fluid/operators/jit/CMakeLists.txt index 1c56efeab416e..95361b17aae6b 100644 --- a/paddle/fluid/operators/jit/CMakeLists.txt +++ b/paddle/fluid/operators/jit/CMakeLists.txt @@ -26,3 +26,6 @@ cc_test(jit_kernel_test SRCS test.cc DEPS jit_kernel_helper) if(NOT WIN32) cc_binary(jit_kernel_benchmark SRCS benchmark.cc DEPS jit_kernel_helper device_tracer tensor) endif() +if(WITH_TESTING) +set_tests_properties(jit_kernel_test PROPERTIES TIMEOUT 120) +endif() diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 24ed4fcf66849..384393d9601e3 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -91,3 +91,6 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) +if(WITH_TESTING) +set_tests_properties(im2col_test PROPERTIES TIMEOUT 120) +endif() diff --git a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt index 6c02076eae0de..0f05d941a9189 100644 --- a/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/slim/tests/CMakeLists.txt @@ -277,4 +277,26 @@ endforeach() if(NOT WIN32) set_tests_properties(test_post_training_quantization_mobilenetv1 PROPERTIES TIMEOUT 250 LABELS "RUN_TYPE=NIGHTLY") set_tests_properties(test_post_training_quantization_resnet50 PROPERTIES TIMEOUT 200 LABELS "RUN_TYPE=NIGHTLY") + set_tests_properties(test_post_training_quantization_mnist PROPERTIES TIMEOUT 120) + set_tests_properties(test_weight_quantization_mobilenetv1 PROPERTIES TIMEOUT 120) +endif() + +set_tests_properties(test_graph PROPERTIES TIMEOUT 120) +set_tests_properties(test_quantization_pass PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_qat_channelwise PROPERTIES TIMEOUT 120) +set_tests_properties(test_user_defined_quantization PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_qat PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_out_scale PROPERTIES TIMEOUT 120) +if(LINUX AND WITH_MKLDNN) + set_tests_properties(test_quant2_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) + set_tests_properties(convert_model2dot_ernie PROPERTIES TIMEOUT 120) + set_tests_properties(test_quant2_int8_resnet50_channelwise_mkldnn PROPERTIES TIMEOUT 120) + set_tests_properties(test_quant_int8_mobilenetv2_mkldnn PROPERTIES TIMEOUT 120) + set_tests_properties(test_quant2_int8_resnet50_range_mkldnn PROPERTIES TIMEOUT 120) + set_tests_properties(save_quant2_model_resnet50 PROPERTIES TIMEOUT 120) + set_tests_properties(test_quant_int8_resnet50_mkldnn PROPERTIES TIMEOUT 120) + set_tests_properties(test_quant_int8_mobilenetv1_mkldnn PROPERTIES TIMEOUT 120) + set_tests_properties(test_quant2_int8_ernie_mkldnn PROPERTIES 
TIMEOUT 120) + set_tests_properties(test_quant_int8_googlenet_mkldnn PROPERTIES TIMEOUT 120) + set_tests_properties(test_quant2_int8_resnet50_mkldnn PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/contrib/tests/CMakeLists.txt b/python/paddle/fluid/contrib/tests/CMakeLists.txt index 7431b11817894..ab84257205460 100644 --- a/python/paddle/fluid/contrib/tests/CMakeLists.txt +++ b/python/paddle/fluid/contrib/tests/CMakeLists.txt @@ -4,3 +4,5 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() +set_tests_properties(test_image_classification_fp16 PROPERTIES TIMEOUT 120) +set_tests_properties(test_weight_decay_extend PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index 17bc861070fd9..bee49945f0074 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -1,6 +1,7 @@ file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") + foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() @@ -11,3 +12,4 @@ add_subdirectory(book) if(NOT APPLE AND NOT WIN32) add_subdirectory(custom_op) endif() +set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt index 96321aae566d1..e78ba297bf125 100644 --- a/python/paddle/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/CMakeLists.txt @@ -6,3 +6,9 @@ foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) set_tests_properties(${src} PROPERTIES FIXTURES_SETUP ${src}_infer_model) endforeach() +set_tests_properties(test_word2vec PROPERTIES TIMEOUT 120) +set_tests_properties(test_recognize_digits PROPERTIES TIMEOUT 120) +set_tests_properties(test_image_classification PROPERTIES TIMEOUT 120) +set_tests_properties(test_label_semantic_roles PROPERTIES TIMEOUT 120) +set_tests_properties(test_machine_translation PROPERTIES TIMEOUT 120) +set_tests_properties(test_rnn_encoder_decoder PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index de6912e76ddaf..a344e04ed4d0c 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -273,7 +273,7 @@ function(parallel_bash_test_modules TARGET_NAME) cmake_parse_arguments(parallel_bash_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) - set(timeout 120) + set(timeout 15) if(${parallel_bash_test_modules_TIMEOUT}) set(timeout ${parallel_bash_test_modules_TIMEOUT}) endif() @@ -612,4 +612,167 @@ if(NOT WIN32 AND NOT APPLE) set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) + set_tests_properties(test_parallel_dygraph_sync_batch_norm PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_fleet_launch PROPERTIES TIMEOUT 120) + set_tests_properties(test_c_comm_init_op PROPERTIES TIMEOUT 120) + set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120) + set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_broadcast PROPERTIES TIMEOUT 
120) + set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120) + set_tests_properties(test_fleet_checkpoint PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_reduce_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_reduce PROPERTIES TIMEOUT 120) + set_tests_properties(test_allreduce PROPERTIES TIMEOUT 120) + set_tests_properties(test_allgather PROPERTIES TIMEOUT 120) + set_tests_properties(test_launch PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_scatter_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_barrier_api PROPERTIES TIMEOUT 120) +endif() + +if (NOT WIN32) +set_tests_properties(test_multiprocess_reader_exception PROPERTIES TIMEOUT 120) +set_tests_properties(test_layers PROPERTIES TIMEOUT 120) +set_tests_properties(test_communicator_half_async PROPERTIES TIMEOUT 120) +set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT 120) +set_tests_properties(test_fleet_utils PROPERTIES TIMEOUT 120) + +endif() + +# setting timeout value as 15S +set_tests_properties(test_sync_batch_norm_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_cross_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_lod_tensor_to_selected_rows PROPERTIES TIMEOUT 120) +set_tests_properties(test_lstm_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_star_gan_with_gradient_penalty PROPERTIES TIMEOUT 120) +set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_bicubic_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_deformable_conv_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_nearest_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_profiler PROPERTIES TIMEOUT 120) +set_tests_properties(test_inplace_softmax_with_cross_entropy PROPERTIES TIMEOUT 120) +set_tests_properties(test_cross_entropy2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_sequence_conv PROPERTIES TIMEOUT 120) +set_tests_properties(test_fetch_unmerged PROPERTIES TIMEOUT 120) +set_tests_properties(test_gru_unit_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_activation_nn_grad PROPERTIES TIMEOUT 120) +set_tests_properties(test_empty_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_parallel_executor_transformer PROPERTIES TIMEOUT 120) +set_tests_properties(test_elementwise_div_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_regularizer_api PROPERTIES TIMEOUT 120) +set_tests_properties(test_multiclass_nms_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_ir_memory_optimize_nlp PROPERTIES TIMEOUT 120) +set_tests_properties(test_add_reader_dependency PROPERTIES TIMEOUT 120) +set_tests_properties(test_bilateral_slice_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_buffer_shared_memory_reuse_pass PROPERTIES TIMEOUT 120) +set_tests_properties(test_fuse_relu_depthwise_conv_pass PROPERTIES TIMEOUT 120) +set_tests_properties(test_fleet_util PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_transformer_sorted_gradient PROPERTIES TIMEOUT 120) +set_tests_properties(test_matmul_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_nearest_interp_v2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_trilinear_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_bicubic_interp_v2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_gather_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_static_save_load PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_selected_rows_to_lod_tensor PROPERTIES TIMEOUT 120) 
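# (A hedged aside on the set_tests_properties pattern used throughout this
#  hunk, with a hypothetical target name: several CTest properties can be
#  combined in a single call, as the nightly entries earlier in this series do:
#    set_tests_properties(some_nightly_test PROPERTIES TIMEOUT 250
#                         LABELS "RUN_TYPE=NIGHTLY")
#  TIMEOUT is measured in seconds and is enforced per test by ctest.)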
+set_tests_properties(test_index_select_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_parallel_ssa_graph_inference_feed_partial_data PROPERTIES TIMEOUT 120) +set_tests_properties(test_parallel_executor_crf PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_save_load PROPERTIES TIMEOUT 120) +set_tests_properties(test_concat_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_partial_eager_deletion_transformer PROPERTIES TIMEOUT 120) +set_tests_properties(test_parallel_executor_seresnext_with_reduce_gpu PROPERTIES TIMEOUT 120) +set_tests_properties(test_dropout_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_parallel_executor_profiler PROPERTIES TIMEOUT 120) +set_tests_properties(test_argsort_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_sequence_pool PROPERTIES TIMEOUT 120) +set_tests_properties(test_gather_nd_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_nn_grad PROPERTIES TIMEOUT 120) +set_tests_properties(test_elementwise_sub_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_row_conv_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_parallel_executor_seresnext_with_fuse_all_reduce_gpu PROPERTIES TIMEOUT 120) +set_tests_properties(test_elementwise_min_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_nan_inf PROPERTIES TIMEOUT 120) +set_tests_properties(test_deformable_conv_v1_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_parallel_executor_transformer_auto_growth PROPERTIES TIMEOUT 120) +set_tests_properties(test_py_reader_using_executor PROPERTIES TIMEOUT 120) +set_tests_properties(test_elementwise_add_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_weight_decay PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_ptb_rnn_sorted_gradient PROPERTIES TIMEOUT 120) +set_tests_properties(test_crop_tensor_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_eager_deletion_lstm_net PROPERTIES TIMEOUT 120) +set_tests_properties(test_parallel_executor_mnist PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_ptb_rnn PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_save_load_v2 PROPERTIES TIMEOUT 120) +set_tests_properties(test_conv3d_transpose_part2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_prroi_pool_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_multiprocess_dataloader_iterable_dataset_static PROPERTIES TIMEOUT 120) +set_tests_properties(test_lstm_cudnn_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_stack_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_bilinear_interp_v2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_sequence_concat PROPERTIES TIMEOUT 120) +set_tests_properties(test_deformable_psroi_pooling PROPERTIES TIMEOUT 120) +set_tests_properties(test_trilinear_interp_v2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_static_runner_mnist PROPERTIES TIMEOUT 120) +set_tests_properties(test_masked_select_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_sigmoid_cross_entropy_with_logits_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_optimizer_v2 PROPERTIES TIMEOUT 120) +set_tests_properties(test_partial_sum_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_cond PROPERTIES TIMEOUT 120) +set_tests_properties(test_space_to_depth_op PROPERTIES TIMEOUT 200) +set_tests_properties(test_dyn_rnn PROPERTIES TIMEOUT 120) +set_tests_properties(test_sgd_op PROPERTIES TIMEOUT 250) +set_tests_properties(test_parallel_executor_seresnext_base_gpu PROPERTIES TIMEOUT 120) +set_tests_properties(test_norm_nn_grad PROPERTIES TIMEOUT 
120) +set_tests_properties(test_matrix_nms_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_generator_dataloader PROPERTIES TIMEOUT 120) +set_tests_properties(test_partial_concat_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_fuse_optimizer_pass PROPERTIES TIMEOUT 120) +set_tests_properties(test_softmax_with_cross_entropy_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_reduce_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_adam_optimizer_fp32_fp64 PROPERTIES TIMEOUT 120) +set_tests_properties(test_elementwise_nn_grad PROPERTIES TIMEOUT 120) +set_tests_properties(test_buffer_shared_memory_reuse_pass_and_fuse_optimization_op_pass PROPERTIES TIMEOUT 120) +set_tests_properties(test_conv_nn_grad PROPERTIES TIMEOUT 120) +set_tests_properties(test_conv3d_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_program_prune_backward PROPERTIES TIMEOUT 120) +set_tests_properties(test_group_norm_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_optimizer PROPERTIES TIMEOUT 120) +set_tests_properties(test_pool2d_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_transpose_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_eager_deletion_gru_net PROPERTIES TIMEOUT 120) +set_tests_properties(test_activation_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_normal PROPERTIES TIMEOUT 120) +set_tests_properties(test_lstmp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_bilinear_interp_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_decoupled_py_reader PROPERTIES TIMEOUT 120) +set_tests_properties(test_fuse_bn_act_pass PROPERTIES TIMEOUT 120) +set_tests_properties(test_conv2d_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_elementwise_mul_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_cyclic_cifar_dataset PROPERTIES TIMEOUT 120) +set_tests_properties(test_fuse_all_reduce_pass PROPERTIES TIMEOUT 120) +set_tests_properties(test_dygraph_multi_forward PROPERTIES TIMEOUT 120) +set_tests_properties(test_norm_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_ocr_attention_model PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_mnist PROPERTIES TIMEOUT 120) +set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_gru_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_regularizer PROPERTIES TIMEOUT 120) +set_tests_properties(test_imperative_resnet PROPERTIES TIMEOUT 200) +set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES TIMEOUT 200) +set_tests_properties(test_imperative_se_resnext PROPERTIES TIMEOUT 200) +set_tests_properties(test_matmul_v2_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_slice_op PROPERTIES TIMEOUT 120) +if(WITH_COVERAGE) + set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) +endif() +if(WITH_GPU AND NOT WIN32) + set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT 120) +endif() +if(WITH_GPU) + set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index 629716cc31558..e264d6d2a6f51 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ 
b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -8,3 +8,15 @@ endforeach(TEST_OP) set_tests_properties(test_se_resnet PROPERTIES TIMEOUT 900) set_tests_properties(test_tsm PROPERTIES TIMEOUT 900) set_tests_properties(test_yolov3 PROPERTIES TIMEOUT 900) +set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 120) +set_tests_properties(test_seq2seq PROPERTIES TIMEOUT 120) +set_tests_properties(test_cycle_gan PROPERTIES TIMEOUT 120) +set_tests_properties(test_lac PROPERTIES TIMEOUT 120) +set_tests_properties(test_bert PROPERTIES TIMEOUT 120) +set_tests_properties(test_basic_api_transformation PROPERTIES TIMEOUT 120) +set_tests_properties(test_resnet PROPERTIES TIMEOUT 120) +set_tests_properties(test_reinforcement_learning PROPERTIES TIMEOUT 120) +set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) +set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) +set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) +set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt index e958ba75638fc..b667f522c094b 100644 --- a/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/ir/inference/CMakeLists.txt @@ -28,3 +28,6 @@ endif() foreach(target ${TEST_INFERENCE_IR_PASSES}) py_test_modules(${target} MODULES ${target}) endforeach() +if(WITH_GPU AND TENSORRT_FOUND) +set_tests_properties(test_trt_subgraph_pass PROPERTIES TIMEOUT 120) +endif() diff --git a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt index f71e04c09aa38..51ec17f62d9ac 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt @@ -4,3 +4,4 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) +set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt index f71e04c09aa38..0606594c8c25f 100644 --- a/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/rnn/CMakeLists.txt @@ -4,3 +4,5 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) +set_tests_properties(test_rnn_nets_static PROPERTIES TIMEOUT 120) +set_tests_properties(test_rnn_nets PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt b/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt index f71e04c09aa38..c6ba82f8cbf0f 100644 --- a/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/sequence/CMakeLists.txt @@ -4,3 +4,6 @@ string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) +set_tests_properties(test_sequence_conv PROPERTIES TIMEOUT 120) +set_tests_properties(test_sequence_concat PROPERTIES TIMEOUT 120) +set_tests_properties(test_sequence_pool PROPERTIES TIMEOUT 120) diff --git a/python/paddle/reader/tests/CMakeLists.txt b/python/paddle/reader/tests/CMakeLists.txt index 969718d3b1837..a6ac586b680cf 100644 --- a/python/paddle/reader/tests/CMakeLists.txt +++ 
b/python/paddle/reader/tests/CMakeLists.txt
@@ -1 +1,2 @@
 py_test(decorator_test SRCS decorator_test.py)
+set_tests_properties(decorator_test PROPERTIES TIMEOUT 120)
diff --git a/python/paddle/tests/CMakeLists.txt b/python/paddle/tests/CMakeLists.txt
index 9f64a6d2b7b67..50466be0c1b1f 100644
--- a/python/paddle/tests/CMakeLists.txt
+++ b/python/paddle/tests/CMakeLists.txt
@@ -39,5 +39,14 @@ foreach(src ${DIST_TEST_OPS})
     message(STATUS ${src})
     py_dist_test(${src} SRCS ${src}.py)
 endforeach()
-
-set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 600)
+set_tests_properties(test_dataset_cifar PROPERTIES TIMEOUT 120)
+set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 120)
+set_tests_properties(test_model PROPERTIES TIMEOUT 120)
+set_tests_properties(test_dataset_movielens PROPERTIES TIMEOUT 120)
+set_tests_properties(test_datasets PROPERTIES TIMEOUT 120)
+set_tests_properties(test_dataset_wmt PROPERTIES TIMEOUT 120)
+set_tests_properties(test_vision_models PROPERTIES TIMEOUT 120)
+set_tests_properties(test_dataset_uci_housing PROPERTIES TIMEOUT 120)
+set_tests_properties(test_dataset_imdb PROPERTIES TIMEOUT 120)
+set_tests_properties(test_callbacks PROPERTIES TIMEOUT 120)
+set_tests_properties(test_pretrained_model PROPERTIES TIMEOUT 600)

From 72c78e4dbbdf74fe9995d9b0757ca6e5832c79d7 Mon Sep 17 00:00:00 2001
From: YUNSHEN XIE <1084314248@qq.com>
Date: Mon, 9 Nov 2020 00:58:09 +0800
Subject: [PATCH 140/185] exec ut no more than 15s 2 (#28441)

* exec ut no more than 15s 2

* fix for ut test_inplace_addto_strategy timeout
---
 cmake/generic.cmake                                | 8 ++++----
 python/paddle/fluid/tests/unittests/CMakeLists.txt | 3 ++-
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index a23862653677d..6f4ec24851d3f 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -386,9 +386,9 @@ function(cc_test_run TARGET_NAME)
     set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true)
     # No unit test should exceed 2 minutes.
     if (APPLE OR WIN32)
-      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20)
     else()
-      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120)
+      set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 15)
     endif()
   endif()
 endfunction()
@@ -758,10 +758,10 @@ function(py_test TARGET_NAME)
   endif()

   if (APPLE OR WIN32)
-    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150)
+    set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20)
   else()
     # No unit test should exceed 2 minutes in Linux.
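# (Hedged illustration, not part of this patch: with the blanket defaults cut
#  to 15s/20s here, a genuinely slow test is expected to opt out per target,
#  e.g. set_tests_properties(my_slow_test PROPERTIES TIMEOUT 120) for a
#  hypothetical target my_slow_test; ctest's own --timeout <seconds> flag only
#  applies to tests that set no TIMEOUT property of their own.)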
- set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 120) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 15) endif() endif() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index a344e04ed4d0c..66952537b7b01 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -215,7 +215,7 @@ function(py_test_modules TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 350) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 15) endif() endfunction() @@ -776,3 +776,4 @@ endif() if(WITH_GPU) set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT 120) endif() +set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) From e29ab5eacbdc74f8f47e96d896bc593fb8701d4f Mon Sep 17 00:00:00 2001 From: wanghuancoder Date: Mon, 9 Nov 2020 10:59:15 +0800 Subject: [PATCH 141/185] clear clcache cache file and reopen clcache (#28384) * clear clcache cache file and reopen clcache, test=develop * reopen clcache, test=develop --- paddle/scripts/paddle_build.bat | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 296deed1c8e6e..4684472f3542e 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -133,7 +133,6 @@ set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 :: set maximum cache size to 20G clcache.exe -M 21474836480 - rem ------set cache third_party------ if not exist %cache_dir%\tools ( git clone https://github.com/zhouwei25/tools.git %cache_dir%\tools @@ -248,8 +247,7 @@ echo Build third_party successfully! set build_times=1 :build_paddle echo Build Paddle the %build_times% time: -::msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln -msbuild /m:%PARALLEL_PROJECT_COUNT% /p:Configuration=Release /verbosity:minimal paddle.sln +msbuild /m:%PARALLEL_PROJECT_COUNT% /p:TrackFileAccess=false /p:CLToolExe=clcache.exe /p:CLToolPath=%PYTHON_ROOT%\Scripts /p:Configuration=Release /verbosity:minimal paddle.sln if %ERRORLEVEL% NEQ 0 ( set /a build_times=%build_times%+1 if %build_times% GTR 1 ( From e14ed71cc23a2ef2ee166ddcf61095af5afcc6d8 Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Mon, 9 Nov 2020 11:30:06 +0800 Subject: [PATCH 142/185] refine the performance of gather Op (#28458) --- paddle/fluid/operators/gather.cu.h | 28 ++++++++++++++++------------ paddle/fluid/operators/gather_op.cc | 5 +++++ python/paddle/tensor/manipulation.py | 5 +++-- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index c4bdd9e439c54..16864f28baaf9 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -20,8 +20,8 @@ limitations under the License. 
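  (A hedged reading of the hunk below, not part of the license text: the
  GatherGPUKernel change caches outer_dim_size * out_index_dim_size once per
  iteration and turns each division/modulo pair into one division plus a
  multiply-subtract, e.g.
      int inner = idx / outer_size;
      int rest  = idx - outer_size * inner;   // replaces idx % outer_size
  so every gathered element costs fewer integer divisions on the GPU.)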
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_launch_config.h" #include "paddle/fluid/platform/place.h" - namespace paddle { namespace operators { @@ -165,14 +165,16 @@ __global__ void GatherGPUKernel(const T* input, const U* index, T* out, int out_index_dim_size, int input_index_dim_size, int size) { int idx = blockDim.x * blockIdx.x + threadIdx.x; + int outer_size = outer_dim_size * out_index_dim_size; for (; idx < size; idx += blockDim.x * gridDim.x) { - int inner_dim_index = idx / (outer_dim_size * out_index_dim_size); - int next_idx = idx % (outer_dim_size * out_index_dim_size); - int index_dim_index = next_idx / (outer_dim_size); - int out_dim_index = next_idx % outer_dim_size; + int inner_dim_index = idx / outer_size; + int next_idx = idx - outer_size * inner_dim_index; + int index_dim_index = next_idx / outer_dim_size; + int index_val = index[index_dim_index]; + int out_dim_index = next_idx - outer_dim_size * index_dim_index; int input_index = inner_dim_index * (outer_dim_size * input_index_dim_size) + - index[index_dim_index] * outer_dim_size + out_dim_index; + index_val * outer_dim_size + out_dim_index; out[idx] = input[input_index]; } } @@ -234,10 +236,11 @@ void GatherV2CUDAFunction(const Tensor* input, const Tensor* index, auto* out_data = out->mutable_data(place); int out_size = out->numel(); - int threads = 512; - int grid = (out_size + threads - 1) / threads; + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), out_size); auto stream = ctx.cuda_device_context().stream(); - GatherGPUKernel<<>>( + GatherGPUKernel< + T, U><<>>( input_data, index_data, out_data, outer_dim_size, inner_dim_size, index_size, index_dim_size, out_size); } @@ -280,10 +283,11 @@ void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index, int out_index_dim_size = out_dim[axis_index]; operators::math::set_constant(*dev_ctx, out, 0.0); - int threads = 512; - int grid = (input_size + threads - 1) / threads; + platform::GpuLaunchConfig config = + platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_size); auto stream = ctx.cuda_device_context().stream(); - GatherGradGPUKernel<<>>( + GatherGradGPUKernel< + T, U><<>>( input_data, index_data, out_data, outer_dim_size, inner_dim_size, input_index_dim_size, out_index_dim_size, input_size); } diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index a99879316d684..72b44b22f9c06 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -66,6 +66,11 @@ class GatherOp : public framework::OperatorWithKernel { OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context()); } + framework::OpKernelType GetKernelTypeForVar( + const std::string& var_name, const framework::Tensor& tensor, + const framework::OpKernelType& expected_kernel_type) const override { + return expected_kernel_type; + } }; class GatherGradOp : public framework::OperatorWithKernel { diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 19b88e122e4e2..1d0785f97db0a 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -16,7 +16,7 @@ from ..fluid.layers import core from ..fluid.layer_helper import LayerHelper -from ..fluid.framework import Variable, OpProtoHolder, in_dygraph_mode, convert_np_dtype_to_dtype_ +from ..fluid.framework import 
Variable, OpProtoHolder, in_dygraph_mode, convert_np_dtype_to_dtype_, device_guard from ..fluid.data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype from ..fluid.layers.tensor import fill_constant from ..fluid.layers import utils @@ -794,7 +794,8 @@ def gather(x, index, axis=None, name=None): axis = 0 axis_tensor = axis if not isinstance(axis, Variable): - axis_tensor = fill_constant(shape=[1], dtype='int64', value=axis) + with device_guard("cpu"): + axis_tensor = fill_constant(shape=[1], dtype='int64', value=axis) if in_dygraph_mode(): return core.ops.gather(x, index, axis_tensor) From 18525d36351f871722f48a77d7fd9f8ecb5019f3 Mon Sep 17 00:00:00 2001 From: WangXi Date: Mon, 9 Nov 2020 14:05:51 +0800 Subject: [PATCH 143/185] fix Tanh remainder en doc (#28455) --- python/paddle/nn/layer/activation.py | 4 +--- python/paddle/tensor/math.py | 12 +++++------- 2 files changed, 6 insertions(+), 10 deletions(-) diff --git a/python/paddle/nn/layer/activation.py b/python/paddle/nn/layer/activation.py index b0a1b27855a80..32979bae34d80 100644 --- a/python/paddle/nn/layer/activation.py +++ b/python/paddle/nn/layer/activation.py @@ -252,12 +252,10 @@ class Tanh(layers.Layer): import paddle import numpy as np - paddle.disable_static() - x = paddle.to_tensor(np.array([-0.4, -0.2, 0.1, 0.3])) m = paddle.nn.Tanh() out = m(x) - print(out.numpy()) + print(out) # [-0.37994896 -0.19737532 0.09966799 0.29131261] """ diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index c83e788538e1c..30ce55f009e93 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -387,11 +387,11 @@ def remainder(x, y, name=None): out = x \% y **Note**: - ``paddle.remainder`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . + ``paddle.mod`` supports broadcasting. If you want know more about broadcasting, please refer to :ref:`user_guide_broadcasting` . Args: - x (Tensor): the input tensor, it's data type should be int32, int64. - y (Tensor): the input tensor, it's data type should be int32, int64. + x (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. + y (Tensor): the input tensor, it's data type should be float32, float64, int32, int64. name (str, optional): Name for the operation (optional, default is None). For more information, please refer to :ref:`api_guide_Name`. Returns: @@ -403,12 +403,10 @@ def remainder(x, y, name=None): import paddle - paddle.disable_static() - x = paddle.to_tensor([2, 3, 8, 7]) y = paddle.to_tensor([1, 5, 3, 3]) - z = paddle.remainder(x, y) - print(z.numpy()) # [0, 3, 2, 1] + z = paddle.mod(x, y) + print(z) # [0, 3, 2, 1] """ op_type = 'elementwise_mod' From 4fa1d3920530495da3ac6b1ddd42f2477629ada7 Mon Sep 17 00:00:00 2001 From: smallv0221 <33639025+smallv0221@users.noreply.github.com> Date: Mon, 9 Nov 2020 14:47:05 +0800 Subject: [PATCH 144/185] Fix en doc for rnn.py. test=document_fix (#28470) --- python/paddle/nn/layer/rnn.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/nn/layer/rnn.py b/python/paddle/nn/layer/rnn.py index ee989f27ebf72..75817aa2dc227 100644 --- a/python/paddle/nn/layer/rnn.py +++ b/python/paddle/nn/layer/rnn.py @@ -582,11 +582,11 @@ class GRUCell(RNNCellBase): .. 
math::

-        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t} + b_{hr})
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t-1} + b_{hr})

-        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}x_{t} + b_{hz})
+        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}x_{t-1} + b_{hz})

-        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}x_{t} + b_{hc}))
+        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}x_{t-1} + b_{hc}))

         h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}

@@ -1413,11 +1413,11 @@ class GRU(RNNBase):

     .. math::

-        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t} + b_{hr})
+        r_{t} & = \sigma(W_{ir}x_{t} + b_{ir} + W_{hr}x_{t-1} + b_{hr})

-        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}x_{t} + b_{hz})
+        z_{t} & = \sigma(W_{iz}x_{t} + b_{iz} + W_{hz}x_{t-1} + b_{hz})

-        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}x_{t} + b_{hc}))
+        \widetilde{h}_{t} & = \tanh(W_{ic}x_{t} + b_{ic} + r_{t} * (W_{hc}x_{t-1} + b_{hc}))

         h_{t} & = z_{t} * h_{t-1} + (1 - z_{t}) * \widetilde{h}_{t}

From 7fd20772119b53b3f43ba28a85872830b5d34207 Mon Sep 17 00:00:00 2001
From: Tao Luo
Date: Mon, 9 Nov 2020 18:43:27 +0800
Subject: [PATCH 145/185] set NCCL_SHM_DISABLE=1 for test_parallel_executor_profiler.py (#28484)

---
 .../tests/unittests/test_parallel_executor_profiler.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py
index 62ecb2207cded..0fac0610fd22d 100644
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_profiler.py
@@ -19,6 +19,14 @@
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from paddle.fluid.tests.unittests.test_profiler import TestProfiler
+import os
+
+# NCCL 2.7 decides to use shared memory while NCCL 2.6 didn't, hence causing the error.
+# include/shm.h:28 NCCL WARN Call to posix_fallocate failed: No space left on device
+#
+# Set environment variables NCCL_SHM_DISABLE=1 to disables the Shared Memory (SHM) transports
+# and force to use P2P which is the default transports way of NCCL2.6.
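+# (A hedged aside, not part of this patch: the same switch can be applied from
+# the shell before launching the test, e.g. export NCCL_SHM_DISABLE=1, since
+# NCCL reads the variable from the environment when it initializes; the
+# in-process assignment below works because it runs before any NCCL init.)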
+os.environ['NCCL_SHM_DISABLE'] = str(1) class TestPEProfiler(TestProfiler): From faa654987748226766ac5c47769f92eb3e64f584 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Mon, 9 Nov 2020 20:01:38 +0800 Subject: [PATCH 146/185] setting timeout properity for test_flags_mkldnn_ops_on_off (#28489) --- python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt index 51ec17f62d9ac..c7fe530a3dedf 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt @@ -5,3 +5,4 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_flags_mkldnn_ops_on_off PROPERTIES TIMEOUT 120) From 1e698c600eb74ea11c89c9c008510c4d19924b95 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Mon, 9 Nov 2020 20:03:42 +0800 Subject: [PATCH 147/185] fix cmake error when setting ut timeout properity (#28492) --- .../framework/ir/fusion_group/CMakeLists.txt | 4 +- .../fluid/tests/unittests/CMakeLists.txt | 63 +++++++++---------- 2 files changed, 32 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt index f32a5eafefcef..ab7ccc7f7e854 100644 --- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt +++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt @@ -9,6 +9,6 @@ cc_library(fusion_group_pass SRCS fusion_group_pass.cc elementwise_group_detector.cc DEPS subgraph_detector fuse_pass_base code_generator device_code) cc_test(test_fusion_group_pass SRCS fusion_group_pass_tester.cc DEPS fusion_group_pass graph_viz_pass) -if(NOT ON_INFER) -set_tests_properties(test_code_generator PROPERTIES TIMEOUT 120) +if(WITH_GPU AND NOT ON_INFER) + set_tests_properties(test_code_generator PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 66952537b7b01..4191c4c8f4389 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -606,38 +606,22 @@ endif() # setting timeout value for old unittests # set_tests_properties(test_dist_fleet_sparse_embedding_ctr PROPERTIES TIMEOUT 200) -if(NOT WIN32 AND NOT APPLE) - set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150) - set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) - set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) - set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) - set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) - set_tests_properties(test_parallel_dygraph_sync_batch_norm PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120) + +if (NOT WIN32) + set_tests_properties(test_multiprocess_reader_exception PROPERTIES TIMEOUT 120) + set_tests_properties(test_layers PROPERTIES TIMEOUT 120) + set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT 120) + set_tests_properties(test_fleet_utils PROPERTIES TIMEOUT 120) +endif() + +if (WITH_DISTRIBUTE) + set_tests_properties(test_communicator_half_async PROPERTIES 
TIMEOUT 120) +endif() +if (WITH_DISTRIBUTE AND NOT APPLE) set_tests_properties(test_fleet_launch PROPERTIES TIMEOUT 120) set_tests_properties(test_c_comm_init_op PROPERTIES TIMEOUT 120) - set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120) - set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_broadcast PROPERTIES TIMEOUT 120) - set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120) set_tests_properties(test_fleet_checkpoint PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_reduce_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_reduce PROPERTIES TIMEOUT 120) - set_tests_properties(test_allreduce PROPERTIES TIMEOUT 120) - set_tests_properties(test_allgather PROPERTIES TIMEOUT 120) set_tests_properties(test_launch PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_scatter_api PROPERTIES TIMEOUT 120) - set_tests_properties(test_collective_barrier_api PROPERTIES TIMEOUT 120) -endif() - -if (NOT WIN32) -set_tests_properties(test_multiprocess_reader_exception PROPERTIES TIMEOUT 120) -set_tests_properties(test_layers PROPERTIES TIMEOUT 120) -set_tests_properties(test_communicator_half_async PROPERTIES TIMEOUT 120) -set_tests_properties(test_ir_memory_optimize_transformer PROPERTIES TIMEOUT 120) -set_tests_properties(test_fleet_utils PROPERTIES TIMEOUT 120) - endif() # setting timeout value as 15S @@ -756,11 +740,11 @@ set_tests_properties(test_dygraph_multi_forward PROPERTIES TIMEOUT 120) set_tests_properties(test_norm_op PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_ocr_attention_model PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_mnist PROPERTIES TIMEOUT 120) -set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_gru_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 120) -set_tests_properties(test_regularizer PROPERTIES TIMEOUT 120) +set_tests_properties(test_fused_elemwise_activation_op PROPERTIES TIMEOUT 150) +set_tests_properties(test_gru_op PROPERTIES TIMEOUT 200) +set_tests_properties(test_layer_norm_op PROPERTIES TIMEOUT 150) +set_tests_properties(test_pool3d_op PROPERTIES TIMEOUT 150) +set_tests_properties(test_regularizer PROPERTIES TIMEOUT 150) set_tests_properties(test_imperative_resnet PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_resnet_sorted_gradient PROPERTIES TIMEOUT 200) set_tests_properties(test_imperative_se_resnext PROPERTIES TIMEOUT 200) @@ -772,8 +756,21 @@ if(WITH_COVERAGE) endif() if(WITH_GPU AND NOT WIN32) set_tests_properties(test_collective_allgather_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_broadcast_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_allreduce_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_pipeline PROPERTIES TIMEOUT 120) + set_tests_properties(test_reducescatter_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_broadcast PROPERTIES TIMEOUT 120) + set_tests_properties(test_reducescatter PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_reduce_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_reduce PROPERTIES TIMEOUT 120) + set_tests_properties(test_allreduce PROPERTIES TIMEOUT 120) + set_tests_properties(test_allgather PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_scatter_api PROPERTIES TIMEOUT 120) + set_tests_properties(test_collective_barrier_api 
PROPERTIES TIMEOUT 120) endif() if(WITH_GPU) set_tests_properties(test_imperative_auto_mixed_precision PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_dygraph_sync_batch_norm PROPERTIES TIMEOUT 120) endif() set_tests_properties(test_inplace_addto_strategy PROPERTIES TIMEOUT 120) From d3b2d07d6eeaf89ec4661feb30a009f9b9b16cbd Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Mon, 9 Nov 2020 20:56:26 +0800 Subject: [PATCH 148/185] modified timeout value on windows (#28499) * modified timeout value on windows * fix some error --- cmake/generic.cmake | 8 ++++++-- python/paddle/fluid/tests/unittests/CMakeLists.txt | 7 +++++-- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 6f4ec24851d3f..5a059c183a209 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -385,7 +385,9 @@ function(cc_test_run TARGET_NAME) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_init_allocated_mem=true) set_property(TEST ${TARGET_NAME} PROPERTY ENVIRONMENT FLAGS_cudnn_deterministic=true) # No unit test should exceed 2 minutes. - if (APPLE OR WIN32) + if (WIN32) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) + elseif (APPLE) set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20) else() set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 15) @@ -757,7 +759,9 @@ function(py_test TARGET_NAME) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif() - if (APPLE OR WIN32) + if (WIN32) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) + elseif (APPLE) set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 20) else() # No unit test should exceed 2 minutes in Linux. diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 4191c4c8f4389..01c5cfa0aaee3 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -214,8 +214,11 @@ function(py_test_modules TARGET_NAME) if (py_test_modules_SERIAL) set_property(TEST ${TARGET_NAME} PROPERTY RUN_SERIAL 1) endif() - - set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 15) + if(WIN32) + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 150) + else() + set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 15) + endif() endif() endfunction() From e1a5fc449d6acccfd186309d54aa61fa216aa0da Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Mon, 9 Nov 2020 21:02:10 +0800 Subject: [PATCH 149/185] fix ut exec timeout notest,test=kunlun (#28495) * fix ut exec timeout notest,test=kunlun * fix error for executing ut timeout,test=document_fix --- python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt index 6ac4b93bf6d66..eda4c989c5fda 100644 --- a/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/xpu/CMakeLists.txt @@ -7,3 +7,6 @@ list(REMOVE_ITEM TEST_OPS test_mean_op_xpu) foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) + +set_tests_properties(test_mul_op_xpu PROPERTIES TIMEOUT 120) +set_tests_properties(test_conv2d_op_xpu PROPERTIES TIMEOUT 120) From eb0855615c02e3165dba9f55232df303dc375d6e Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Mon, 9 Nov 2020 21:05:38 +0800 Subject: [PATCH 150/185] Set Exclusive for test_yolov3 to Avoid Memory 
Segmentation Fault in CI Random Failure (#28485) As the title --- .../fluid/tests/unittests/dygraph_to_static/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index e264d6d2a6f51..56bcd6d7b5289 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -7,7 +7,7 @@ endforeach(TEST_OP) set_tests_properties(test_se_resnet PROPERTIES TIMEOUT 900) set_tests_properties(test_tsm PROPERTIES TIMEOUT 900) -set_tests_properties(test_yolov3 PROPERTIES TIMEOUT 900) +set_tests_properties(test_yolov3 PROPERTIES TIMEOUT 900 LABELS "RUN_TYPE=EXCLUSIVE") set_tests_properties(test_mobile_net PROPERTIES TIMEOUT 120) set_tests_properties(test_seq2seq PROPERTIES TIMEOUT 120) set_tests_properties(test_cycle_gan PROPERTIES TIMEOUT 120) From 645e999afcd70774579c04e6c32d015582f30084 Mon Sep 17 00:00:00 2001 From: Wilber Date: Mon, 9 Nov 2020 07:37:17 -0600 Subject: [PATCH 151/185] fix api_impl test. (#28483) --- paddle/fluid/inference/api/CMakeLists.txt | 4 ++-- paddle/fluid/inference/tests/test.cmake | 6 ++++++ 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt index 5a1b6678074a2..8bf4c5499db85 100755 --- a/paddle/fluid/inference/api/CMakeLists.txt +++ b/paddle/fluid/inference/api/CMakeLists.txt @@ -51,11 +51,11 @@ cc_test(test_paddle_inference_api SRCS api_tester.cc DEPS paddle_inference_api) if(WITH_TESTING) if (NOT APPLE AND NOT WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS paddle_fluid_shared - ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) + ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) elseif(WIN32) inference_base_test(test_api_impl SRCS api_impl_tester.cc DEPS ${inference_deps} - ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${PYTHON_TESTS_DIR}/book) + ARGS --word2vec_dirname=${WORD2VEC_MODEL_DIR} --book_dirname=${IMG_CLS_RESNET_INSTALL_DIR}) set_tests_properties(test_api_impl PROPERTIES DEPENDS test_image_classification) endif() diff --git a/paddle/fluid/inference/tests/test.cmake b/paddle/fluid/inference/tests/test.cmake index 9bde2a99db1b7..b35ea51833ff1 100644 --- a/paddle/fluid/inference/tests/test.cmake +++ b/paddle/fluid/inference/tests/test.cmake @@ -51,6 +51,12 @@ if(NOT EXISTS ${WORD2VEC_INSTALL_DIR}/word2vec.inference.model.tar.gz) endif() set(WORD2VEC_MODEL_DIR "${WORD2VEC_INSTALL_DIR}/word2vec.inference.model") +set(IMG_CLS_RESNET_INSTALL_DIR "${INFERENCE_DEMO_INSTALL_DIR}/image_classification_resnet") +if(NOT EXISTS ${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model.tgz) + inference_download_and_uncompress(${IMG_CLS_RESNET_INSTALL_DIR} ${INFERENCE_URL} "image_classification_resnet.inference.model.tgz") +endif() +set(IMG_CLS_RESNET_MODEL_DIR "${IMG_CLS_RESNET_INSTALL_DIR}/image_classification_resnet.inference.model") + function (inference_base_test_build TARGET) set(options "") set(oneValueArgs "") From 2159646796fb33e63b3e3667acfd844c2df9d5f8 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Mon, 9 Nov 2020 21:51:18 +0800 Subject: [PATCH 152/185] modified timeout value for 
test_conv3d_mkldnn_op,test=document_fix (#28496) --- python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt index c7fe530a3dedf..69991a446d7a1 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/mkldnn/CMakeLists.txt @@ -5,4 +5,5 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120) +set_tests_properties(test_conv3d_mkldnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_flags_mkldnn_ops_on_off PROPERTIES TIMEOUT 120) From 241505c2621fa29ec22a8cc31cc4fca82bf1a25a Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Tue, 10 Nov 2020 11:08:39 +0800 Subject: [PATCH 153/185] refine the doc for API2.0 (#28386) --- python/paddle/device.py | 5 ++--- python/paddle/tensor/creation.py | 5 ----- python/paddle/tensor/manipulation.py | 14 ++------------ python/paddle/tensor/math.py | 13 ------------- python/paddle/tensor/search.py | 4 ---- python/paddle/tensor/stat.py | 1 - 6 files changed, 4 insertions(+), 38 deletions(-) diff --git a/python/paddle/device.py b/python/paddle/device.py index 16bb1123e63c6..2beb92f2c3a75 100644 --- a/python/paddle/device.py +++ b/python/paddle/device.py @@ -82,7 +82,7 @@ def get_cudnn_version(): import paddle - cudnn_version = get_cudnn_version() + cudnn_version = paddle.get_cudnn_version() @@ -117,7 +117,7 @@ def set_device(device): .. code-block:: python import paddle - paddle.disable_static() + paddle.set_device("cpu") x1 = paddle.ones(name='x1', shape=[1, 2], dtype='int32') x2 = paddle.zeros(name='x2', shape=[1, 2], dtype='int32') @@ -179,7 +179,6 @@ def get_device(): .. code-block:: python import paddle - paddle.disable_static() device = paddle.get_device() """ diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 8aa94ae420342..7b62ae9102d22 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -224,7 +224,6 @@ def full_like(x, fill_value, dtype=None, name=None): import paddle import numpy as np - paddle.disable_static() # Now we are in imperative mode input = paddle.full(shape=[2, 3], fill_value=0.0, dtype='float32', name='input') output = paddle.full_like(input, 2.0) # [[2. 2. 2.] @@ -277,7 +276,6 @@ def ones(shape, dtype=None, name=None): .. code-block:: python import paddle - paddle.disable_static() # default dtype for ones OP data1 = paddle.ones(shape=[3, 2]) @@ -361,7 +359,6 @@ def zeros(shape, dtype=None, name=None): import paddle - paddle.disable_static() # Now we are in imperative mode data = paddle.zeros(shape=[3, 2], dtype='float32') # [[0. 0.] # [0. 0.] 
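(A hedged note on the documentation hunks in this commit: each deleted
paddle.disable_static() call reflects that Paddle 2.0 starts in imperative
(dygraph) mode by default, so the examples now run as-is. A minimal sketch of
the style the docs move to, assuming a 2.0 install:

    import paddle
    # no explicit disable_static() -- dygraph mode is already on in 2.0
    ones = paddle.ones(shape=[3, 2])                  # float32 by default
    full = paddle.full(shape=[2, 1], fill_value=0, dtype='int64')
    print(ones.numpy(), full.numpy())
)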
@@ -446,7 +443,6 @@ def eye(num_rows, num_columns=None, dtype=None, name=None): import paddle - paddle.disable_static() # Now we are in imperative mode data = paddle.eye(3, dtype='int32') # [[1 0 0] # [0 1 0] @@ -493,7 +489,6 @@ def full(shape, fill_value, dtype=None, name=None): import paddle - paddle.disable_static() # Now we are in imperative mode data1 = paddle.full(shape=[2,1], fill_value=0, dtype='int64') #[[0] # [0]] diff --git a/python/paddle/tensor/manipulation.py b/python/paddle/tensor/manipulation.py index 1d0785f97db0a..4a01f7e7fa311 100644 --- a/python/paddle/tensor/manipulation.py +++ b/python/paddle/tensor/manipulation.py @@ -67,8 +67,6 @@ def concat(x, axis=0, name=None): """ - :alias_main: paddle.concat - :alias: paddle.tensor.concat, paddle.tensor.manipulation.concat This OP concatenates the input along the axis. @@ -91,7 +89,6 @@ def concat(x, axis=0, name=None): import paddle - paddle.disable_static() # Now we are in imperative mode x1 = paddle.to_tensor([[1, 2, 3], [4, 5, 6]]) x2 = paddle.to_tensor([[11, 12, 13], @@ -465,7 +462,6 @@ def split(x, num_or_sections, axis=0, name=None): import numpy as np import paddle - paddle.disable_static() # x is a Tensor which shape is [3, 9, 5] x_np = np.random.random([3, 9, 5]).astype("int32") x = paddle.to_tensor(x_np) @@ -608,7 +604,6 @@ def unique(x, import paddle - paddle.disable_static() x = paddle.to_tensor([2, 3, 3, 1, 5, 3]) unique = paddle.unique(x) np_unique = unique.numpy() # [1 2 3 5] @@ -744,9 +739,6 @@ def unsqueeze(x, axis, name=None): def gather(x, index, axis=None, name=None): """ - - **Gather Layer** - Output is obtained by gathering entries of ``axis`` of ``x`` indexed by ``index`` and concatenate them together. @@ -765,7 +757,8 @@ def gather(x, index, axis=None, name=None): Then: out = [[3, 4], - [5, 6]] + [5, 6]] + Args: x (Tensor): The source input tensor with rank>=1. 
Supported data type is int32, int64, float32, float64 and uint8 (only for CPU), @@ -784,7 +777,6 @@ def gather(x, index, axis=None, name=None): import paddle - paddle.disable_static() input = paddle.to_tensor([[1,2],[3,4],[5,6]]) index = paddle.to_tensor([0,1]) output = paddle.gather(input, index, axis=0) @@ -1059,7 +1051,6 @@ def chunk(x, chunks, axis=0, name=None): import numpy as np import paddle - paddle.disable_static() # x is a Tensor which shape is [3, 9, 5] x_np = np.random.random([3, 9, 5]).astype("int32") x = paddle.to_tensor(x_np) @@ -1452,7 +1443,6 @@ def gather_nd(x, index, name=None): import paddle - paddle.disable_static() x = paddle.to_tensor([[[1, 2], [3, 4], [5, 6]], [[7, 8], [9, 10], [11, 12]]]) index = paddle.to_tensor([[0, 1]]) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 30ce55f009e93..d2e9340e8a86f 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -1780,44 +1780,31 @@ def prod(x, axis=None, keepdim=False, dtype=None, name=None): import paddle - paddle.disable_static() - # the axis is a int element x = paddle.to_tensor([[0.2, 0.3, 0.5, 0.9], [0.1, 0.2, 0.6, 0.7]]) out1 = paddle.prod(x) - print(out1.numpy()) # [0.0002268] out2 = paddle.prod(x, -1) - print(out2.numpy()) # [0.027 0.0084] out3 = paddle.prod(x, 0) - print(out3.numpy()) # [0.02 0.06 0.3 0.63] - print(out3.numpy().dtype) - # float32 out4 = paddle.prod(x, 0, keepdim=True) - print(out4.numpy()) # [[0.02 0.06 0.3 0.63]] out5 = paddle.prod(x, 0, dtype='int64') - print(out5.numpy()) # [0 0 0 0] - print(out5.numpy().dtype) - # int64 # the axis is list y = paddle.to_tensor([[[1.0, 2.0], [3.0, 4.0]], [[5.0, 6.0], [7.0, 8.0]]]) out6 = paddle.prod(y, [0, 1]) - print(out6.numpy()) # [105. 384.] out7 = paddle.prod(y, (1, 2)) - print(out7.numpy()) # [ 24. 1680.] """ diff --git a/python/paddle/tensor/search.py b/python/paddle/tensor/search.py index 3b7906730247c..3da4228fc8b20 100644 --- a/python/paddle/tensor/search.py +++ b/python/paddle/tensor/search.py @@ -280,8 +280,6 @@ def argmin(x, axis=None, keepdim=False, dtype="int64", name=None): def index_select(x, index, axis=0, name=None): """ - :alias_main: paddle.index_select - :alias: paddle.tensor.index_select, paddle.tensor.search.index_select Returns a new tensor which indexes the ``input`` tensor along dimension ``axis`` using the entries in ``index`` which is a Tensor. 
The returned tensor has the same number @@ -304,7 +302,6 @@ def index_select(x, index, axis=0, name=None): import paddle - paddle.disable_static() # Now we are in imperative mode x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], [9.0, 10.0, 11.0, 12.0]]) @@ -680,7 +677,6 @@ def masked_select(x, mask, name=None): import paddle - paddle.disable_static() x = paddle.to_tensor([[1.0, 2.0, 3.0, 4.0], [5.0, 6.0, 7.0, 8.0], diff --git a/python/paddle/tensor/stat.py b/python/paddle/tensor/stat.py index 5647896066d38..3873d893bd7c3 100644 --- a/python/paddle/tensor/stat.py +++ b/python/paddle/tensor/stat.py @@ -242,7 +242,6 @@ def numel(x, name=None): import paddle - paddle.disable_static() x = paddle.full(shape=[4, 5, 7], fill_value=0, dtype='int32') numel = paddle.numel(x) # 140 From 369605be1d5933d33d5584d4463ca077a8335c69 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Tue, 10 Nov 2020 13:15:24 +0800 Subject: [PATCH 154/185] fix cmake error when execute build_inference_lib (#28503) --- paddle/fluid/framework/ir/fusion_group/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt index ab7ccc7f7e854..d6be8fb071738 100644 --- a/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt +++ b/paddle/fluid/framework/ir/fusion_group/CMakeLists.txt @@ -9,6 +9,6 @@ cc_library(fusion_group_pass SRCS fusion_group_pass.cc elementwise_group_detector.cc DEPS subgraph_detector fuse_pass_base code_generator device_code) cc_test(test_fusion_group_pass SRCS fusion_group_pass_tester.cc DEPS fusion_group_pass graph_viz_pass) -if(WITH_GPU AND NOT ON_INFER) +if(WITH_TESTING) set_tests_properties(test_code_generator PROPERTIES TIMEOUT 120) endif() From 53e9aa948d99000a9d224051901ca2caf97e3fbb Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Tue, 10 Nov 2020 14:39:50 +0800 Subject: [PATCH 155/185] remove diff with develop (#28504) --- paddle/scripts/paddle_build.bat | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 4684472f3542e..20e19f10cd203 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -82,11 +82,11 @@ if %day_now% NEQ %day_before% ( goto :mkbuild ) -git diff HEAD origin/develop --stat --name-only -git diff HEAD origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" -if %ERRORLEVEL% EQU 0 ( - rmdir build /s/q -) +:: git diff HEAD origin/develop --stat --name-only +:: git diff HEAD origin/develop --stat --name-only | findstr "cmake CMakeLists.txt paddle_build.bat" +:: if %ERRORLEVEL% EQU 0 ( +:: rmdir build /s/q +:: ) :mkbuild if not exist build ( From 8f664a5c49535e2176b63d0d21c50702c92bf0ca Mon Sep 17 00:00:00 2001 From: yukavio <67678385+yukavio@users.noreply.github.com> Date: Tue, 10 Nov 2020 15:04:56 +0800 Subject: [PATCH 156/185] fix one_hot example code (#28432) --- python/paddle/nn/functional/input.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/python/paddle/nn/functional/input.py b/python/paddle/nn/functional/input.py index 2e4bbd99a726d..f6f367bf23d08 100644 --- a/python/paddle/nn/functional/input.py +++ b/python/paddle/nn/functional/input.py @@ -74,15 +74,14 @@ def one_hot(x, num_classes, name=None): import paddle # Correspond to the first example above, where label.shape is 4 and 
one_hot_label.shape is [4, 4]. - label = paddle.static.data(name="label", shape=[4, 1], dtype="int64") + label = paddle.to_tensor([1, 1, 3, 0], dtype='int64') # label.shape = [4] - # label.data = [1, 1, 3, 0] - one_hot_label = paddle.nn.functional.one_hot(x=label, num_classes=4) + one_hot_label = paddle.nn.functional.one_hot(label, num_classes=4) # one_hot_label.shape = [4, 4] - # one_hot_label.data = [[0., 1., 0., 0.], - # [0., 1., 0., 0.], - # [0., 0., 0., 1.], - # [1., 0., 0., 0.]] + # one_hot_label = [[0., 1., 0., 0.], + # [0., 1., 0., 0.], + # [0., 0., 0., 1.], + # [1., 0., 0., 0.]] """ From 546b1c1dc4681f05f0b7442dd55433b05c3145ca Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Tue, 10 Nov 2020 15:54:21 +0800 Subject: [PATCH 157/185] Fix paddle.jit.dy2static.data_layer_not_check 2.0 API Doc (#28461) Remove "fluid", "variable" in 2.0 API doc --- .../dygraph_to_static/variable_trans_func.py | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py index b7ebd3800c4c3..617c05c33675d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/variable_trans_func.py @@ -29,19 +29,19 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0): """ - This function creates a variable on the global block. Unlike - `paddle.fluid.data` , the created variable doesn't check the dtype and the - shape of feed data because dygraph input data can be variable-length. - This API is used in translating dygraph into static graph. + This function creates a Tensor on the global block. The created Tensor + doesn't check the dtype and the shape of feed data because dygraph input + data can be various-length. This API is used in translating dygraph into + static graph. Note: - The default :code:`stop_gradient` attribute of the Variable created by + The default :code:`stop_gradient` attribute of the Tensor created by this API is true, which means the gradient won't be passed backward - through the data Varaible. Set :code:`var.stop_gradient = False` If + through the data Tensor. Set :code:`var.stop_gradient = False` If user would like to pass backward gradient. Args: - name (str): The name/alias of the variable, see :ref:`api_guide_Name` + name (str): The name/alias of the Tensor, see :ref:`api_guide_Name` for more details. shape (list|tuple): List|Tuple of integers declaring the shape. You can set "None" at a dimension to indicate the dimension can be of any @@ -54,7 +54,7 @@ def data_layer_not_check(name, shape, dtype='float32', lod_level=0): use LoD level, see :ref:`user_guide_lod_tensor` . Default: 0 Returns: - Variable: The global variable that gives access to the data. + Tensor: The global Tensor that gives access to the data. 
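+        (A hedged usage sketch, illustrative only and not part of the patched
+        docstring, with names as defined in this module:
+            x = data_layer_not_check(name='x', shape=[None, 16])
+            x.stop_gradient = False  # opt back in to gradients if needed
+        )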
""" helper = LayerHelper('data', **locals()) shape = list(shape) @@ -87,7 +87,8 @@ def create_static_variable_gast_node(name): def create_fill_constant_node(name, value): - func_code = "{} = paddle.fluid.layers.fill_constant(shape=[1], ".format(name) + func_code = "{} = paddle.fluid.layers.fill_constant(shape=[1], ".format( + name) if isinstance(value, bool): func_code += "dtype='bool', value={})".format(value) return gast.parse(func_code).body[0] @@ -110,7 +111,7 @@ def create_fill_constant_node(name, value): def to_static_variable(x): ''' - Translate a Python variable to PaddlePaddle static graph variable + Translate a Python Tensor to PaddlePaddle static graph Tensor ''' if isinstance(x, bool): return fill_constant(shape=[1], dtype='bool', value=x) From dc455617869bd3986aeeea426f5b8d85e9c398d8 Mon Sep 17 00:00:00 2001 From: tianshuo78520a <707759223@qq.com> Date: Tue, 10 Nov 2020 16:52:53 +0800 Subject: [PATCH 158/185] Update version docker (#28314) --- tools/dockerfile/ubuntu16_dev.sh | 81 +++++++++++++++++++++----------- 1 file changed, 54 insertions(+), 27 deletions(-) diff --git a/tools/dockerfile/ubuntu16_dev.sh b/tools/dockerfile/ubuntu16_dev.sh index 212e9acfea541..de1616169b9ff 100755 --- a/tools/dockerfile/ubuntu16_dev.sh +++ b/tools/dockerfile/ubuntu16_dev.sh @@ -1,6 +1,21 @@ #!/bin/bash +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ docker_name=$1 + function ref_whl(){ if [[ ${WITH_GPU} == "ON" ]]; then @@ -20,48 +35,59 @@ function ref_whl(){ if [[ ${gcc_version} == "8.2.0" ]];then ref_gcc=_gcc8.2 fi + + if [[ ${ref_CUDA_MAJOR} == "10" ]];then + ref_version=.post100 + elif [[ ${ref_CUDA_MAJOR} == "10.1" ]];then + ref_version=.post101 + elif [[ ${ref_CUDA_MAJOR} == "10.2" ]];then + ref_version="" + elif [[ ${ref_CUDA_MAJOR} == "9" ]];then + ref_version=.post90 + fi ref_web="https://paddle-wheel.bj.bcebos.com/${PADDLE_BRANCH}-${ref_gpu}-${ref_mkl}${ref_gcc}" - if [[ ${PADDLE_BRANCH} == "0.0.0" && ${WITH_GPU} == "ON" ]]; then - ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl + if [[ ${PADDLE_VERSION} == "0.0.0" && ${WITH_GPU} == "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp38-cp38-linux_x86_64.whl else - ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl + ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp38-cp38-linux_x86_64.whl fi - if [[ ${PADDLE_BRANCH} != "0.0.0" && ${WITH_GPU} == "ON" ]]; then - ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp36-cp36m-linux_x86_64.whl - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}.post${ref_CUDA_MAJOR}${CUDNN_MAJOR}-cp38-cp38-linux_x86_64.whl + if [[ ${PADDLE_VERSION} != "0.0.0" && ${WITH_GPU} == "ON" ]]; then + ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp35-cp35m-linux_x86_64.whl + 
ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}${ref_version}-cp38-cp38-linux_x86_64.whl else - ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp27-cp27mu-linux_x86_64.whl - ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp35-cp35m-linux_x86_64.whl - ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp36-cp36m-linux_x86_64.whl - ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp37-cp37m-linux_x86_64.whl - ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_BRANCH}-cp38-cp38-linux_x86_64.whl + ref_paddle_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp27-cp27mu-linux_x86_64.whl + ref_paddle3_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp35-cp35m-linux_x86_64.whl + ref_paddle36_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp36-cp36m-linux_x86_64.whl + ref_paddle37_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp37-cp37m-linux_x86_64.whl + ref_paddle38_whl=paddlepaddle${install_gpu}-${PADDLE_VERSION}-cp38-cp38-linux_x86_64.whl fi } function install_whl(){ dockerfile_line=`wc -l Dockerfile.tmp|awk '{print $1}'` - sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle_whl} && pip install ${ref_paddle_whl} && rm -f ${ref_paddle_whl}" Dockerfile.tmp - sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle3_whl} && pip3.5 install ${ref_paddle3_whl} && rm -f ${ref_paddle3_whl}" Dockerfile.tmp - sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle36_whl} && pip3.6 install ${ref_paddle36_whl} && rm -f ${ref_paddle36_whl}" Dockerfile.tmp - sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle37_whl} && pip3.7 install ${ref_paddle37_whl} && rm -f ${ref_paddle37_whl}" Dockerfile.tmp - sed -i "${dockerfile_line}i RUN wget ${ref_web}/${ref_paddle38_whl} && pip3.8 install ${ref_paddle38_whl} && rm -f ${ref_paddle38_whl}" Dockerfile.tmp + sed -i "${dockerfile_line}i RUN wget -q ${ref_web}/${ref_paddle_whl} && pip install ${ref_paddle_whl} && rm -f ${ref_paddle_whl}" Dockerfile.tmp + sed -i "${dockerfile_line}i RUN wget -q ${ref_web}/${ref_paddle3_whl} && pip3.5 install ${ref_paddle3_whl} && rm -f ${ref_paddle3_whl}" Dockerfile.tmp + sed -i "${dockerfile_line}i RUN wget -q ${ref_web}/${ref_paddle36_whl} && pip3.6 install ${ref_paddle36_whl} && rm -f ${ref_paddle36_whl}" Dockerfile.tmp + sed -i "${dockerfile_line}i RUN wget -q ${ref_web}/${ref_paddle37_whl} && pip3.7 install ${ref_paddle37_whl} && rm -f ${ref_paddle37_whl}" Dockerfile.tmp + sed -i "${dockerfile_line}i RUN wget -q ${ref_web}/${ref_paddle38_whl} && pip3.8 install ${ref_paddle38_whl} && rm -f ${ref_paddle38_whl}" Dockerfile.tmp } + function install_gcc(){ if [ "${gcc_version}" == "8.2.0" ];then sed -i 's##WORKDIR /usr/bin \ @@ -86,6 +112,7 @@ function make_dockerfile(){ sed "s//${docker_name}/g" tools/dockerfile/Dockerfile.ubuntu >Dockerfile.tmp } + function main(){ make_dockerfile install_gcc From 47cbf61dd4c135127ec767dc3a8fe353f935a024 Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Tue, 10 Nov 2020 17:10:42 +0800 Subject: [PATCH 159/185] fix softmax unittest float16 random error (#28480) --- paddle/fluid/operators/softmax_op.cc | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/paddle/fluid/operators/softmax_op.cc b/paddle/fluid/operators/softmax_op.cc index 63a27a8ccbfca..ff25f1911072c 100644 --- 
a/paddle/fluid/operators/softmax_op.cc +++ b/paddle/fluid/operators/softmax_op.cc @@ -234,10 +234,6 @@ class SoftmaxOpGradMaker : public framework::SingleGradOpMaker { DECLARE_INPLACE_OP_INFERER(SoftmaxInplaceInferer, {"X", "Out"}); -// NOTE(zjl): AVX implementation of SoftmaxGrad does not support in-place -DECLARE_CUDA_ONLY_INPLACE_OP_INFERER(SoftmaxGradInplaceInferer, - {"Out", framework::GradVarName("X")}); - } // namespace operators } // namespace paddle @@ -248,8 +244,7 @@ REGISTER_OPERATOR(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker, ops::SoftmaxOpGradMaker, ops::SoftmaxOpGradMaker, ops::SoftmaxInplaceInferer); -REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad, - ops::SoftmaxGradInplaceInferer); +REGISTER_OPERATOR(softmax_grad, ops::SoftmaxOpGrad); REGISTER_OP_CPU_KERNEL( softmax, ops::SoftmaxKernel, ops::SoftmaxKernel); From c70c1c520d099e13bda1d487b069944c5d5358ee Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Tue, 10 Nov 2020 18:08:47 +0800 Subject: [PATCH 160/185] make Numpy version is below 1.19.3 (#28510) --- python/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/requirements.txt b/python/requirements.txt index 138220b405748..12f36b3708573 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -1,6 +1,7 @@ requests>=2.20.0 numpy>=1.13, <=1.16.4 ; python_version<"3.5" -numpy>=1.13 ; python_version>="3.5" +numpy>=1.13 ; python_version>="3.5" and platform_system != "Windows" +numpy>=1.13, <=1.19.3 ; python_version>="3.5" and platform_system == "Windows" protobuf>=3.1.0 gast==0.3.3 scipy>=0.19.0, <=1.2.1 ; python_version<"3.5" From 75196cda403fee4a95fda3b5e52c2ba17ea870b8 Mon Sep 17 00:00:00 2001 From: Pei Yang Date: Tue, 10 Nov 2020 18:58:28 +0800 Subject: [PATCH 161/185] Paddle-TRT int8 support mul op channelwise quant (#28422) * paddle-trt support mul channelwise quant * add support for depthwise_conv2d * add errmsg for unsupported op type --- .../ir/quant_conv2d_dequant_fuse_pass.cc | 83 ++++++++++++++----- 1 file changed, 62 insertions(+), 21 deletions(-) diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 611b1bb5eb8b0..96f88e70a98d4 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -195,32 +195,73 @@ void FuseDequant(ir::Graph* graph, Scope* scope, auto* weight_tensor = scope->Var(quantized_op_weight_node->Name())->GetMutable(); auto w_dims = weight_tensor->dims(); + float* quantized_weight_data = + weight_tensor->mutable_data(platform::CPUPlace()); // If quantized op is fc, weight scale size = 1; // If quantized op is conv2d, weight scale size = weight dims[0] // If quantized op is conv2d_transpose, weight scale size = weight dims[1] - bool valid_scale_size = - (weight_scale.size() == 1 || - weight_scale.size() == static_cast(w_dims[0]) || - weight_scale.size() == static_cast(w_dims[1])); - PADDLE_ENFORCE_EQ( - valid_scale_size, true, - platform::errors::InvalidArgument( - "TRT int8 quant: invalid scale size(%d).", weight_scale.size())); - float* quantized_weight_data = - weight_tensor->mutable_data(platform::CPUPlace()); - for (int j = 0; j < weight_tensor->numel(); j++) { - if (weight_scale.size() == 1) { - quantized_weight_data[j] *= weight_scale[0]; - } else { - if (quantized_op_type == "conv2d_transpose") { - int inner_size = w_dims[2] * w_dims[3]; - quantized_weight_data[j] *= - 
weight_scale[(j / inner_size) % w_dims[1]]; - } else { - int inner_size = w_dims[1] * w_dims[2] * w_dims[3]; - quantized_weight_data[j] *= weight_scale[j / inner_size]; + if (quantized_op_type == "mul" || quantized_op_type == "fc") { + if (dequant_type == "fake_dequantize_max_abs") { + PADDLE_ENFORCE_EQ( + weight_scale.size(), 1, + platform::errors::InvalidArgument( + "mul op weight dequantized by [fake_dequantize_max_abs] " + "requires weight scale size = 1, but got %d.", + weight_scale.size())); + for (int j = 0; j < weight_tensor->numel(); j++) { + quantized_weight_data[j] *= weight_scale[0]; } } + if (dequant_type == "fake_channel_wise_dequantize_max_abs") { + PADDLE_ENFORCE_EQ( + weight_scale.size(), static_cast(w_dims[1]), + platform::errors::InvalidArgument( + "mul op weight dequantized by " + "[fake_channel_wise_dequantize_max_abs] requires weight scale " + "size = 2nd dim of mul's weight, which is %d, but got %d.", + static_cast(w_dims[1]), weight_scale.size())); + for (int j = 0; j < weight_tensor->numel(); j++) { + quantized_weight_data[j] *= weight_scale[j % w_dims[1]]; + } + } + } else if (quantized_op_type == "conv2d" || + quantized_op_type == "depthwise_conv2d") { + PADDLE_ENFORCE_EQ( + dequant_type, "fake_channel_wise_dequantize_max_abs", + platform::errors::InvalidArgument("conv2d op must be dequantized by " + "[fake_channel_wise_dequantize_max_" + "abs], but got %s", + dequant_type)); + PADDLE_ENFORCE_EQ( + weight_scale.size(), static_cast(w_dims[0]), + platform::errors::InvalidArgument( + "conv2d op requires weight scale size = channel size of the " + "weight, which is %d, but got %d.", + static_cast(w_dims[0]), weight_scale.size())); + for (int j = 0; j < weight_tensor->numel(); j++) { + int inner_size = w_dims[1] * w_dims[2] * w_dims[3]; + quantized_weight_data[j] *= weight_scale[j / inner_size]; + } + } else if (quantized_op_type == "conv2d_transpose") { + PADDLE_ENFORCE_EQ( + dequant_type, "fake_channel_wise_dequantize_max_abs", + platform::errors::InvalidArgument( + "conv2d_transpose must be dequantized by " + "[fake_channel_wise_dequantize_max_abs], but got %s", + dequant_type)); + PADDLE_ENFORCE_EQ( + weight_scale.size(), static_cast(w_dims[1]), + platform::errors::InvalidArgument( + "conv2d_transpose op requires weight scale size = channel size " + "of the weight, which is %d, but got %d.", + static_cast(w_dims[1]), weight_scale.size())); + for (int j = 0; j < weight_tensor->numel(); j++) { + int inner_size = w_dims[2] * w_dims[3]; + quantized_weight_data[j] *= weight_scale[(j / inner_size) % w_dims[1]]; + } + } else { + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupported quantized op type: %s", quantized_op_type)); } // create new op_desc From 3b0d31ab89c0c3314ff5ea77ebaca3b1f7633b6d Mon Sep 17 00:00:00 2001 From: Huihuang Zheng Date: Wed, 11 Nov 2020 10:36:32 +0800 Subject: [PATCH 162/185] Modify ProgramTranslator and TracedLayer Doc for API 2.0 (#28509) Modify ProgramTranslator and TracedLayer Doc for API 2.0 --- .../dygraph_to_static/program_translator.py | 4 +- python/paddle/fluid/dygraph/jit.py | 84 +++++++++---------- 2 files changed, 41 insertions(+), 47 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py index 9c3f572eb9748..82c3e26028695 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py @@ -789,7 +789,7 @@ def func(x): x = 
paddle.ones([1, 2]) # ProgramTranslator is disabled so the func is run in dygraph - print(func(x).numpy()) # [[0. 0.]] + print(func(x)) # [[0. 0.]] """ check_type(enable_to_static, "enable_to_static", bool, @@ -828,7 +828,7 @@ def func(x): x = paddle.ones([1, 2]) x_v = prog_trans.get_output(func, x) - print(x_v.numpy()) # [[0. 0.]] + print(x_v) # [[0. 0.]] """ assert callable( diff --git a/python/paddle/fluid/dygraph/jit.py b/python/paddle/fluid/dygraph/jit.py index d4bfb8b112637..5d82ca17474dd 100644 --- a/python/paddle/fluid/dygraph/jit.py +++ b/python/paddle/fluid/dygraph/jit.py @@ -1051,7 +1051,7 @@ def trace(layer, inputs): model and convert it into a static graph model. Args: - layer (dygraph.Layer): the layer object to be traced. + layer (paddle.nn.Layer): the layer object to be traced. inputs (list(Tensor)|tuple(Tensor)|Tensor): the input tensors of the layer object. @@ -1063,32 +1063,30 @@ def trace(layer, inputs): Examples: .. code-block:: python: - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear, to_variable, TracedLayer - import numpy as np + import paddle - class ExampleLayer(fluid.dygraph.Layer): + class ExampleLayer(paddle.nn.Layer): def __init__(self): super(ExampleLayer, self).__init__() - self._fc = Linear(3, 10) + self._fc = paddle.nn.Linear(3, 10) def forward(self, input): return self._fc(input) - with fluid.dygraph.guard(): - layer = ExampleLayer() - in_np = np.random.random([2, 3]).astype('float32') - in_var = to_variable(in_np) - out_dygraph, static_layer = TracedLayer.trace(layer, inputs=[in_var]) + + layer = ExampleLayer() + in_var = paddle.uniform(shape=[2, 3], dtype='float32') + out_dygraph, static_layer = paddle.jit.TracedLayer.trace(layer, inputs=[in_var]) + + # run the static graph model using Executor inside + out_static_graph = static_layer([in_var]) - # run the static graph model using Executor inside - out_static_graph = static_layer([in_var]) + print(len(out_static_graph)) # 1 + print(out_static_graph[0].shape) # (2, 10) - print(len(out_static_graph)) # 1 - print(out_static_graph[0].shape) # (2, 10) + # save the static graph model for inference + static_layer.save_inference_model(dirname='./saved_infer_model') - # save the static graph model for inference - static_layer.save_inference_model(dirname='./saved_infer_model') """ assert isinstance( layer, Layer @@ -1114,33 +1112,30 @@ def set_strategy(self, build_strategy=None, exec_strategy=None): Examples: .. 
code-block:: python: - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear, to_variable, TracedLayer - import numpy as np + import paddle - class ExampleLayer(fluid.dygraph.Layer): + class ExampleLayer(paddle.nn.Layer): def __init__(self): super(ExampleLayer, self).__init__() - self._fc = Linear(3, 10) + self._fc = paddle.nn.Linear(3, 10) def forward(self, input): return self._fc(input) - with fluid.dygraph.guard(): - layer = ExampleLayer() - in_np = np.random.random([2, 3]).astype('float32') - in_var = to_variable(in_np) + layer = ExampleLayer() + in_var = paddle.uniform(shape=[2, 3], dtype='float32') + + out_dygraph, static_layer = paddle.jit.TracedLayer.trace(layer, inputs=[in_var]) - out_dygraph, static_layer = TracedLayer.trace(layer, inputs=[in_var]) + build_strategy = paddle.static.BuildStrategy() + build_strategy.enable_inplace = True - build_strategy = fluid.BuildStrategy() - build_strategy.enable_inplace = True + exec_strategy = paddle.static.ExecutionStrategy() + exec_strategy.num_threads = 2 - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.num_threads = 2 + static_layer.set_strategy(build_strategy=build_strategy, exec_strategy=exec_strategy) + out_static_graph = static_layer([in_var]) - static_layer.set_strategy(build_strategy=build_strategy, exec_strategy=exec_strategy) - out_static_graph = static_layer([in_var]) """ assert self._compiled_program is None, "Cannot set strategy after run" assert isinstance( @@ -1212,30 +1207,29 @@ def save_inference_model(self, dirname, feed=None, fetch=None): Examples: .. code-block:: python: - import paddle.fluid as fluid - from paddle.fluid.dygraph import Linear, to_variable, TracedLayer import numpy as np + import paddle - class ExampleLayer(fluid.dygraph.Layer): + class ExampleLayer(paddle.nn.Layer): def __init__(self): super(ExampleLayer, self).__init__() - self._fc = Linear(3, 10) + self._fc = paddle.nn.Linear(3, 10) def forward(self, input): return self._fc(input) save_dirname = './saved_infer_model' in_np = np.random.random([2, 3]).astype('float32') + in_var = paddle.to_tensor(in_np) + layer = ExampleLayer() - with fluid.dygraph.guard(): - layer = ExampleLayer() - in_var = to_variable(in_np) - out_dygraph, static_layer = TracedLayer.trace(layer, inputs=[in_var]) - static_layer.save_inference_model(save_dirname, feed=[0], fetch=[0]) + out_dygraph, static_layer = paddle.jit.TracedLayer.trace(layer, inputs=[in_var]) + static_layer.save_inference_model(save_dirname, feed=[0], fetch=[0]) - place = fluid.CPUPlace() - exe = fluid.Executor(place) - program, feed_vars, fetch_vars = fluid.io.load_inference_model(save_dirname, + paddle.enable_static() + place = paddle.CPUPlace() + exe = paddle.static.Executor(place) + program, feed_vars, fetch_vars = paddle.static.load_inference_model(save_dirname, exe) fetch, = exe.run(program, feed={feed_vars[0]: in_np}, fetch_list=fetch_vars) From 0ce933a9eeebbf079dd70e7df22861a32f32f16e Mon Sep 17 00:00:00 2001 From: liym27 <33742067+liym27@users.noreply.github.com> Date: Wed, 11 Nov 2020 10:44:09 +0800 Subject: [PATCH 163/185] [API2.0] Fix documents of 6 APIs to fit API2.0: (#28514) 1. Remove 'fluid'; 2. 
Variable -> Tensor APIs: sum, convert_call, convert_ifelse, convert_logical_and, convert_logical_or, convert_logical_not --- .../dygraph_to_static/convert_call_func.py | 40 ++++++++++--------- .../dygraph_to_static/convert_operators.py | 18 ++++----- python/paddle/tensor/math.py | 1 - 3 files changed, 30 insertions(+), 29 deletions(-) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index 9654a23852024..bd7f51d89b201 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -98,25 +98,27 @@ def convert_call(func): Examples: .. code-block:: python - import paddle.fluid as fluid - from paddle.fluid.dygraph.dygraph_to_static import convert_call - - def dyfunc(x): - if fluid.layers.mean(x) < 0: - x_v = x - 1 - else: - x_v = x + 1 - - return x_v - new_func = convert_call(dyfunc) - x = fluid.layers.fill_constant(shape=[3, 3], value=0, dtype='float64') - x_v = new_func(x) - exe = fluid.Executor(fluid.CPUPlace()) - out = exe.run(fetch_list=[x_v]) - print(out[0]) - # [[1. 1. 1.] - # [1. 1. 1.] - # [1. 1. 1.]] + import paddle + from paddle.jit.dy2static import convert_call + + paddle.enable_static() + def dyfunc(x): + if paddle.mean(x) < 0: + x_v = x - 1 + else: + x_v = x + 1 + return x_v + + new_func = convert_call(dyfunc) + x = paddle.tensor.manipulation.fill_constant(shape=[3, 3], value=0, dtype='float64') + x_v = new_func(x) + + exe = paddle.static.Executor(paddle.CPUPlace()) + out = exe.run(fetch_list=[x_v]) + print(out[0]) + # [[1. 1. 1.] + # [1. 1. 1.] + # [1. 1. 1.]] """ translator_logger.log(1, diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py index 02d8754e62c6d..f64d97569feeb 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_operators.py @@ -24,7 +24,7 @@ def convert_while_loop(cond, body, loop_vars): A function representation of a Python ``while`` statement. Args: - cond(Callable): A callable object that returns a boolean variable to control whether to execute the loop body. It takes ``loop_vars`` as arguments. + cond(Callable): A callable object that returns a boolean variable to control whether to execute the loop body. It takes ``loop_vars`` as arguments. body(Callable): A callable object that returns a tuple or list of variables with the same arguments ``loops_vars`` as ``cond`` . loop_vars(list|tuple): A list or tuple of variables passed to ``cond`` and ``body`` . @@ -44,7 +44,7 @@ def convert_while_loop(cond, body, loop_vars): def _run_paddle_while_loop(cond, body, loop_vars): - # NOTE: loop_vars of Paddle op `control_flow.while_loop` must be Paddle Variable. + # NOTE: loop_vars of Paddle op `control_flow.while_loop` must be Paddle Tensors. loop_vars = [to_static_variable(var) for var in loop_vars] loop_vars = control_flow.while_loop(cond, body, loop_vars) return loop_vars @@ -61,8 +61,8 @@ def convert_logical_and(x, y): A function representation of a Python ``and`` statement. Args: - x(bool|Variable): Left hand operand of ``and`` operator. - y(bool|Variable): Right hand operand of ``and`` operator. + x(bool|Tensor): Left hand operand of ``and`` operator. + y(bool|Tensor): Right hand operand of ``and`` operator. Returns: A python bool variable or a bool Tensor. 
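A minimal usage sketch of ``convert_while_loop`` as documented in the hunk above (illustrative only: the import path follows this file's location, and the tensor setup is an assumption, not part of the patch):

    # Sketch: drive convert_while_loop the way the docstring above describes.
    # Everything below is illustrative; none of it appears in this patch.
    import paddle
    from paddle.fluid.dygraph.dygraph_to_static.convert_operators import (
        convert_while_loop, )

    def cond(i, ten):
        # A callable that takes loop_vars and returns a boolean Tensor.
        return i < ten

    def body(i, ten):
        # A callable with the same arguments that returns updated loop_vars.
        return i + 1, ten

    i = paddle.full(shape=[1], fill_value=0, dtype='int64')
    ten = paddle.full(shape=[1], fill_value=10, dtype='int64')
    i, ten = convert_while_loop(cond, body, [i, ten])  # i is 10 afterwards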
@@ -94,8 +94,8 @@ def convert_logical_or(x, y):
     A function representation of a Python ``or`` statement.

     Args:
-        x(bool|Variable): Left hand operand of ``or`` operator.
-        y(bool|Variable): Right hand operand of ``or`` operator.
+        x(bool|Tensor): Left hand operand of ``or`` operator.
+        y(bool|Tensor): Right hand operand of ``or`` operator.

     Returns:
         A python bool variable or a bool Tensor.
@@ -127,7 +127,7 @@ def convert_logical_not(x):
     A function representation of a Python ``not`` statement.

     Args:
-        x(bool|Variable): Operand of of ``not`` operator.
+        x(bool|Tensor): Operand of ``not`` operator.

     Returns:
         A python bool variable or a bool Tensor.
@@ -153,7 +153,7 @@ def convert_ifelse(pred, true_fn, false_fn, true_args, false_args,
                    return_vars):
     A function representation of a Python ``if/else`` statement.

     Args:
-        pred(bool|Variable): A boolean variable which determines whether to return the result of ``true_fn`` or ``false_fn`` .
+        pred(bool|Tensor): A boolean Tensor which determines whether to return the result of ``true_fn`` or ``false_fn`` .
         true_fn(callable): A callable to be performed if ``pred`` is true.
         false_fn(callable): A callable to be performed if ``pred`` is false.
         true_args(tuple): Parameters of ``true_fn``.
@@ -175,7 +175,7 @@ def _run_paddle_cond(pred, true_fn, false_fn, true_args, false_args,
                      return_vars):
     return_var_ids = [id(var) for var in return_vars]
-    # NOTE 1: return vars of Paddle op `control_flow.cond` must be Paddle Tensors
+    # NOTE 1: Returned vars of Paddle op `control_flow.cond` must be Paddle Tensors
     # NOTE 2: Here uses id(var) not var, because `if var in return_var` use operator `==`,
     # which will call `fluid.layers.equal` and causes error when var in return_vars is not initialized.
     true_args = [
diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py
index d2e9340e8a86f..56933cf73ef98 100755
--- a/python/paddle/tensor/math.py
+++ b/python/paddle/tensor/math.py
@@ -627,7 +627,6 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None):
         ..
code-block:: python import paddle - paddle.disable_static() # x is a Tensor with following elements: # [[0.2, 0.3, 0.5, 0.9] From 98dc11bb6a825b9997123403f9aaf522e2c7e438 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Wed, 11 Nov 2020 10:54:46 +0800 Subject: [PATCH 164/185] add monitoring for executive ut at night (#28377) * add monitoring for executive ut at night * fix some error for paddle_build.bat * fix some error * fix some error in windows * fix some error on windows --- paddle/scripts/paddle_build.bat | 11 ++++++++++- paddle/scripts/paddle_build.sh | 14 +++++++++++--- tools/check_file_diff_approvals.sh | 4 ++-- 3 files changed, 23 insertions(+), 6 deletions(-) diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index 20e19f10cd203..d557cad1c4c6f 100644 --- a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -339,7 +339,16 @@ if %errorlevel%==0 ( set PATH=%THIRD_PARTY_PATH:/=\%\install\openblas\lib;%THIRD_PARTY_PATH:/=\%\install\openblas\bin;^ %THIRD_PARTY_PATH:/=\%\install\zlib\bin;%THIRD_PARTY_PATH:/=\%\install\mklml\lib;^ %THIRD_PARTY_PATH:/=\%\install\mkldnn\bin;%THIRD_PARTY_PATH:/=\%\install\warpctc\bin;%PATH% -ctest.exe -E "(%disable_ut_quickly%)" --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4 +if "%NIGHTLY_MODE%"=="ON" ( + set nightly_label="()" + ) else ( + set nightly_label="(RUN_TYPE=NIGHTLY^|RUN_TYPE=DIST:NIGHTLY^|RUN_TYPE=EXCLUSIVE:NIGHTLY)" + echo ======================================== + echo "Unittests with nightly labels are only run at night" + echo ======================================== +) + +ctest.exe -E "(%disable_ut_quickly%)" -LE %nightly_label% --output-on-failure -C Release -j 8 --repeat until-pass:4 after-timeout:4 goto:eof :unit_test_error diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 315e2ac7af003..14bd5a7ae8932 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -560,7 +560,15 @@ EOF set +ex ut_startTime_s=`date +%s` get_quickly_disable_ut||disable_ut_quickly='' # indicate whether the case was in quickly disable list - ctest -E "($disable_ut_quickly)" --output-on-failure -j $2 | tee $tmpfile + if [ ${NIGHTLY_MODE:-OFF} == "ON" ]; then + nightly_label="" + else + nightly_label="RUN_TYPE=NIGHTLY|RUN_TYPE=DIST:NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY" + echo "=========================================" + echo "Unittests with nightly labels are only run at night" + echo "=========================================" + fi + ctest -E "($disable_ut_quickly)" -LE "($nightly_label)" --output-on-failure -j $2 | tee $tmpfile failed_test_lists='' collect_failed_tests mactest_error=0 @@ -741,14 +749,14 @@ function check_approvals_of_unittest() { unittest_spec_diff=`python ${PADDLE_ROOT}/tools/diff_unittest.py ${PADDLE_ROOT}/paddle/fluid/UNITTEST_DEV.spec ${PADDLE_ROOT}/paddle/fluid/UNITTEST_PR.spec` if [ "$unittest_spec_diff" != "" ]; then approval_line=`curl -H "Authorization: token ${GITHUB_API_TOKEN}" https://api.github.com/repos/PaddlePaddle/Paddle/pulls/${GIT_PR_ID}/reviews?per_page=10000` - APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 22165420 52485244` + APPROVALS=`echo ${approval_line}|python ${PADDLE_ROOT}/tools/check_pr_approval.py 1 22165420 52485244 32428676 45041955` set +x echo "current pr ${GIT_PR_ID} got approvals: ${APPROVALS}" if [ "${APPROVALS}" == "FALSE" ]; then echo "************************************" echo -e "It is forbidden to disable or delete 
the unit-test.\n"
                 echo -e "If you must delete it temporarily, please add it to[https://github.com/PaddlePaddle/Paddle/wiki/Temporarily-disabled-Unit-Test]."
-                echo -e "Then you must have one RD (kolinwei(recommended) or zhouwei25) approval for the deletion of unit-test. \n"
+                echo -e "Then you must have one RD (kolinwei(recommended), chalsliu, XieYunshen or zhouwei25) approval for the deletion of unit-test. \n"
                 echo -e "If you have any problems about deleting unit-test, please read the specification [https://github.com/PaddlePaddle/Paddle/wiki/Deleting-unit-test-is-forbidden]. \n"
                 echo -e "Following unit-tests are deleted in this PR: \n ${unittest_spec_diff} \n"
                 echo "************************************"
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 66b0bf67d7097..f07d6a6d8f126 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -290,13 +290,13 @@ RUNTYPE_FILE_CHANGED=`git diff --name-only --diff-filter=AM upstream/$BRANCH|gre
 if [ "${RUNTYPE_FILE_CHANGED}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     for CMAKELISTS_FILE in ${RUNTYPE_FILE_CHANGED}; do
-        RUNTYPE_ADD=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CMAKELISTS_FILE} |grep "^+" |grep -E "RUN_TYPE=EXCLUSIVE|RUN_TYPE=DIST|PROPERTIES[[:space:]]+TIMEOUT" || true`
+        RUNTYPE_ADD=`git diff -U0 upstream/$BRANCH ${PADDLE_ROOT}/${CMAKELISTS_FILE} |grep "^+" |grep -E "RUN_TYPE=EXCLUSIVE|RUN_TYPE=DIST|RUN_TYPE=NIGHTLY|RUN_TYPE=EXCLUSIVE:NIGHTLY|RUN_TYPE=DIST:NIGHTLY|PROPERTIES[[:space:]]+TIMEOUT" || true`
         if [[ ${RUNTYPE_ADD} != "" ]];then
             RUNTYPE_ADD_LINES="${RUNTYPE_ADD_LINES}\n${CMAKELISTS_FILE}\n${RUNTYPE_ADD}\n"
         fi
     done
     if [[ ${RUNTYPE_ADD_LINES} != "" ]];then
-        echo_line="You must have one QA (XieYunshen(Recommend) or chalsliu) approval for setting parameter RUN_TYPE to EXCLUSIVE or DIST, or setting TIMEOUT properties.\nThe corresponding lines are as follows:\n${RUNTYPE_ADD_LINES}\nFor more information, please refer to:https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-Unit-test-specification"
+        echo_line="You must have one QA (XieYunshen(Recommend) or chalsliu) approval for setting parameter RUN_TYPE as EXCLUSIVE, DIST, NIGHTLY, EXCLUSIVE:NIGHTLY or DIST:NIGHTLY, or setting TIMEOUT properties.\nThe corresponding lines are as follows:\n${RUNTYPE_ADD_LINES}\nFor more information, please refer to:https://github.com/PaddlePaddle/Paddle/wiki/PaddlePaddle-Unit-test-specification"
         check_approval 1 32428676 45041955
     fi
 fi
From d7cfee9b315bd5a54a07b388b2b3c2dedd55a63a Mon Sep 17 00:00:00 2001
From: wangchaochaohu
Date: Wed, 11 Nov 2020 11:29:15 +0800
Subject: [PATCH 165/185] Checkout point add (#28488)

* upgrade pass capability
---
 paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc |  2 +-
 paddle/fluid/operators/fill_constant_op.cc             | 10 ++++++++++
 paddle/fluid/operators/gather_op.cc                    |  6 +++---
 3 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
index d74843611cdd2..542aadbe53d5e 100644
--- a/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/squared_mat_sub_fuse_pass.cc
@@ -394,5 +394,5 @@ REGISTER_PASS_CAPABILITY(squared_mat_sub_fuse_pass)
         .EQ("square", 0)
         .EQ("elementwise_mul", 0)
         .EQ("elementwise_sub", 0)
-        .EQ("fill_constant", 0)
+        .EQ("fill_constant", 1)
         .EQ("fusion_squared_mat_sub", 0));
diff --git a/paddle/fluid/operators/fill_constant_op.cc b/paddle/fluid/operators/fill_constant_op.cc
index 35d54577bfef8..cc85c295965ba 100644 --- a/paddle/fluid/operators/fill_constant_op.cc +++ b/paddle/fluid/operators/fill_constant_op.cc @@ -14,6 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/fill_constant_op.h" #include +#include "paddle/fluid/framework/op_version_registry.h" namespace paddle { namespace operators { @@ -143,3 +144,12 @@ REGISTER_OP_CPU_KERNEL(fill_constant, ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel, ops::FillConstantKernel); + +REGISTER_OP_VERSION(fill_constant) + .AddCheckpoint( + R"ROC( + Upgrade fill_constant, add a new input [ValueTensor]. + )ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "ValueTensor", + "In order to support new feature tensor support of Value")); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 72b44b22f9c06..34fd11e8c0d0d 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -171,6 +171,6 @@ REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel); REGISTER_OP_VERSION(gather) - .AddCheckpoint(R"ROC(upgrad gather, add attribut [axis])ROC", - paddle::framework::compatible::OpVersionDesc().NewAttr( - "axis", "Specify the axis of gather operation.", {})); + .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", + paddle::framework::compatible::OpVersionDesc().NewInput( + "Axis", "Specify the axis of gather operation.")); From 5305b2749a4c5f913b0fa8b5ffe6ba616b621bab Mon Sep 17 00:00:00 2001 From: Kaipeng Deng Date: Wed, 11 Nov 2020 15:04:26 +0800 Subject: [PATCH 166/185] deprecated APIs under paddle.dataset. test=develop (#28423) --- python/paddle/__init__.py | 2 -- python/paddle/dataset/__init__.py | 17 ++----------- python/paddle/dataset/cifar.py | 21 ++++++++++++++++ python/paddle/dataset/conll05.py | 17 +++++++++++++ python/paddle/dataset/flowers.py | 13 ++++++++++ python/paddle/dataset/imdb.py | 21 ++++++++++++++++ python/paddle/dataset/imikolov.py | 13 ++++++++++ python/paddle/dataset/mnist.py | 13 ++++++++++ python/paddle/dataset/movielens.py | 37 ++++++++++++++++++++++++++++ python/paddle/dataset/uci_housing.py | 17 +++++++++++++ python/paddle/dataset/voc2012.py | 13 ++++++++++ python/paddle/dataset/wmt14.py | 21 ++++++++++++++++ python/paddle/dataset/wmt16.py | 21 ++++++++++++++++ 13 files changed, 209 insertions(+), 17 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 50c1142c7bfb6..400dbc85d68c3 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -27,8 +27,6 @@ import paddle from the source directory; please install paddlepaddle*.whl firstly.''' ) -import paddle.reader -import paddle.dataset import paddle.batch batch = batch.batch from .fluid import monkey_patch_variable diff --git a/python/paddle/dataset/__init__.py b/python/paddle/dataset/__init__.py index d1e5975856515..c2739d3805072 100644 --- a/python/paddle/dataset/__init__.py +++ b/python/paddle/dataset/__init__.py @@ -29,18 +29,5 @@ import paddle.dataset.voc2012 import paddle.dataset.image -__all__ = [ - 'mnist', - 'imikolov', - 'imdb', - 'cifar', - 'movielens', - 'conll05', - 'uci_housing', - 'wmt14', - 'wmt16', - 'mq2007', - 'flowers', - 'voc2012', - 'image', -] +# set __all__ as empty for not showing APIs under paddle.dataset +__all__ = [] diff --git a/python/paddle/dataset/cifar.py b/python/paddle/dataset/cifar.py index 16f06f2400b58..2ee95c3723b3a 100644 --- 
a/python/paddle/dataset/cifar.py +++ b/python/paddle/dataset/cifar.py @@ -32,6 +32,7 @@ import itertools import numpy import paddle.dataset.common +import paddle.utils.deprecated as deprecated import tarfile import six from six.moves import cPickle as pickle @@ -75,6 +76,10 @@ def reader(): return reader +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.Cifar100", + reason="Please use new dataset API which supports paddle.io.DataLoader") def train100(): """ CIFAR-100 training set creator. @@ -90,6 +95,10 @@ def train100(): 'train') +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.Cifar100", + reason="Please use new dataset API which supports paddle.io.DataLoader") def test100(): """ CIFAR-100 test set creator. @@ -105,6 +114,10 @@ def test100(): 'test') +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.Cifar10", + reason="Please use new dataset API which supports paddle.io.DataLoader") def train10(cycle=False): """ CIFAR-10 training set creator. @@ -123,6 +136,10 @@ def train10(cycle=False): cycle=cycle) +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.Cifar10", + reason="Please use new dataset API which supports paddle.io.DataLoader") def test10(cycle=False): """ CIFAR-10 test set creator. @@ -141,6 +158,10 @@ def test10(cycle=False): cycle=cycle) +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.Cifar10", + reason="Please use new dataset API which supports paddle.io.DataLoader") def fetch(): paddle.dataset.common.download(CIFAR10_URL, 'cifar', CIFAR10_MD5) paddle.dataset.common.download(CIFAR100_URL, 'cifar', CIFAR100_MD5) diff --git a/python/paddle/dataset/conll05.py b/python/paddle/dataset/conll05.py index 81a8cfc2e6abb..e7176626ca2d1 100644 --- a/python/paddle/dataset/conll05.py +++ b/python/paddle/dataset/conll05.py @@ -27,6 +27,7 @@ import itertools import paddle.dataset.common import paddle.compat as cpt +import paddle.utils.deprecated as deprecated from six.moves import zip, range __all__ = ['test, get_dict', 'get_embedding'] @@ -202,6 +203,10 @@ def reader(): return reader +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Conll05st", + reason="Please use new dataset API which supports paddle.io.DataLoader") def get_dict(): """ Get the word, verb and label dictionary of Wikipedia corpus. @@ -215,6 +220,10 @@ def get_dict(): return word_dict, verb_dict, label_dict +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Conll05st", + reason="Please use new dataset API which supports paddle.io.DataLoader") def get_embedding(): """ Get the trained word vector based on Wikipedia corpus. @@ -222,6 +231,10 @@ def get_embedding(): return paddle.dataset.common.download(EMB_URL, 'conll05st', EMB_MD5) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Conll05st", + reason="Please use new dataset API which supports paddle.io.DataLoader") def test(): """ Conll05 test set creator. 
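Every legacy reader touched by this commit receives the same decorator; isolated, the pattern is the sketch below (``my_legacy_reader`` is a placeholder, not a function from the patch):

    # Sketch of the deprecation pattern applied throughout this commit.
    # Calling the wrapped function emits a DeprecationWarning that names
    # the replacement dataset class given in update_to.
    import paddle.utils.deprecated as deprecated

    @deprecated(
        since="2.0.0",
        update_to="paddle.text.datasets.Conll05st",
        reason="Please use new dataset API which supports paddle.io.DataLoader")
    def my_legacy_reader():
        pass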
@@ -242,6 +255,10 @@ def test(): return reader_creator(reader, word_dict, verb_dict, label_dict) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Conll05st", + reason="Please use new dataset API which supports paddle.io.DataLoader") def fetch(): paddle.dataset.common.download(WORDDICT_URL, 'conll05st', WORDDICT_MD5) paddle.dataset.common.download(VERBDICT_URL, 'conll05st', VERBDICT_MD5) diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py index bb60c58211c23..22e0838b12b26 100644 --- a/python/paddle/dataset/flowers.py +++ b/python/paddle/dataset/flowers.py @@ -39,6 +39,7 @@ from paddle.dataset.image import * from paddle.reader import map_readers, xmap_readers from paddle import compat as cpt +import paddle.utils.deprecated as deprecated import os import numpy as np from multiprocessing import cpu_count @@ -143,6 +144,10 @@ def reader(): return map_readers(mapper, reader) +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.Flowers", + reason="Please use new dataset API which supports paddle.io.DataLoader") def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers training set reader. @@ -172,6 +177,10 @@ def train(mapper=train_mapper, buffered_size=1024, use_xmap=True, cycle=False): cycle=cycle) +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.Flowers", + reason="Please use new dataset API which supports paddle.io.DataLoader") def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False): ''' Create flowers test set reader. @@ -201,6 +210,10 @@ def test(mapper=test_mapper, buffered_size=1024, use_xmap=True, cycle=False): cycle=cycle) +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.Flowers", + reason="Please use new dataset API which supports paddle.io.DataLoader") def valid(mapper=test_mapper, buffered_size=1024, use_xmap=True): ''' Create flowers validation set reader. diff --git a/python/paddle/dataset/imdb.py b/python/paddle/dataset/imdb.py index d76a533a55a28..e5a3b6074c96d 100644 --- a/python/paddle/dataset/imdb.py +++ b/python/paddle/dataset/imdb.py @@ -23,6 +23,7 @@ from __future__ import print_function import paddle.dataset.common +import paddle.utils.deprecated as deprecated import collections import tarfile import re @@ -76,6 +77,10 @@ def build_dict(pattern, cutoff): return word_idx +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Imdb", + reason="Please use new dataset API which supports paddle.io.DataLoader") def reader_creator(pos_pattern, neg_pattern, word_idx): UNK = word_idx[''] INS = [] @@ -94,6 +99,10 @@ def reader(): return reader +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Imdb", + reason="Please use new dataset API which supports paddle.io.DataLoader") def train(word_idx): """ IMDB training set creator. @@ -111,6 +120,10 @@ def train(word_idx): re.compile("aclImdb/train/neg/.*\.txt$"), word_idx) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Imdb", + reason="Please use new dataset API which supports paddle.io.DataLoader") def test(word_idx): """ IMDB test set creator. @@ -128,6 +141,10 @@ def test(word_idx): re.compile("aclImdb/test/neg/.*\.txt$"), word_idx) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Imdb", + reason="Please use new dataset API which supports paddle.io.DataLoader") def word_dict(): """ Build a word dictionary from the corpus. 
@@ -139,5 +156,9 @@ def word_dict(): re.compile("aclImdb/((train)|(test))/((pos)|(neg))/.*\.txt$"), 150) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Imdb", + reason="Please use new dataset API which supports paddle.io.DataLoader") def fetch(): paddle.dataset.common.download(URL, 'imdb', MD5) diff --git a/python/paddle/dataset/imikolov.py b/python/paddle/dataset/imikolov.py index e1967d3db8c19..cc8e95fc342c2 100644 --- a/python/paddle/dataset/imikolov.py +++ b/python/paddle/dataset/imikolov.py @@ -22,6 +22,7 @@ from __future__ import print_function import paddle.dataset.common +import paddle.utils.deprecated as deprecated import collections import tarfile import six @@ -111,6 +112,10 @@ def reader(): return reader +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Imikolov", + reason="Please use new dataset API which supports paddle.io.DataLoader") def train(word_idx, n, data_type=DataType.NGRAM): """ imikolov training set creator. @@ -131,6 +136,10 @@ def train(word_idx, n, data_type=DataType.NGRAM): data_type) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Imikolov", + reason="Please use new dataset API which supports paddle.io.DataLoader") def test(word_idx, n, data_type=DataType.NGRAM): """ imikolov test set creator. @@ -151,5 +160,9 @@ def test(word_idx, n, data_type=DataType.NGRAM): data_type) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Imikolov", + reason="Please use new dataset API which supports paddle.io.DataLoader") def fetch(): paddle.dataset.common.download(URL, "imikolov", MD5) diff --git a/python/paddle/dataset/mnist.py b/python/paddle/dataset/mnist.py index f52ffa049bc4a..14e54d593bbe7 100644 --- a/python/paddle/dataset/mnist.py +++ b/python/paddle/dataset/mnist.py @@ -21,6 +21,7 @@ from __future__ import print_function import paddle.dataset.common +import paddle.utils.deprecated as deprecated import gzip import numpy import struct @@ -88,6 +89,10 @@ def reader(): return reader +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.MNIST", + reason="Please use new dataset API which supports paddle.io.DataLoader") def train(): """ MNIST training set creator. @@ -105,6 +110,10 @@ def train(): TRAIN_LABEL_MD5), 100) +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.MNIST", + reason="Please use new dataset API which supports paddle.io.DataLoader") def test(): """ MNIST test set creator. 
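For reference, a hedged sketch of the ``paddle.vision.datasets.MNIST`` replacement that these messages point to (the ``mode`` value and sample unpacking are assumptions, not part of the patch):

    # Sketch only: map-style dataset replacing the old mnist.test() reader.
    from paddle.vision.datasets import MNIST

    test_dataset = MNIST(mode='test')  # assumed mode value
    image, label = test_dataset[0]     # one sample instead of a reader tuple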
@@ -121,6 +130,10 @@ def test(): 100) +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.MNIST", + reason="Please use new dataset API which supports paddle.io.DataLoader") def fetch(): paddle.dataset.common.download(TRAIN_IMAGE_URL, 'mnist', TRAIN_IMAGE_MD5) paddle.dataset.common.download(TRAIN_LABEL_URL, 'mnist', TRAIN_LABEL_MD5) diff --git a/python/paddle/dataset/movielens.py b/python/paddle/dataset/movielens.py index 22ecfac953fde..f753f405bba1f 100644 --- a/python/paddle/dataset/movielens.py +++ b/python/paddle/dataset/movielens.py @@ -27,6 +27,7 @@ import numpy as np import zipfile import paddle.dataset.common +import paddle.utils.deprecated as deprecated import re import random import functools @@ -167,6 +168,10 @@ def __reader__(rand_seed=0, test_ratio=0.1, is_test=False): yield usr.value() + mov.value() + [[rating]] +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Movielens", + reason="Please use new dataset API which supports paddle.io.DataLoader") def __reader_creator__(**kwargs): return lambda: __reader__(**kwargs) @@ -175,6 +180,10 @@ def __reader_creator__(**kwargs): test = functools.partial(__reader_creator__, is_test=True) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Movielens", + reason="Please use new dataset API which supports paddle.io.DataLoader") def get_movie_title_dict(): """ Get movie title dictionary. @@ -190,6 +199,10 @@ def __max_index_info__(a, b): return b +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Movielens", + reason="Please use new dataset API which supports paddle.io.DataLoader") def max_movie_id(): """ Get the maximum value of movie id. @@ -198,6 +211,10 @@ def max_movie_id(): return six.moves.reduce(__max_index_info__, list(MOVIE_INFO.values())).index +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Movielens", + reason="Please use new dataset API which supports paddle.io.DataLoader") def max_user_id(): """ Get the maximum value of user id. @@ -213,6 +230,10 @@ def __max_job_id_impl__(a, b): return b +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Movielens", + reason="Please use new dataset API which supports paddle.io.DataLoader") def max_job_id(): """ Get the maximum value of job id. @@ -222,6 +243,10 @@ def max_job_id(): list(USER_INFO.values())).job_id +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Movielens", + reason="Please use new dataset API which supports paddle.io.DataLoader") def movie_categories(): """ Get movie categories dictionary. @@ -230,6 +255,10 @@ def movie_categories(): return CATEGORIES_DICT +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Movielens", + reason="Please use new dataset API which supports paddle.io.DataLoader") def user_info(): """ Get user info dictionary. @@ -238,6 +267,10 @@ def user_info(): return USER_INFO +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Movielens", + reason="Please use new dataset API which supports paddle.io.DataLoader") def movie_info(): """ Get movie info dictionary. 
@@ -255,6 +288,10 @@ def unittest(): print(train_count, test_count) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.Movielens", + reason="Please use new dataset API which supports paddle.io.DataLoader") def fetch(): paddle.dataset.common.download(URL, "movielens", MD5) diff --git a/python/paddle/dataset/uci_housing.py b/python/paddle/dataset/uci_housing.py index f7930d34f93e2..daed62fbefba1 100644 --- a/python/paddle/dataset/uci_housing.py +++ b/python/paddle/dataset/uci_housing.py @@ -27,6 +27,7 @@ import tarfile import os import paddle.dataset.common +import paddle.utils.deprecated as deprecated __all__ = ['train', 'test'] @@ -83,6 +84,10 @@ def load_data(filename, feature_num=14, ratio=0.8): UCI_TEST_DATA = data[offset:] +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.UCIHousing", + reason="Please use new dataset API which supports paddle.io.DataLoader") def train(): """ UCI_HOUSING training set creator. @@ -103,6 +108,10 @@ def reader(): return reader +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.UCIHousing", + reason="Please use new dataset API which supports paddle.io.DataLoader") def test(): """ UCI_HOUSING test set creator. @@ -134,6 +143,10 @@ def fluid_model(): return dirpath +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.UCIHousing", + reason="Please use new dataset API which supports paddle.io.DataLoader") def predict_reader(): """ It returns just one tuple data to do inference. @@ -146,5 +159,9 @@ def predict_reader(): return (UCI_TEST_DATA[0][:-1], ) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.UCIHousing", + reason="Please use new dataset API which supports paddle.io.DataLoader") def fetch(): paddle.dataset.common.download(URL, 'uci_housing', MD5) diff --git a/python/paddle/dataset/voc2012.py b/python/paddle/dataset/voc2012.py index 50688937654ae..5a0ff76aab4fe 100644 --- a/python/paddle/dataset/voc2012.py +++ b/python/paddle/dataset/voc2012.py @@ -26,6 +26,7 @@ import numpy as np from paddle.dataset.common import download from paddle.dataset.image import * +import paddle.utils.deprecated as deprecated from PIL import Image __all__ = ['train', 'test', 'val'] @@ -66,6 +67,10 @@ def reader(): return reader +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.VOC2012", + reason="Please use new dataset API which supports paddle.io.DataLoader") def train(): """ Create a train dataset reader containing 2913 images in HWC order. @@ -73,6 +78,10 @@ def train(): return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'trainval') +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.VOC2012", + reason="Please use new dataset API which supports paddle.io.DataLoader") def test(): """ Create a test dataset reader containing 1464 images in HWC order. @@ -80,6 +89,10 @@ def test(): return reader_creator(download(VOC_URL, CACHE_DIR, VOC_MD5), 'train') +@deprecated( + since="2.0.0", + update_to="paddle.vision.datasets.VOC2012", + reason="Please use new dataset API which supports paddle.io.DataLoader") def val(): """ Create a val dataset reader containing 1449 images in HWC order. 
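A hedged sketch of the ``paddle.io.DataLoader`` workflow that the deprecation messages in this commit recommend (constructor and loader arguments below are assumptions for illustration):

    # Sketch only: batch the new-style dataset with paddle.io.DataLoader,
    # which is the stated reason for deprecating these reader functions.
    import paddle
    from paddle.vision.datasets import VOC2012

    train_dataset = VOC2012(mode='train')  # assumed mode value
    loader = paddle.io.DataLoader(train_dataset, batch_size=8, shuffle=True)
    for image, label in loader:
        break  # each iteration yields a batched image/label pair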
diff --git a/python/paddle/dataset/wmt14.py b/python/paddle/dataset/wmt14.py index 129e1129fb9f6..3bd5e8d5bad46 100644 --- a/python/paddle/dataset/wmt14.py +++ b/python/paddle/dataset/wmt14.py @@ -28,6 +28,7 @@ import paddle.dataset.common import paddle.compat as cpt +import paddle.utils.deprecated as deprecated __all__ = [ 'train', @@ -114,6 +115,10 @@ def reader(): return reader +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.WMT14", + reason="Please use new dataset API which supports paddle.io.DataLoader") def train(dict_size): """ WMT14 training set creator. @@ -130,6 +135,10 @@ def train(dict_size): 'train/train', dict_size) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.WMT14", + reason="Please use new dataset API which supports paddle.io.DataLoader") def test(dict_size): """ WMT14 test set creator. @@ -146,12 +155,20 @@ def test(dict_size): 'test/test', dict_size) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.WMT14", + reason="Please use new dataset API which supports paddle.io.DataLoader") def gen(dict_size): return reader_creator( paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN), 'gen/gen', dict_size) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.WMT14", + reason="Please use new dataset API which supports paddle.io.DataLoader") def get_dict(dict_size, reverse=True): # if reverse = False, return dict = {'a':'001', 'b':'002', ...} # else reverse = true, return dict = {'001':'a', '002':'b', ...} @@ -163,6 +180,10 @@ def get_dict(dict_size, reverse=True): return src_dict, trg_dict +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.WMT14", + reason="Please use new dataset API which supports paddle.io.DataLoader") def fetch(): paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN) paddle.dataset.common.download(URL_MODEL, 'wmt14', MD5_MODEL) diff --git a/python/paddle/dataset/wmt16.py b/python/paddle/dataset/wmt16.py index 251e305104edc..7f11bc4b1f013 100644 --- a/python/paddle/dataset/wmt16.py +++ b/python/paddle/dataset/wmt16.py @@ -38,6 +38,7 @@ import paddle import paddle.compat as cpt +import paddle.utils.deprecated as deprecated __all__ = [ "train", @@ -144,6 +145,10 @@ def reader(): return reader +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.WMT16", + reason="Please use new dataset API which supports paddle.io.DataLoader") def train(src_dict_size, trg_dict_size, src_lang="en"): """ WMT16 train set reader. @@ -193,6 +198,10 @@ def train(src_dict_size, trg_dict_size, src_lang="en"): src_lang=src_lang) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.WMT16", + reason="Please use new dataset API which supports paddle.io.DataLoader") def test(src_dict_size, trg_dict_size, src_lang="en"): """ WMT16 test set reader. @@ -242,6 +251,10 @@ def test(src_dict_size, trg_dict_size, src_lang="en"): src_lang=src_lang) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.WMT16", + reason="Please use new dataset API which supports paddle.io.DataLoader") def validation(src_dict_size, trg_dict_size, src_lang="en"): """ WMT16 validation set reader. @@ -289,6 +302,10 @@ def validation(src_dict_size, trg_dict_size, src_lang="en"): src_lang=src_lang) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.WMT16", + reason="Please use new dataset API which supports paddle.io.DataLoader") def get_dict(lang, dict_size, reverse=False): """ return the word dictionary for the specified language. 
@@ -319,6 +336,10 @@ def get_dict(lang, dict_size, reverse=False): return __load_dict(tar_file, dict_size, lang, reverse) +@deprecated( + since="2.0.0", + update_to="paddle.text.datasets.WMT16", + reason="Please use new dataset API which supports paddle.io.DataLoader") def fetch(): """download the entire dataset. """ From 621b31c5260e71d3985a63ca30a056169b9e1d0d Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Wed, 11 Nov 2020 15:06:37 +0800 Subject: [PATCH 167/185] modified timeout value for test_resnet_v2 and test_resnet (#28532) --- .../fluid/tests/unittests/dygraph_to_static/CMakeLists.txt | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt index 56bcd6d7b5289..b6acf5884737a 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/CMakeLists.txt @@ -14,9 +14,12 @@ set_tests_properties(test_cycle_gan PROPERTIES TIMEOUT 120) set_tests_properties(test_lac PROPERTIES TIMEOUT 120) set_tests_properties(test_bert PROPERTIES TIMEOUT 120) set_tests_properties(test_basic_api_transformation PROPERTIES TIMEOUT 120) -set_tests_properties(test_resnet PROPERTIES TIMEOUT 120) set_tests_properties(test_reinforcement_learning PROPERTIES TIMEOUT 120) set_tests_properties(test_transformer PROPERTIES TIMEOUT 200) set_tests_properties(test_mnist PROPERTIES TIMEOUT 120) set_tests_properties(test_bmn PROPERTIES TIMEOUT 120) -set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) + +if(NOT WIN32) + set_tests_properties(test_resnet_v2 PROPERTIES TIMEOUT 120) + set_tests_properties(test_resnet PROPERTIES TIMEOUT 120) +endif() From c52fe48f6ffd62cbdf707a93b54c3f3df5547a06 Mon Sep 17 00:00:00 2001 From: wangchaochaohu Date: Wed, 11 Nov 2020 15:49:39 +0800 Subject: [PATCH 168/185] fix the GetKernelTypeForVar of input for fluid.gather (#28534) --- paddle/fluid/operators/gather_op.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index 34fd11e8c0d0d..648afe7e8215f 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -69,7 +69,11 @@ class GatherOp : public framework::OperatorWithKernel { framework::OpKernelType GetKernelTypeForVar( const std::string& var_name, const framework::Tensor& tensor, const framework::OpKernelType& expected_kernel_type) const override { - return expected_kernel_type; + if (var_name == "Axis") { + return expected_kernel_type; + } + return framework::OpKernelType(expected_kernel_type.data_type_, + tensor.place(), tensor.layout()); } }; From 26d292b1087ed32edebc179d60b82e96df11895a Mon Sep 17 00:00:00 2001 From: furnace <34057289+windstamp@users.noreply.github.com> Date: Wed, 11 Nov 2020 18:51:49 +0800 Subject: [PATCH 169/185] bugfix for api (mv, empty, empty_like op) (#28513) --- python/paddle/tensor/creation.py | 2 -- python/paddle/tensor/linalg.py | 2 -- 2 files changed, 4 deletions(-) diff --git a/python/paddle/tensor/creation.py b/python/paddle/tensor/creation.py index 7b62ae9102d22..a69bc64c4cf66 100644 --- a/python/paddle/tensor/creation.py +++ b/python/paddle/tensor/creation.py @@ -952,7 +952,6 @@ def empty(shape, dtype=None, name=None): import paddle import numpy as np - paddle.disable_static() # Now we are in imperative mode paddle.set_device("cpu") # and use cpu device # example 1: 
argument ``shape`` is a list which doesn't contain Tensor. @@ -1036,7 +1035,6 @@ def empty_like(x, dtype=None, name=None): import paddle import numpy as np - paddle.disable_static() # Now we are in imperative mode paddle.set_device("cpu") # and use cpu device x = paddle.randn([2, 3], 'float32') diff --git a/python/paddle/tensor/linalg.py b/python/paddle/tensor/linalg.py index 2745464995f5d..e46a26bf45ba6 100644 --- a/python/paddle/tensor/linalg.py +++ b/python/paddle/tensor/linalg.py @@ -921,13 +921,11 @@ def mv(x, vec, name=None): import numpy as np import paddle - paddle.disable_static() x_data = np.array([[2, 1, 3], [3, 0, 1]]).astype("float64") x = paddle.to_tensor(x_data) vec_data = np.array([3, 5, 1]) vec = paddle.to_tensor(vec_data).astype("float64") out = paddle.mv(x, vec) - paddle.enable_static() """ if in_dygraph_mode(): out = core.ops.mv(x, vec) From 1bf4836580951b6fd50495339a7a75b77bf539f6 Mon Sep 17 00:00:00 2001 From: Wilber Date: Wed, 11 Nov 2020 04:56:42 -0600 Subject: [PATCH 170/185] [Inference] Add TryShrinkMemory interface. (#28409) --- .../fluid/inference/api/analysis_predictor.cc | 15 +++++- .../fluid/inference/api/analysis_predictor.h | 11 +++++ .../api/analysis_predictor_tester.cc | 46 ++++++++++++++++++- paddle/fluid/inference/api/api_tester.cc | 1 + paddle/fluid/inference/api/paddle_api.h | 11 +++++ .../inference/api/paddle_inference_api.h | 11 +++++ paddle/fluid/pybind/inference_api.cc | 2 + 7 files changed, 93 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index ccfb6dfa17ab4..20bea8e568e46 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -175,7 +175,10 @@ bool AnalysisPredictor::PrepareScope( status_is_cloned_ = true; } else { paddle::framework::InitDevices(false); - scope_.reset(new paddle::framework::Scope()); + scope_.reset(new paddle::framework::Scope(), [&](framework::Scope *scope) { + delete scope; + memory::Release(place_); + }); status_is_cloned_ = false; } sub_scope_ = &scope_->NewScope(); @@ -591,7 +594,6 @@ std::unique_ptr CreatePaddlePredictor< gflags.push_back("--allocator_strategy=thread_local"); process_level_allocator_enabled = false; } else { - gflags.push_back("--allocator_strategy=naive_best_fit"); process_level_allocator_enabled = true; } @@ -890,6 +892,11 @@ bool AnalysisPredictor::LoadParameters() { return true; } +uint64_t AnalysisPredictor::TryShrinkMemory() { + ClearIntermediateTensor(); + return paddle::memory::Release(place_); +} + void AnalysisPredictor::ClearIntermediateTensor() { PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), platform::errors::PreconditionNotMet( @@ -985,6 +992,8 @@ AnalysisPredictor::~AnalysisPredictor() { mkldnn_quantizer_ = nullptr; } #endif + + memory::Release(place_); } std::unique_ptr AnalysisPredictor::Clone() { @@ -1142,6 +1151,8 @@ void Predictor::ClearIntermediateTensor() { predictor_->ClearIntermediateTensor(); } +uint64_t Predictor::TryShrinkMemory() { return predictor_->TryShrinkMemory(); } + int GetNumBytesOfDataType(DataType dtype) { switch (dtype) { case DataType::FLOAT32: diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 269f2fd80bb47..35b52fa56d63a 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -193,6 +193,17 @@ class AnalysisPredictor : public PaddlePredictor { /// void 
ClearIntermediateTensor(); + /// + /// \brief Release all tmp tensor to compress the size of the memory pool. + /// The memory pool is considered to be composed of a list of chunks, if + /// the chunk is not occupied, it can be released. + /// + /// \return Number of bytes released. It may be smaller than the actual + /// released memory, because part of the memory is not managed by the + /// MemoryPool. + /// + uint64_t TryShrinkMemory() override; + /// /// \brief Get the argument used by predictor /// diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index 5766919f08e68..67c9b441e2619 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -135,6 +135,7 @@ TEST(AnalysisPredictor, ZeroCopy) { auto* out_data = out->data(&place, &size); LOG(INFO) << "output size: " << size / sizeof(float); LOG(INFO) << "output_data: " << out_data; + predictor->TryShrinkMemory(); } TEST(AnalysisPredictor, Clone) { @@ -253,8 +254,7 @@ class MkldnnQuantizerTest : public testing::Test { public: MkldnnQuantizerTest() { AnalysisConfig config(FLAGS_dirname); - - predictor.reset(new AnalysisPredictor(config)); + predictor = std::move(CreatePaddlePredictor(config)); auto* predictor_p = static_cast(predictor.get()); auto qconfig = new MkldnnQuantizerConfig(); @@ -507,3 +507,45 @@ TEST(AnalysisPredictor, bf16_pass_strategy) { } } // namespace paddle + +namespace paddle_infer { + +TEST(Predictor, Run) { + Config config; + config.SetModel(FLAGS_dirname); + + auto predictor = CreatePredictor(config); + + auto w0 = predictor->GetInputHandle("firstw"); + auto w1 = predictor->GetInputHandle("secondw"); + auto w2 = predictor->GetInputHandle("thirdw"); + auto w3 = predictor->GetInputHandle("forthw"); + + w0->Reshape({4, 1}); + w1->Reshape({4, 1}); + w2->Reshape({4, 1}); + w3->Reshape({4, 1}); + + auto* w0_data = w0->mutable_data(PlaceType::kCPU); + auto* w1_data = w1->mutable_data(PlaceType::kCPU); + auto* w2_data = w2->mutable_data(PlaceType::kCPU); + auto* w3_data = w3->mutable_data(PlaceType::kCPU); + + for (int i = 0; i < 4; i++) { + w0_data[i] = i; + w1_data[i] = i; + w2_data[i] = i; + w3_data[i] = i; + } + + predictor->Run(); + + auto out = predictor->GetOutputHandle("fc_1.tmp_2"); + PlaceType place; + int size = 0; + out->data(&place, &size); + LOG(INFO) << "output size: " << size / sizeof(float); + predictor->TryShrinkMemory(); +} + +} // namespace paddle_infer diff --git a/paddle/fluid/inference/api/api_tester.cc b/paddle/fluid/inference/api/api_tester.cc index 988ffc47292b5..0c717f0fae03c 100644 --- a/paddle/fluid/inference/api/api_tester.cc +++ b/paddle/fluid/inference/api/api_tester.cc @@ -60,6 +60,7 @@ TEST(paddle_inference_api, demo) { auto predictor = CreatePaddlePredictor(config); std::vector outputs; predictor->Run({}, &outputs); + predictor->TryShrinkMemory(); } TEST(paddle_inference_api, get_version) { diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 064f63542683a..9fd198fb5a473 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -319,6 +319,17 @@ class PD_INFER_DECL PaddlePredictor { /// virtual void ClearIntermediateTensor() {} + /// + /// \brief Release all tmp tensor to compress the size of the memory pool. + /// The memory pool is considered to be composed of a list of chunks, if + /// the chunk is not occupied, it can be released. 
+ /// + /// \return Number of bytes released. It may be smaller than the actual + /// released memory, because part of the memory is not managed by the + /// MemoryPool. + /// + virtual uint64_t TryShrinkMemory() { return 0; } + /// \brief Clone an existing predictor /// When using clone, the same network will be created, /// and the parameters between them are shared. diff --git a/paddle/fluid/inference/api/paddle_inference_api.h b/paddle/fluid/inference/api/paddle_inference_api.h index 5dc4430fde471..2e1e3b822d164 100644 --- a/paddle/fluid/inference/api/paddle_inference_api.h +++ b/paddle/fluid/inference/api/paddle_inference_api.h @@ -224,6 +224,17 @@ class PD_INFER_DECL Predictor { /// \brief Clear the intermediate tensors of the predictor void ClearIntermediateTensor(); + /// + /// \brief Release all tmp tensor to compress the size of the memory pool. + /// The memory pool is considered to be composed of a list of chunks, if + /// the chunk is not occupied, it can be released. + /// + /// \return Number of bytes released. It may be smaller than the actual + /// released memory, because part of the memory is not managed by the + /// MemoryPool. + /// + uint64_t TryShrinkMemory(); + private: std::unique_ptr predictor_; }; diff --git a/paddle/fluid/pybind/inference_api.cc b/paddle/fluid/pybind/inference_api.cc index a0cb096193fcd..7f3fe410464ed 100644 --- a/paddle/fluid/pybind/inference_api.cc +++ b/paddle/fluid/pybind/inference_api.cc @@ -566,6 +566,7 @@ void BindAnalysisPredictor(py::module *m) { .def("zero_copy_run", &AnalysisPredictor::ZeroCopyRun) .def("clear_intermediate_tensor", &AnalysisPredictor::ClearIntermediateTensor) + .def("try_shrink_memory", &AnalysisPredictor::TryShrinkMemory) .def("create_feed_fetch_var", &AnalysisPredictor::CreateFeedFetchVar) .def("prepare_feed_fetch", &AnalysisPredictor::PrepareFeedFetch) .def("prepare_argument", &AnalysisPredictor::PrepareArgument) @@ -593,6 +594,7 @@ void BindPaddleInferPredictor(py::module *m) { .def("get_output_handle", &paddle_infer::Predictor::GetOutputHandle) .def("run", &paddle_infer::Predictor::Run) .def("clone", &paddle_infer::Predictor::Clone) + .def("try_shrink_memory", &paddle_infer::Predictor::TryShrinkMemory) .def("clear_intermediate_tensor", &paddle_infer::Predictor::ClearIntermediateTensor); } From 543ff333cdf1434fa8ba77ed84f88b6db7c75b5b Mon Sep 17 00:00:00 2001 From: Leo Chen Date: Wed, 11 Nov 2020 20:25:10 +0800 Subject: [PATCH 171/185] Refine the format of printing tensor 3 (support scaler tensor) (#28544) --- python/paddle/fluid/tests/unittests/test_var_base.py | 11 +++++++++++ python/paddle/tensor/to_string.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 41aef68db624d..511813fc1cd0f 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -466,6 +466,17 @@ def test_tensor_str3(self): self.assertEqual(a_str, expected) paddle.enable_static() + def test_tensor_str_scaler(self): + paddle.disable_static(paddle.CPUPlace()) + a = paddle.to_tensor(np.array(False)) + a_str = str(a) + + expected = '''Tensor(shape=[], dtype=bool, place=CPUPlace, stop_gradient=True, + False)''' + + self.assertEqual(a_str, expected) + paddle.enable_static() + class TestVarBaseSetitem(unittest.TestCase): def setUp(self): diff --git a/python/paddle/tensor/to_string.py b/python/paddle/tensor/to_string.py index bd956b923a663..778a391df605e 
100644 --- a/python/paddle/tensor/to_string.py +++ b/python/paddle/tensor/to_string.py @@ -153,7 +153,7 @@ def _format_tensor(var, sumary, indent=0, max_width=0, signed=False): if len(var.shape) == 0: # currently, shape = [], i.e., scaler tensor is not supported. # If it is supported, it should be formatted like this. - return _format_item(var.item(0), max_width, signed) + return _format_item(var, max_width, signed) elif len(var.shape) == 1: if sumary and var.shape[0] > 2 * edgeitems: items = [ From b258caf467019dc3e145e5ee1fb989cc8b664353 Mon Sep 17 00:00:00 2001 From: Steffy-zxf <48793257+Steffy-zxf@users.noreply.github.com> Date: Thu, 12 Nov 2020 10:07:20 +0800 Subject: [PATCH 172/185] fix add_n doc (eng) (#28464) fix the add_n english doc --- python/paddle/tensor/math.py | 66 +++++++++++++++++++----------------- 1 file changed, 35 insertions(+), 31 deletions(-) diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 56933cf73ef98..33f9158d438dd 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -717,37 +717,41 @@ def sum(x, axis=None, dtype=None, keepdim=False, name=None): @templatedoc(op_type="sum") def add_n(inputs, name=None): """ - ${comment} - - Case 1: - :: - Input: - Input. Shape = [2, 3] - Input = [[1, 2, 3], - [4, 5, 6]] - - Output: - The output. Shape = [2, 3] - Output = [[1, 2, 3], - [4, 5, 6]] - - Case 2: - :: - Input: - First input: - Input1. Shape = [2, 3] - Input1 = [[1, 2, 3], - [4, 5, 6]] - - The second input: - Input2. Shape = [2, 3] - Input2 = [[7, 8, 9], - [10, 11, 12]] - - Output: - The output. Shape = [2, 3] - Output = [[8, 10, 12], - [14, 16, 18]] + This OP is used to sum one or more Tensor of the input. + + For example: + + .. code-block:: text + + Case 1: + + Input: + input.shape = [2, 3] + input = [[1, 2, 3], + [4, 5, 6]] + + Output: + output.shape = [2, 3] + output = [[1, 2, 3], + [4, 5, 6]] + + Case 2: + + Input: + First input: + input1.shape = [2, 3] + Input1 = [[1, 2, 3], + [4, 5, 6]] + + The second input: + input2.shape = [2, 3] + input2 = [[7, 8, 9], + [10, 11, 12]] + + Output: + output.shape = [2, 3] + output = [[8, 10, 12], + [14, 16, 18]] Args: inputs (Tensor|list(Tensor)): A Tensor list. The shape and data type of the list elements should be consistent. 
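To illustrate the add_n semantics documented in patch 172 above, here is a minimal editorial sketch (not part of the patch series; assumes Paddle 2.0 dynamic-graph mode). The values mirror the two cases in the new docstring text:

    import paddle

    x = paddle.to_tensor([[1., 2., 3.], [4., 5., 6.]])
    y = paddle.to_tensor([[7., 8., 9.], [10., 11., 12.]])

    # Case 1: a single Tensor is accepted and passes through unchanged.
    print(paddle.add_n(x))       # [[1., 2., 3.], [4., 5., 6.]]

    # Case 2: a list of same-shape Tensors is summed element-wise.
    print(paddle.add_n([x, y]))  # [[8., 10., 12.], [14., 16., 18.]]
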
From 0fc181dbd06bbec8b8f5e0a99fe26d505af778bb Mon Sep 17 00:00:00 2001 From: lidanqing Date: Thu, 12 Nov 2020 03:38:16 +0100 Subject: [PATCH 173/185] [Fix bug] If the pass name is not found, IsCompatible should return false (#28475) --- .../fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc | 2 +- paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc | 1 + paddle/fluid/framework/op_version_registry.h | 2 +- paddle/fluid/framework/op_version_registry_test.cc | 4 ++++ ...mkldnn_fuse_pass.py => test_mkldnn_conv_bias_fuse_pass.py} | 0 5 files changed, 7 insertions(+), 2 deletions(-) rename python/paddle/fluid/tests/unittests/ir/inference/{test_conv_bias_mkldnn_fuse_pass.py => test_mkldnn_conv_bias_fuse_pass.py} (100%) diff --git a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc index 76c6ca24aaaf0..716c49dcb12d9 100644 --- a/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/conv_bias_mkldnn_fuse_pass.cc @@ -158,7 +158,7 @@ REGISTER_PASS(conv_transpose_bias_mkldnn_fuse_pass, REGISTER_PASS_CAPABILITY(conv_transpose_bias_mkldnn_fuse_pass) .AddCombination( paddle::framework::compatible::OpVersionComparatorCombination() - .EQ("conv2d_transpose", 0) + .LE("conv2d_transpose", 1) .EQ("elementwise_add", 0)); REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, diff --git a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc index 96f88e70a98d4..895c396e1e614 100644 --- a/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc +++ b/paddle/fluid/framework/ir/quant_conv2d_dequant_fuse_pass.cc @@ -326,6 +326,7 @@ void QuantDequantFusePass::ApplyImpl(ir::Graph* graph) const { REGISTER_PASS(quant_conv2d_dequant_fuse_pass, paddle::framework::ir::QuantDequantFusePass); +REGISTER_PASS_CAPABILITY(quant_conv2d_dequant_fuse_pass); REGISTER_PASS_CAPABILITY(tensorrt_subgraph_pass) .AddCombination( diff --git a/paddle/fluid/framework/op_version_registry.h b/paddle/fluid/framework/op_version_registry.h index c9d3084724bcd..c121e6429dbb4 100644 --- a/paddle/fluid/framework/op_version_registry.h +++ b/paddle/fluid/framework/op_version_registry.h @@ -308,7 +308,7 @@ class PassVersionCheckerRegistrar { bool IsPassCompatible(const std::string& fuse_pass_name) const { auto iter = pass_version_checkers_map_.find(fuse_pass_name); if (iter == pass_version_checkers_map_.end()) { - return true; + return false; } return iter->second.IsPassCompatible(); } diff --git a/paddle/fluid/framework/op_version_registry_test.cc b/paddle/fluid/framework/op_version_registry_test.cc index ef8384c1e7ee1..888dd6de0618b 100644 --- a/paddle/fluid/framework/op_version_registry_test.cc +++ b/paddle/fluid/framework/op_version_registry_test.cc @@ -57,6 +57,10 @@ TEST(test_operator_version, test_operator_version) { TEST(test_pass_op_version_checker, test_pass_op_version_checker) { const std::string fake_op_name{"op_name__"}; + ASSERT_FALSE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( + "no_registered_capability_pass")); + + REGISTER_PASS_CAPABILITY(no_bind_pass); ASSERT_TRUE(PassVersionCheckerRegistrar::GetInstance().IsPassCompatible( "no_bind_pass")); diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py similarity index 100% rename from 
python/paddle/fluid/tests/unittests/ir/inference/test_conv_bias_mkldnn_fuse_pass.py rename to python/paddle/fluid/tests/unittests/ir/inference/test_mkldnn_conv_bias_fuse_pass.py From 08d24131429c65c74d85591ae310a0a2a7b6d975 Mon Sep 17 00:00:00 2001 From: joejiong Date: Thu, 12 Nov 2020 14:33:01 +0800 Subject: [PATCH 174/185] add log2 operator (#28319) As the title --- paddle/fluid/operators/activation_op.cc | 10 ++ paddle/fluid/operators/activation_op.h | 22 +++ python/paddle/__init__.py | 1 + .../tests/unittests/test_activation_op.py | 155 ++++++++++++++---- python/paddle/tensor/__init__.py | 1 + python/paddle/tensor/math.py | 49 ++++++ 6 files changed, 203 insertions(+), 35 deletions(-) mode change 100644 => 100755 paddle/fluid/operators/activation_op.cc mode change 100644 => 100755 paddle/fluid/operators/activation_op.h diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc old mode 100644 new mode 100755 index a640a6c745ccb..a541831f79a1c --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -301,6 +301,15 @@ Natural logarithm of x. )DOC"; +UNUSED constexpr char Log2Doc[] = R"DOC( +Log2 Activation Operator. + +$$out = \log_2x$$ + +logarithm of x base to 2. + +)DOC"; + UNUSED constexpr char Log1pDoc[] = R"DOC( Log Activation Operator. @@ -697,6 +706,7 @@ REGISTER_ACTIVATION_OP_MAKER(Cosh, CoshDoc); REGISTER_ACTIVATION_OP_MAKER(Round, RoundDoc); REGISTER_ACTIVATION_OP_MAKER(Reciprocal, ReciprocalDoc); REGISTER_ACTIVATION_OP_MAKER(Log, LogDoc); +REGISTER_ACTIVATION_OP_MAKER(Log2, Log2Doc); REGISTER_ACTIVATION_OP_MAKER(Log1p, Log1pDoc); REGISTER_ACTIVATION_OP_MAKER(Square, SquareDoc); REGISTER_ACTIVATION_OP_MAKER(Softsign, SoftsignDoc); diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h old mode 100644 new mode 100755 index a5c613297a473..0892eca35c3b4 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -820,6 +820,27 @@ struct LogGradFunctor : public BaseActivationFunctor { static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } }; +// log2(x) = logarithm to the base 2 of the elements of x +template +struct Log2Functor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out) const { + out.device(d) = x.log() / static_cast(log(2)); + } +}; + +// the gradient of log2(x) is 1/(x*ln(2)) +template +struct Log2GradFunctor : public BaseActivationFunctor { + template + void operator()(Device d, X x, Out out, dOut dout, dX dx) const { + dx.device(d) = dout * static_cast(1) / (x * static_cast(log(2))); + } + + static constexpr ActBwdOpFwdDeps FwdDeps() { return kDepX; } +}; + // log1p(x) = natural logarithm of x+1 template struct Log1pFunctor : public BaseActivationFunctor { @@ -1908,6 +1929,7 @@ struct LogGradGradFunctor : public BaseActivationFunctor { __macro(round, Round, RoundFunctor, ZeroGradFunctor); \ __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \ __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor); \ + __macro(log2, Log2, Log2Functor, Log2GradFunctor); \ __macro(brelu, BRelu, BReluFunctor, BReluGradFunctor); \ __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor); \ __macro(stanh, STanh, STanhFunctor, STanhGradFunctor); \ diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 400dbc85d68c3..40fff86fbf65f 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -151,6 +151,7 @@ from .tensor.math import floor 
#DEFINE_ALIAS from .tensor.math import increment #DEFINE_ALIAS from .tensor.math import log #DEFINE_ALIAS +from .tensor.math import log2 #DEFINE_ALIAS from .tensor.math import multiplex #DEFINE_ALIAS from .tensor.math import pow #DEFINE_ALIAS from .tensor.math import reciprocal #DEFINE_ALIAS diff --git a/python/paddle/fluid/tests/unittests/test_activation_op.py b/python/paddle/fluid/tests/unittests/test_activation_op.py index 8d9056f0ee37e..53e4bbc4bf284 100755 --- a/python/paddle/fluid/tests/unittests/test_activation_op.py +++ b/python/paddle/fluid/tests/unittests/test_activation_op.py @@ -180,10 +180,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.log_sigmoid, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[11, 17], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[11, 17], dtype='int32') self.assertRaises(TypeError, F.log_sigmoid, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[11, 17], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[11, 17], dtype='float16') F.log_sigmoid(x_fp16) @@ -260,10 +262,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.tanh, 1) # The input dtype must be float16, float32. - x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.tanh, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.tanh(x_fp16) @@ -519,10 +523,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.tanhshrink, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.tanhshrink, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.tanhshrink(x_fp16) @@ -616,10 +622,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.hardshrink, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.hardshrink, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.hardshrink(x_fp16) @@ -676,10 +684,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.hardtanh, 1) # The input dtype must be float16, float32, float64. 
- x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.hardtanh, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.hardtanh(x_fp16) @@ -759,13 +769,16 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.softshrink, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.softshrink, x_int32) # The threshold must be no less than zero - x_fp32 = paddle.fluid.data(name='x_fp32', shape=[12, 10], dtype='float32') + x_fp32 = paddle.fluid.data( + name='x_fp32', shape=[12, 10], dtype='float32') self.assertRaises(ValueError, F.softshrink, x_fp32, -1.0) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.softshrink(x_fp16) @@ -1010,10 +1023,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.relu, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[10, 12], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[10, 12], dtype='int32') self.assertRaises(TypeError, F.relu, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[10, 12], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[10, 12], dtype='float16') F.relu(x_fp16) @@ -1119,10 +1134,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.leaky_relu, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.leaky_relu, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.leaky_relu(x_fp16) @@ -1218,10 +1235,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.gelu, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[11, 17], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[11, 17], dtype='int32') self.assertRaises(TypeError, F.gelu, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[11, 17], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[11, 17], dtype='float16') F.gelu(x_fp16) @@ -1368,10 +1387,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.relu6, 1) # The input dtype must be float16, float32, float64. 
- x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.relu6, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.relu6(x_fp16) @@ -1455,10 +1476,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.hardswish, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.hardswish, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.hardswish(x_fp16) @@ -1572,10 +1595,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.elu, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[10, 12], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[10, 12], dtype='int32') self.assertRaises(TypeError, F.elu, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[10, 12], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[10, 12], dtype='float16') F.elu(x_fp16) @@ -1624,6 +1649,55 @@ def test_error(self): self.assertRaises(TypeError, fluid.layers.log, in2) +class TestLog2(TestActivation): + def setUp(self): + self.op_type = "log2" + self.init_dtype() + + x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype) + out = np.log2(x) + + self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(x)} + self.outputs = {'Out': out} + + def test_check_grad(self): + if self.dtype == np.float16: + return + self.check_grad(['X'], 'Out') + + def test_error(self): + in1 = paddle.static.data(name="in1", shape=[11, 17], dtype="int32") + in2 = paddle.static.data(name="in2", shape=[11, 17], dtype="int64") + + self.assertRaises(TypeError, paddle.log2, in1) + self.assertRaises(TypeError, paddle.log2, in2) + + def test_api(self): + with paddle.static.program_guard(paddle.static.Program(), + paddle.static.Program()): + input_x = np.random.uniform(0.1, 1, [11, 17]).astype("float64") + data_x = paddle.static.data( + name="data_x", shape=[11, 17], dtype="float64") + + out1 = paddle.log2(data_x) + exe = paddle.static.Executor(place=fluid.CPUPlace()) + exe.run(paddle.static.default_startup_program()) + res1 = exe.run(paddle.static.default_main_program(), + feed={"data_x": input_x}, + fetch_list=[out1]) + expected_res = np.log2(input_x) + self.assertTrue(np.allclose(res1, expected_res)) + + # dygraph + with fluid.dygraph.guard(): + np_x = np.random.uniform(0.1, 1, [11, 17]).astype("float64") + data_x = paddle.to_tensor(np_x) + z = paddle.log2(data_x) + np_z = z.numpy() + z_expected = np.array(np.log2(np_x)) + self.assertTrue(np.allclose(np_z, z_expected)) + + class TestLog1p(TestActivation): def setUp(self): self.op_type = "log1p" @@ -1895,10 +1969,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.softplus, 1) # The input dtype must be float16, float32, float64. 
- x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.softplus, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.softplus(x_fp16) @@ -1972,10 +2048,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.softsign, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.softsign, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.softsign(x_fp16) @@ -2055,10 +2133,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.thresholded_relu, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.thresholded_relu, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.thresholded_relu(x_fp16) @@ -2154,10 +2234,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.hardsigmoid, 1) # The input dtype must be float16, float32, float64. - x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.hardsigmoid, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.hardsigmoid(x_fp16) @@ -2232,10 +2314,12 @@ def test_errors(self): # The input type must be Variable. self.assertRaises(TypeError, F.swish, 1) # The input dtype must be float16, float32, float64. 
- x_int32 = paddle.fluid.data(name='x_int32', shape=[12, 10], dtype='int32') + x_int32 = paddle.fluid.data( + name='x_int32', shape=[12, 10], dtype='int32') self.assertRaises(TypeError, F.swish, x_int32) # support the input dtype is float16 - x_fp16 = paddle.fluid.data(name='x_fp16', shape=[12, 10], dtype='float16') + x_fp16 = paddle.fluid.data( + name='x_fp16', shape=[12, 10], dtype='float16') F.swish(x_fp16) @@ -2347,6 +2431,7 @@ def test_check_grad(self): create_test_act_fp16_class(TestELU) create_test_act_fp16_class(TestReciprocal) create_test_act_fp16_class(TestLog) +create_test_act_fp16_class(TestLog2, atol=5e-2) create_test_act_fp16_class(TestLog1p, grad_atol=0.9) create_test_act_fp16_class(TestSquare) create_test_act_fp16_class(TestPow, atol=5e-2) diff --git a/python/paddle/tensor/__init__.py b/python/paddle/tensor/__init__.py index 2a9820d4a90d3..55cb0a8986745 100755 --- a/python/paddle/tensor/__init__.py +++ b/python/paddle/tensor/__init__.py @@ -151,6 +151,7 @@ from .math import atan #DEFINE_ALIAS from .math import logsumexp #DEFINE_ALIAS from .math import inverse #DEFINE_ALIAS +from .math import log2 #DEFINE_ALIAS from .math import log1p #DEFINE_ALIAS from .math import erf #DEFINE_ALIAS # from .math import addcmul #DEFINE_ALIAS diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 33f9158d438dd..7693a61eef3d7 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -79,6 +79,7 @@ 'floor', 'increment', 'log', + 'log2', 'logsumexp', 'mul', 'multiplex', @@ -1315,6 +1316,54 @@ def log1p(x, name=None): helper.append_op(type="log1p", inputs={"X": x}, outputs={"Out": out}) return out +def log2(x, name=None): + """ + Calculates the log to the base 2 of the given input tensor, element-wise. + + .. math:: + + Out = \\log_2x + + Args: + x (Tensor): Input tensor must be one of the following types: float32, float64. + name (str|None): The default value is None. Normally there is no need for user to set this property. For more information, please refer to :ref:`api_guide_Name` + + + Returns: + Tensor: The log to the base 2 of the input Tensor computed element-wise. + + Examples: + + .. 
code-block:: python + + import paddle + + # example 1: x is a float + x_i = paddle.to_tensor([[1.0], [2.0]]) + res = paddle.log2(x_i) # [[0.], [1.0]] + + # example 2: x is float32 + x_i = paddle.full(shape=[1], fill_value=2, dtype='float32') + paddle.to_tensor(x_i) + res = paddle.log2(x_i) + print(res) # [1.0] + + # example 3: x is float64 + x_i = paddle.full(shape=[1], fill_value=2, dtype='float64') + paddle.to_tensor(x_i) + res = paddle.log2(x_i) + print(res) # [1.0] + """ + if in_dygraph_mode(): + return core.ops.log2(x) + + check_variable_and_dtype(x, 'x', ['float16', 'float32', 'float64'], "log2") + inputs = {'X': [x]} + helper = LayerHelper('log2', **locals()) + dtype = helper.input_dtype(input_param_name='x') + out = helper.create_variable_for_type_inference(dtype) + helper.append_op(type="log2", inputs={"X": x}, outputs={"Out": out}) + return out def addcmul(input, tensor1, tensor2, value=1.0, name=None): """ From 8699f38d0841b0026c8a1af2d82cabccf32ecbb0 Mon Sep 17 00:00:00 2001 From: Shang Zhizhou Date: Thu, 12 Nov 2020 17:00:23 +0800 Subject: [PATCH 175/185] Add TRT support for the pruned transformer model; fix a bug where TensorRT did not support DeletePass (#28517) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * skip_layernorm_op done * add unittest * slice op convertor support trt < 6 * skip_layernorm only work in ernie --- cmake/operators.cmake | 2 +- .../embedding_eltwise_layernorm_fuse_pass.cc | 3 + .../ir/multihead_matmul_fuse_pass.cc | 6 +- paddle/fluid/framework/ir/pass.h | 3 + .../framework/ir/skip_layernorm_fuse_pass.cc | 8 + .../ir/skip_layernorm_fuse_pass_tester.cc | 2 + .../ir_passes/tensorrt_subgraph_pass.cc | 12 +- paddle/fluid/inference/api/analysis_config.cc | 12 +- .../inference/tensorrt/convert/slice_op.cc | 10 +- .../fluid/inference/tests/api/CMakeLists.txt | 9 ++ ...rt_dynamic_shape_transformer_prune_test.cc | 139 ++++++++++++++++++ paddle/fluid/operators/fused/CMakeLists.txt | 3 + .../operators/fused/skip_layernorm_op.cc | 91 ++++++++++++ .../operators/fused/skip_layernorm_op.cu | 66 +++++++++ .../fluid/tests/unittests/ir/pass_test.py | 3 + .../ir/test_ir_skip_layernorm_pass.py | 6 + 16 files changed, 354 insertions(+), 21 deletions(-) create mode 100644 paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc create mode 100644 paddle/fluid/operators/fused/skip_layernorm_op.cc create mode 100644 paddle/fluid/operators/fused/skip_layernorm_op.cu diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7aa2766763ce9..715d324c357fb 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -127,7 +127,7 @@ function(op_library TARGET) "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" -"multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" +"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fused_bn_add_activation_op") if ("${TARGET}" STREQUAL "${manual_pybind_op}") set(pybind_flag 1) diff --git a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc index 51861b402d58a..19662a04f541d 100644 ---
a/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/embedding_eltwise_layernorm_fuse_pass.cc @@ -326,6 +326,9 @@ static int BuildFusion(Graph* graph, const std::string& name_scope void EmbeddingEltwiseLayerNormFusePass::ApplyImpl(Graph* graph) const { FusePassBase::Init(name_scope_, graph); int fusion_count = patterns::BuildFusion(graph, name_scope_); + if (fusion_count > 0) { + graph->Set(kEmbEltwiseLayernormPass, new bool(true)); + } AddStatis(fusion_count); } diff --git a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc index d1fbc8396ba55..cd6d1d57034d7 100644 --- a/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc +++ b/paddle/fluid/framework/ir/multihead_matmul_fuse_pass.cc @@ -696,7 +696,11 @@ void MultiHeadMatmulV2FusePass::ApplyImpl(Graph* graph) const { platform::errors::Fatal( "During the multiheadMatmul pass, The scope should not be null.")); - patterns::BuildFusionV2(graph, name_scope_, scope); + int fusion_count = patterns::BuildFusionV2(graph, name_scope_, scope); + if (fusion_count > 0) { + graph->Set(kMultiheadMatmulPass, new bool(true)); + } + AddStatis(fusion_count); } } // namespace ir diff --git a/paddle/fluid/framework/ir/pass.h b/paddle/fluid/framework/ir/pass.h index 668dc74eab20a..a3b1b33d2685b 100644 --- a/paddle/fluid/framework/ir/pass.h +++ b/paddle/fluid/framework/ir/pass.h @@ -36,6 +36,9 @@ struct PassRegistrar; typedef std::unordered_set PassRecorder; constexpr char kPassRecorder[] = "pass_recorder"; +constexpr char kEmbEltwiseLayernormPass[] = + "embedding_eltwise_layernorm_fuse_pass_flag"; +constexpr char kMultiheadMatmulPass[] = "multihead_matmul_fuse_pass_flag"; class Pass { public: diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc index e5f348dfeb13e..b708f2eff10e7 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass.cc @@ -134,6 +134,14 @@ void SkipLayerNormFusePass::ApplyImpl(ir::Graph *graph) const { GET_IR_NODE_FROM_SUBGRAPH(layer_norm_variance, layer_norm_variance, fused_pattern); + // check if is in ernie or not + if (!graph->Has(kEmbEltwiseLayernormPass) || + !graph->Has(kMultiheadMatmulPass)) { + LOG(INFO) << "The skip_layernorm_fuse_pass is only supported in " + << "Ernie/Bert model. 
Just skip this pass."; + return; + } + std::unordered_set del_node_set; // Create an SkipLayerNorm op node diff --git a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc index eff5dcddf54ee..29be2c3cb09a7 100644 --- a/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc +++ b/paddle/fluid/framework/ir/skip_layernorm_fuse_pass_tester.cc @@ -36,6 +36,8 @@ TEST(SkipLayerNormFusePass, basic) { layers.layer_norm(elementwise_out, scale, bias); std::unique_ptr graph(new ir::Graph(layers.main_program())); + graph->Set(kEmbEltwiseLayernormPass, new bool(true)); + graph->Set(kMultiheadMatmulPass, new bool(true)); auto pass = PassRegistry::Instance().Get("skip_layernorm_fuse_pass"); int num_nodes_before = graph->Nodes().size(); VLOG(3) << DebugString(graph); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index 7ad882797870d..08f3d609fa3e6 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -117,20 +117,11 @@ void TensorRtSubgraphPass::CreateTensorRTOp( block_desc.Proto()->set_idx(0); LOG(INFO) << "--- detect a sub-graph with " << subgraph.size() << " nodes"; - bool has_fused_embedding_eltwise_layernorm = false; - bool has_multihead_matmul = false; for (auto *node : subgraph) { auto *new_block_op = new_block->AppendOp(); auto *op = block_desc.AppendOp(); *new_block_op->Proto() = *node->Op()->Proto(); *op->Proto() = *node->Op()->Proto(); - if (!has_fused_embedding_eltwise_layernorm - && op->Type() == "fused_embedding_eltwise_layernorm") { - has_fused_embedding_eltwise_layernorm = true; - } - if (!has_multihead_matmul && op->Type() == "multihead_matmul") { - has_multihead_matmul = true; - } } // Then, we will use the input_names_with_id and output_names_with_id to @@ -319,7 +310,8 @@ void TensorRtSubgraphPass::CreateTensorRTOp( disable_trt_plugin_fp16); trt_engine->SetUseOSS(Get("use_oss")); trt_engine->SetWithErnie( - has_multihead_matmul && has_fused_embedding_eltwise_layernorm); + graph->Has(framework::ir::kEmbEltwiseLayernormPass) && + graph->Has(framework::ir::kMultiheadMatmulPass)); bool need_serialize = (use_static_engine && !load_from_memory); if (need_serialize) { diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 7e5552a74ccd5..9df3c3e316bbc 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -175,7 +175,13 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) { #undef CP_MEMBER - Update(); + // Update(); + // Update() will reset all the passes, when some tensorRT pass is deleted in + // other.pass_builder(), it will set again, so just copy the passes. + pass_builder_->ClearPasses(); + for (const std::string &pass : other.pass_builder()->AllPasses()) { + pass_builder_->AppendPass(pass); + } } void AnalysisConfig::EnableCUDNN() { @@ -281,9 +287,7 @@ void AnalysisConfig::SetTRTDynamicShapeInfo( disable_trt_plugin_fp16_ = disable_trt_plugin_fp16; } -void AnalysisConfig::EnableTensorRtOSS() { - trt_use_oss_ = true; -} +void AnalysisConfig::EnableTensorRtOSS() { trt_use_oss_ = true; } // TODO(Superjomn) refactor this, buggy. 
void AnalysisConfig::Update() { diff --git a/paddle/fluid/inference/tensorrt/convert/slice_op.cc b/paddle/fluid/inference/tensorrt/convert/slice_op.cc index ee4716bb56bc2..f516d605cc1e2 100644 --- a/paddle/fluid/inference/tensorrt/convert/slice_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/slice_op.cc @@ -78,6 +78,7 @@ class SliceOpConverter : public OpConverter { nvinfer1::ILayer* layer = nullptr; if (engine_->with_dynamic_shape()) { +#if IS_TRT_VERSION_GE(6000) if (engine_->use_oss() && engine_->with_ernie()) { std::vector plugin_inputs; // plugin_inputs.emplace_back(trans_layer->getOutput(0)); @@ -92,17 +93,16 @@ class SliceOpConverter : public OpConverter { layer = engine_->AddPluginV2(plugin_inputs.data(), plugin_inputs.size(), plugin); } else { -#if IS_TRT_VERSION_GE(6000) bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SlicePluginDynamic* plugin = new plugin::SlicePluginDynamic(starts, ends, axes, ban_fp16); layer = engine_->AddPluginV2(&input, 1, plugin); + } #else - PADDLE_THROW(platform::errors::Fatal( - "You are running the TRT Dynamic Shape mode, need to confirm that " - "your TRT version is no less than 6.0")); + PADDLE_THROW(platform::errors::Fatal( + "You are running the TRT Dynamic Shape mode, need to confirm that " + "your TRT version is no less than 6.0")); #endif - } } else { bool ban_fp16 = engine_->disable_trt_plugin_fp16(); plugin::SlicePlugin* plugin = diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index bfc2984dc65c6..a1e0717062ee1 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -529,6 +529,15 @@ if(WITH_GPU AND TENSORRT_FOUND) EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} ARGS --infer_model=${TEST_TRT_ERNIE_MODEL}/ernie_model_4) + set(TEST_TRT_TRANSFORMER_PRUNE_MODEL "${TRT_MODEL_INSTALL_DIR}/transformer_prune") + if (NOT EXISTS ${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune.tar.gz) + inference_download_and_uncompress(${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test "transformer_prune.tar.gz") + endif() + + inference_analysis_test(test_trt_dynamic_shape_transformer_prune SRCS trt_dynamic_shape_transformer_prune_test.cc + EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) + set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz) inference_download_and_uncompress(${TEST_TRT_ERNIE_MODEL} ${INFERENCE_URL}/tensorrt_test "ernie_model_4_unserialized.tgz") diff --git a/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc new file mode 100644 index 0000000000000..fe86a42663d1f --- /dev/null +++ b/paddle/fluid/inference/tests/api/trt_dynamic_shape_transformer_prune_test.cc @@ -0,0 +1,139 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include + +#include "paddle/fluid/inference/tests/api/trt_test_helper.h" + +namespace paddle { +namespace inference { + +void run(const AnalysisConfig& config, std::vector* out_data) { + auto predictor = CreatePaddlePredictor(config); + auto input_names = predictor->GetInputNames(); + + int run_batch = 1; + const int run_seq_len = 128; + + std::vector tmp_input; + std::vector tmp_four_input; + tmp_input.reserve(run_batch * run_seq_len); + tmp_four_input.reserve(run_batch * run_seq_len); + + int64_t i0[run_seq_len] = { + 1, 3558, 4, 75, 491, 89, 340, 313, 93, 4, 255, 10, 75, 321, + 4095, 1902, 4, 134, 49, 75, 311, 14, 44, 178, 543, 15, 12043, 2, + 75, 201, 340, 9, 14, 44, 486, 218, 1140, 279, 12043, 2}; + int64_t i1[run_seq_len] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + int64_t i2[run_seq_len] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, + 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, + 30, 31, 32, 33, 34, 35, 36, 37, 38, 39}; + float i3[run_seq_len] = {1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, + 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0}; + + // first input + auto input_t = predictor->GetInputTensor(input_names[0]); + input_t->Reshape({run_batch, run_seq_len, 1}); + input_t->copy_from_cpu(i0); + + // second input + auto input_t2 = predictor->GetInputTensor(input_names[1]); + input_t2->Reshape({run_batch, run_seq_len, 1}); + input_t2->copy_from_cpu(i1); + + // third input. 
+ auto input_t3 = predictor->GetInputTensor(input_names[2]); + input_t3->Reshape({run_batch, run_seq_len, 1}); + input_t3->copy_from_cpu(i2); + + auto input_t4 = predictor->GetInputTensor(input_names[3]); + input_t4->Reshape({run_batch, run_seq_len, 1}); + input_t4->copy_from_cpu(i3); + + ASSERT_TRUE(predictor->ZeroCopyRun()); + + auto output_names = predictor->GetOutputNames(); + auto output_t = predictor->GetOutputTensor(output_names[0]); + std::vector output_shape = output_t->shape(); + int out_num = std::accumulate(output_shape.begin(), output_shape.end(), 1, + std::multiplies()); + out_data->resize(out_num); + output_t->copy_to_cpu(out_data->data()); +} + +void trt_ernie(bool with_fp16, std::vector result) { + AnalysisConfig config; + std::string model_dir = FLAGS_infer_model; + SetConfig(&config, model_dir, true); + + config.SwitchUseFeedFetchOps(false); + + int batch = 32; + int min_seq_len = 1; + int max_seq_len = 128; + int opt_seq_len = 128; + + std::vector min_shape = {1, min_seq_len, 1}; + std::vector max_shape = {batch, max_seq_len, 1}; + std::vector opt_shape = {batch, opt_seq_len, 1}; + // Set the input's min, max, opt shape + std::map> min_input_shape = { + {"read_file_0.tmp_0", min_shape}, + {"read_file_0.tmp_1", min_shape}, + {"read_file_0.tmp_2", min_shape}, + {"read_file_0.tmp_3", min_shape}}; + std::map> max_input_shape = { + {"read_file_0.tmp_0", max_shape}, + {"read_file_0.tmp_1", max_shape}, + {"read_file_0.tmp_2", max_shape}, + {"read_file_0.tmp_3", max_shape}}; + std::map> opt_input_shape = { + {"read_file_0.tmp_0", opt_shape}, + {"read_file_0.tmp_1", opt_shape}, + {"read_file_0.tmp_2", opt_shape}, + {"read_file_0.tmp_3", opt_shape}}; + + auto precision = AnalysisConfig::Precision::kFloat32; + if (with_fp16) { + precision = AnalysisConfig::Precision::kHalf; + } + config.EnableTensorRtEngine(1 << 30, 1, 12, precision, false, false); + config.SetTRTDynamicShapeInfo(min_input_shape, max_input_shape, + opt_input_shape); + std::vector out_data; + run(config, &out_data); + + for (size_t i = 0; i < out_data.size(); i++) { + EXPECT_NEAR(result[i], out_data[i], 1e-5); + } +} + +TEST(AnalysisPredictor, no_fp16) { + std::vector result = {0.498667, 0.501333}; + trt_ernie(false, result); +} + +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/operators/fused/CMakeLists.txt b/paddle/fluid/operators/fused/CMakeLists.txt index 477a9162fe3f7..97d6e696b137d 100644 --- a/paddle/fluid/operators/fused/CMakeLists.txt +++ b/paddle/fluid/operators/fused/CMakeLists.txt @@ -6,6 +6,7 @@ register_operators(EXCLUDES fusion_conv_inception_op fused_fc_elementwise_layernorm_op multihead_matmul_op + skip_layernorm_op fused_embedding_eltwise_layernorm_op fusion_group_op fusion_gru_op @@ -40,6 +41,8 @@ if (WITH_GPU) # multihead_matmul_op op_library(multihead_matmul_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(multihead_matmul);\n") + op_library(skip_layernorm_op) + file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(skip_layernorm);\n") op_library(fused_embedding_eltwise_layernorm_op) file(APPEND ${pybind_file} "USE_CUDA_ONLY_OP(fused_embedding_eltwise_layernorm);\n") # fusion_group diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cc b/paddle/fluid/operators/fused/skip_layernorm_op.cc new file mode 100644 index 0000000000000..442f359c0dac5 --- /dev/null +++ b/paddle/fluid/operators/fused/skip_layernorm_op.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/errors.h" + +namespace paddle { +namespace operators { + +class SkipLayerNormOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext *context) const override { + PADDLE_ENFORCE_EQ(context->HasInput("X"), true, + platform::errors::InvalidArgument( + "Input(X) of MultiHeadMatMul should not be null.")); + PADDLE_ENFORCE_EQ(context->HasInput("Y"), true, + platform::errors::InvalidArgument( + "Input(Y) of MultiHeadMatMul should not be null.")); + PADDLE_ENFORCE_EQ( + context->HasInput("Scale"), true, + platform::errors::InvalidArgument( + "Input(Scale) of MultiHeadMatMul should not be null.")); + PADDLE_ENFORCE_EQ( + context->HasInput("Bias"), true, + platform::errors::InvalidArgument( + "Input(Bias) of MultiHeadMatMul should not be null.")); + PADDLE_ENFORCE_EQ( + context->HasOutput("Out"), true, + platform::errors::InvalidArgument( + "Output(Out) of MultiHeadMatMul should not be null.")); + + auto dim_input = context->GetInputDim("X"); + context->SetOutputDim("Out", dim_input); + context->ShareLoD("X", "Out"); + } +}; + +class SkipLayerNormOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The X input of SkipLayerNorm op"); + AddInput("Y", "The Y input of SkipLayerNorm op"); + AddInput("Scale", "The scale input of SkipLayerNorm op"); + AddInput("Bias", "The bias input of SkipLayerNorm op"); + AddOutput("Out", "The output of SkipLayerNorm op"); + AddAttr("epsilon", + "param epsilon of layer_norm op in " + "skip_layernorm_fuse_pass"); + AddAttr("begin_norm_axis", + "param begin_norm_axis of " + "layer_norm op in skip_layernorm_fuse_pass"); + AddComment(R"DOC( +SkipLayerNorm Operator. + +This op is used for skip_layernorm_fuse_pass, which fuse op pattern as followed. + + | | | | + other_op1 other_op2 other_op1 other_op2 + | | fuse \ / + |------elementwise_add -> skip_layernorm + | | + layer_norm other_op3 + | | + other_op3 + | + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(skip_layernorm, ops::SkipLayerNormOp, + ops::SkipLayerNormOpMaker); diff --git a/paddle/fluid/operators/fused/skip_layernorm_op.cu b/paddle/fluid/operators/fused/skip_layernorm_op.cu new file mode 100644 index 0000000000000..856d5e694bdf1 --- /dev/null +++ b/paddle/fluid/operators/fused/skip_layernorm_op.cu @@ -0,0 +1,66 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/operators/math/bert_encoder_functor.h" +#include "paddle/fluid/operators/math/blas.h" + +namespace paddle { +namespace operators { + +template +class SkipLayerNormKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &context) const override { + using Tensor = framework::Tensor; + auto *X = context.Input("X"); + auto *Y = context.Input("Y"); + auto *scale = context.Input("Scale"); + auto *bias = context.Input("Bias"); + + auto *X_d = X->data(); + auto *Y_d = Y->data(); + auto *scale_d = scale->data(); + auto *bias_d = bias->data(); + float epsilon = context.Attr("epsilon"); + int begin_norm_axis = context.Attr("begin_norm_axis"); + + auto *out = context.Output("Out"); + out->Resize(X->dims()); + auto *output_d = out->mutable_data(context.GetPlace()); + + size_t num = 1; + for (size_t i = 0; i < X->dims().size(); i++) { + num *= X->dims()[i]; + } + int hidden = X->dims()[2]; + auto &device_ctx = context.template device_context(); + operators::math::SkipLayerNormFunctor skip_layer_norm_func; + + skip_layer_norm_func(num, hidden, X_d, Y_d, scale_d, bias_d, output_d, + epsilon, device_ctx.stream()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + skip_layernorm, + ops::SkipLayerNormKernel); diff --git a/python/paddle/fluid/tests/unittests/ir/pass_test.py b/python/paddle/fluid/tests/unittests/ir/pass_test.py index c1c05c4335975..aae1cc65c9220 100644 --- a/python/paddle/fluid/tests/unittests/ir/pass_test.py +++ b/python/paddle/fluid/tests/unittests/ir/pass_test.py @@ -36,6 +36,7 @@ def setUpClass(self): self.fetch_list = None self.pass_names = None self.pass_attrs = {} + self.graph_attrs = {} self.fused_op_type = None self.num_fused_ops = -1 @@ -85,6 +86,8 @@ def _run_program(self, executor, program): def _apply_ir_passes(self): graph = core.Graph(self.main_program.desc) graph.set_not_owned("__param_scope__", fluid.global_scope()) + for attr_name, attr_value in self.graph_attrs.items(): + graph.set(attr_name, attr_value) if not isinstance(self.pass_names, list): self.pass_names = [self.pass_names] diff --git a/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py b/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py index 888857e5a7246..0aac6650f52dd 100644 --- a/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py +++ b/python/paddle/fluid/tests/unittests/ir/test_ir_skip_layernorm_pass.py @@ -16,12 +16,14 @@ import numpy as np from pass_test import PassTest +import paddle import paddle.fluid as fluid import paddle.fluid.core as core class SkipLayerNormFusePassTest(PassTest): def setUp(self): + paddle.enable_static() with fluid.program_guard(self.main_program, self.startup_program): x = fluid.data( name="x", shape=[128, 768], dtype="float32", lod_level=0) @@ -34,6 +36,10 @@ def setUp(self): self.pass_names = "skip_layernorm_fuse_pass" self.fused_op_type = 
"skip_layernorm" self.num_fused_ops = 1 + self.graph_attrs = { + "embedding_eltwise_layernorm_fuse_pass_flag": True, + "multihead_matmul_fuse_pass_flag": True + } def test_check_program(self): use_gpu_set = [False] From 614f20f6bad5b158fc13452474dfac2e62ce1de6 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Thu, 12 Nov 2020 17:05:05 +0800 Subject: [PATCH 176/185] Update `add` `clip` `MSELoss` and `no_grad` (#28530) also fix `no_grad` alias --- python/paddle/__init__.py | 1 - python/paddle/fluid/dygraph/base.py | 2 -- python/paddle/framework/__init__.py | 2 +- python/paddle/nn/layer/loss.py | 3 +-- python/paddle/tensor/math.py | 16 ++++------------ 5 files changed, 6 insertions(+), 18 deletions(-) diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index 40fff86fbf65f..f66f52000b233 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -263,7 +263,6 @@ from .fluid.dygraph.base import enable_dygraph as disable_static #DEFINE_ALIAS from .fluid.dygraph.base import disable_dygraph as enable_static #DEFINE_ALIAS from .fluid.framework import in_dygraph_mode as in_dynamic_mode #DEFINE_ALIAS -from .fluid.dygraph.base import no_grad_ as no_grad #DEFINE_ALIAS from .fluid.layers import crop_tensor as crop #DEFINE_ALIAS from . import jit diff --git a/python/paddle/fluid/dygraph/base.py b/python/paddle/fluid/dygraph/base.py index db1a705167cb9..5f0d8e089822c 100644 --- a/python/paddle/fluid/dygraph/base.py +++ b/python/paddle/fluid/dygraph/base.py @@ -272,8 +272,6 @@ class no_grad_: import numpy as np import paddle - paddle.disable_static() - # use as generator data = np.array([[2, 3], [4, 5]]).astype('float32') diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 3d06b4ab911ac..5ba4446970fef 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -38,7 +38,7 @@ from ..fluid.core import VarBase #DEFINE_ALIAS from paddle.fluid import core #DEFINE_ALIAS -from ..fluid.dygraph.base import no_grad #DEFINE_ALIAS +from ..fluid.dygraph.base import no_grad_ as no_grad #DEFINE_ALIAS from ..fluid.dygraph.base import to_variable #DEFINE_ALIAS from ..fluid.dygraph.base import grad #DEFINE_ALIAS from .io import save diff --git a/python/paddle/nn/layer/loss.py b/python/paddle/nn/layer/loss.py index 351afc97a2a88..fdeed0ae49dfd 100644 --- a/python/paddle/nn/layer/loss.py +++ b/python/paddle/nn/layer/loss.py @@ -418,12 +418,11 @@ class MSELoss(fluid.dygraph.layers.Layer): input_data = np.array([1.5]).astype("float32") label_data = np.array([1.7]).astype("float32") - paddle.disable_static() mse_loss = paddle.nn.loss.MSELoss() input = paddle.to_tensor(input_data) label = paddle.to_tensor(label_data) output = mse_loss(input, label) - print(output.numpy()) + print(output) # [0.04000002] """ diff --git a/python/paddle/tensor/math.py b/python/paddle/tensor/math.py index 7693a61eef3d7..4abd3390d5808 100755 --- a/python/paddle/tensor/math.py +++ b/python/paddle/tensor/math.py @@ -274,18 +274,15 @@ def _elementwise_op(helper): def add(x, y, name=None): """ -Examples: + Examples: .. code-block:: python import paddle - - paddle.disable_static() x = paddle.to_tensor([2, 3, 4], 'float64') y = paddle.to_tensor([1, 5, 2], 'float64') z = paddle.add(x, y) - np_z = z.numpy() - print(np_z) # [3., 8., 6. ] + print(z) # [3., 8., 6. 
] """ op_type = 'elementwise_add' @@ -1411,9 +1408,6 @@ def addcmul(input, tensor1, tensor2, value=1.0, name=None): def clip(x, min=None, max=None, name=None): """ - :alias_main: paddle.clip - :alias: paddle.clip,paddle.tensor.clip,paddle.tensor.math.clip - **clip layer** This operator clip all elements in input into the range [ min, max ] and return @@ -1440,15 +1434,13 @@ def clip(x, min=None, max=None, name=None): .. code-block:: python import paddle - - paddle.disable_static() x1 = paddle.to_tensor([[1.2, 3.5], [4.5, 6.4]], 'float32') out1 = paddle.clip(x1, min=3.5, max=5.0) out2 = paddle.clip(x1, min=2.5) - print(out1.numpy()) + print(out1) # [[3.5, 3.5] # [4.5, 5.0]] - print(out2.numpy()) + print(out2) # [[2.5, 3.5] # [[4.5, 6.4] """ From 849467b5aa9870c892e5ad9dd44e9d391fbb9ffa Mon Sep 17 00:00:00 2001 From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com> Date: Fri, 13 Nov 2020 10:39:39 +0800 Subject: [PATCH 177/185] fix user set CUDA_VISIBLE_DEVICES start/end with quotation marks (#28547) --- paddle/fluid/platform/gpu_info.cc | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index ca1e5501c6a84..2a6714c39a1cb 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -57,6 +57,16 @@ static int GetCUDADeviceCountImpl() { const auto *cuda_visible_devices = std::getenv("CUDA_VISIBLE_DEVICES"); if (cuda_visible_devices != nullptr) { std::string cuda_visible_devices_str(cuda_visible_devices); + if (!cuda_visible_devices_str.empty()) { + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\'')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\'') + 1); + cuda_visible_devices_str.erase( + 0, cuda_visible_devices_str.find_first_not_of('\"')); + cuda_visible_devices_str.erase( + cuda_visible_devices_str.find_last_not_of('\"') + 1); + } if (std::all_of(cuda_visible_devices_str.begin(), cuda_visible_devices_str.end(), [](char ch) { return ch == ' '; })) { From 7b1619e69bbf5a5d3009212f16a597585fcdc613 Mon Sep 17 00:00:00 2001 From: YUNSHEN XIE <1084314248@qq.com> Date: Fri, 13 Nov 2020 11:05:13 +0800 Subject: [PATCH 178/185] disable test_trt_dynamic_shape_transformer_prune,test=document_fix (#28588) --- paddle/fluid/inference/tests/api/CMakeLists.txt | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/tests/api/CMakeLists.txt b/paddle/fluid/inference/tests/api/CMakeLists.txt index a1e0717062ee1..4eb1c8225660a 100644 --- a/paddle/fluid/inference/tests/api/CMakeLists.txt +++ b/paddle/fluid/inference/tests/api/CMakeLists.txt @@ -534,9 +534,9 @@ if(WITH_GPU AND TENSORRT_FOUND) inference_download_and_uncompress(${TEST_TRT_TRANSFORMER_PRUNE_MODEL} ${INFERENCE_URL}/tensorrt_test "transformer_prune.tar.gz") endif() - inference_analysis_test(test_trt_dynamic_shape_transformer_prune SRCS trt_dynamic_shape_transformer_prune_test.cc - EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} - ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) + #inference_analysis_test(test_trt_dynamic_shape_transformer_prune SRCS trt_dynamic_shape_transformer_prune_test.cc + # EXTRA_DEPS ${INFERENCE_EXTRA_DEPS} + # ARGS --infer_model=${TEST_TRT_TRANSFORMER_PRUNE_MODEL}/transformer_prune) set(TEST_TRT_ERNIE_UNSER_MODEL "${TRT_MODEL_INSTALL_DIR}/ernie_test/ernie_model_4_unserialized/") if (NOT EXISTS ${TEST_TRT_ERNIE_UNSER_MODEL}/ernie_model_4_unserialized.tgz) @@ -619,9 +619,9 @@ 
set_tests_properties(test_analyzer_transformer PROPERTIES TIMEOUT 120)
 set_tests_properties(test_analyzer_bert PROPERTIES TIMEOUT 120)
 set_tests_properties(test_analyzer_mobilenet_depthwise_conv PROPERTIES TIMEOUT 120)
 if(WITH_GPU AND TENSORRT_FOUND)
-set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120)
-set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120)
+  set_tests_properties(trt_mobilenet_test PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_analyzer_bfloat16_resnet50 PROPERTIES TIMEOUT 120)
 endif()
 if(ON_INFER OR WITH_GPU)
-set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_analyzer_transformer_profile PROPERTIES TIMEOUT 120)
 endif()

From bf6e7cba7adccc8d608b185ca9ff5e46d413d35b Mon Sep 17 00:00:00 2001
From: Zhou Wei <52485244+zhouwei25@users.noreply.github.com>
Date: Fri, 13 Nov 2020 11:36:00 +0800
Subject: [PATCH 179/185] update 2.0 API English doc (#28525)

* make NumPy version below 1.19.3

* fix 2.0 doc
---
 paddle/fluid/operators/unique_op.cu |   2 +-
 paddle/fluid/pybind/imperative.cc   |  22 ++----
 python/paddle/framework/__init__.py |   6 +-
 python/paddle/framework/io.py       |   4 -
 python/paddle/optimizer/lr.py       | 115 ++++++++++++++++------------
 5 files changed, 77 insertions(+), 72 deletions(-)

diff --git a/paddle/fluid/operators/unique_op.cu b/paddle/fluid/operators/unique_op.cu
index 848df4c7aba8d..d22406f27c470 100644
--- a/paddle/fluid/operators/unique_op.cu
+++ b/paddle/fluid/operators/unique_op.cu
@@ -177,7 +177,7 @@ static void UniqueFlattendCUDATensor(const framework::ExecutionContext& context,
   thrust::sort_by_key(thrust::device, in_data_hat, in_data_hat + num_input,
                       sorted_indices_data);
 
-  // 1. Calculate op result: 'out':
+  // 1. Calculate op result: 'out'
   Tensor range;
   range.Resize(framework::make_ddim({num_input + 1}));
   auto range_data_ptr = range.mutable_data<IndexT>(context.GetPlace());
diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc
index 4d68afeede4e5..7c36efcaf38bb 100644
--- a/paddle/fluid/pybind/imperative.cc
+++ b/paddle/fluid/pybind/imperative.cc
@@ -685,8 +685,6 @@ void BindImperative(py::module *m_ptr) {
         .. code-block:: python
 
               import paddle
-              paddle.disable_static()
-
              linear = paddle.nn.Linear(32, 64)
              data = paddle.uniform(shape=[30, 10, 32], min=-1.0, max=1.0)
              x = linear(data)
@@ -704,19 +702,13 @@
 
              .. 
code-block:: python import paddle - paddle.disable_static() - - inputs = [] - for _ in range(10): - tmp = paddle.ones([2, 2]) - tmp.stop_gradient=False - inputs.append(tmp) - ret = paddle.sums(inputs2) - loss = paddle.sum(ret) - loss.backward() - print("Before clear_gradient {}".format(loss.grad)) - loss.clear_gradient() - print("After clear_gradient {}".format(loss.grad)) + input = paddle.uniform([10, 2]) + linear = paddle.nn.Linear(2, 3) + out = linear(input) + out.backward() + print("Before clear_gradient, linear.weight.grad: {}".format(linear.weight.grad)) + linear.weight.clear_gradient() + print("After clear_gradient, linear.weight.grad: {}".format(linear.weight.grad)) )DOC") .def("clone", [](std::shared_ptr &self) { diff --git a/python/paddle/framework/__init__.py b/python/paddle/framework/__init__.py index 5ba4446970fef..8c1742664fd0f 100644 --- a/python/paddle/framework/__init__.py +++ b/python/paddle/framework/__init__.py @@ -18,10 +18,7 @@ 'get_default_dtype', 'set_default_dtype' ] -__all__ += [ - 'grad', 'LayerList', 'load', 'save', 'to_variable', 'no_grad', - 'DataParallel' -] +__all__ += ['grad', 'LayerList', 'load', 'save', 'no_grad', 'DataParallel'] from . import random from .random import seed @@ -39,7 +36,6 @@ from paddle.fluid import core #DEFINE_ALIAS from ..fluid.dygraph.base import no_grad_ as no_grad #DEFINE_ALIAS -from ..fluid.dygraph.base import to_variable #DEFINE_ALIAS from ..fluid.dygraph.base import grad #DEFINE_ALIAS from .io import save from .io import load diff --git a/python/paddle/framework/io.py b/python/paddle/framework/io.py index 7e8c717bb1deb..945c8160b47fb 100644 --- a/python/paddle/framework/io.py +++ b/python/paddle/framework/io.py @@ -225,8 +225,6 @@ def save(obj, path): import paddle - paddle.disable_static() - emb = paddle.nn.Embedding(10, 10) layer_state_dict = emb.state_dict() paddle.save(layer_state_dict, "emb.pdparams") @@ -318,8 +316,6 @@ def load(path, **configs): .. 
code-block:: python import paddle - - paddle.disable_static() emb = paddle.nn.Embedding(10, 10) layer_state_dict = emb.state_dict() diff --git a/python/paddle/optimizer/lr.py b/python/paddle/optimizer/lr.py index 80b4b2a9d0562..e4fb54c229f21 100644 --- a/python/paddle/optimizer/lr.py +++ b/python/paddle/optimizer/lr.py @@ -226,14 +226,15 @@ class NoamDecay(LRScheduler): scheduler = paddle.optimizer.lr.NoamDecay(d_model=0.01, warmup_steps=100, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -251,7 +252,7 @@ class NoamDecay(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ @@ -259,7 +260,8 @@ class NoamDecay(LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch """ @@ -322,14 +324,15 @@ class PiecewiseDecay(LRScheduler): scheduler = paddle.optimizer.lr.PiecewiseDecay(boundaries=[3, 6, 9], values=[0.1, 0.2, 0.3, 0.4], verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -347,7 +350,7 @@ class PiecewiseDecay(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ @@ -355,7 +358,8 @@ class PiecewiseDecay(LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch """ def __init__(self, boundaries, values, last_epoch=-1, verbose=False): @@ -403,14 +407,15 @@ class NaturalExpDecay(LRScheduler): scheduler = paddle.optimizer.lr.NaturalExpDecay(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -428,7 +433,7 @@ class NaturalExpDecay(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ @@ -436,7 +441,8 @@ class NaturalExpDecay(LRScheduler): 'y': np.random.randn(3, 4, 
5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch """ def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): @@ -481,14 +487,15 @@ class InverseTimeDecay(LRScheduler): scheduler = paddle.optimizer.lr.InverseTimeDecay(learning_rate=0.5, gamma=0.1, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -506,7 +513,7 @@ class InverseTimeDecay(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ @@ -514,7 +521,8 @@ class InverseTimeDecay(LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch """ @@ -576,14 +584,15 @@ class PolynomialDecay(LRScheduler): scheduler = paddle.optimizer.lr.PolynomialDecay(learning_rate=0.5, decay_steps=20, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -601,7 +610,7 @@ class PolynomialDecay(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ @@ -609,7 +618,8 @@ class PolynomialDecay(LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch """ def __init__(self, @@ -691,14 +701,15 @@ class LinearWarmup(LRScheduler): learning_rate=0.5, warmup_steps=20, start_lr=0, end_lr=0.5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -717,7 +728,7 @@ class LinearWarmup(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ @@ -725,7 +736,8 @@ class LinearWarmup(LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() # If you update learning rate each step + # 
scheduler.step() # If you update learning rate each epoch """ def __init__(self, @@ -814,14 +826,15 @@ class ExponentialDecay(LRScheduler): scheduler = paddle.optimizer.lr.ExponentialDecay(learning_rate=0.5, gamma=0.9, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -839,7 +852,7 @@ class ExponentialDecay(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ @@ -847,7 +860,8 @@ class ExponentialDecay(LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch """ def __init__(self, learning_rate, gamma, last_epoch=-1, verbose=False): @@ -901,14 +915,15 @@ class MultiStepDecay(LRScheduler): scheduler = paddle.optimizer.lr.MultiStepDecay(learning_rate=0.5, milestones=[2, 4, 6], gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -926,7 +941,7 @@ class MultiStepDecay(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ @@ -934,7 +949,8 @@ class MultiStepDecay(LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch """ def __init__(self, @@ -1008,14 +1024,15 @@ class StepDecay(LRScheduler): scheduler = paddle.optimizer.lr.StepDecay(learning_rate=0.5, step_size=5, gamma=0.8, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -1033,7 +1050,7 @@ class StepDecay(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ @@ -1041,7 +1058,8 @@ class StepDecay(LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch """ def 
__init__(self, @@ -1102,14 +1120,15 @@ class LambdaDecay(LRScheduler): scheduler = paddle.optimizer.lr.LambdaDecay(learning_rate=0.5, lr_lambda=lambda x:0.95**x, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -1127,7 +1146,7 @@ class LambdaDecay(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ @@ -1135,7 +1154,8 @@ class LambdaDecay(LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step() + scheduler.step() # If you update learning rate each step + # scheduler.step() # If you update learning rate each epoch """ @@ -1200,14 +1220,15 @@ class ReduceOnPlateau(LRScheduler): scheduler = paddle.optimizer.lr.ReduceOnPlateau(learning_rate=1.0, factor=0.5, patience=5, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) loss.backward() sgd.step() sgd.clear_gradients() - scheduler.step(loss) + scheduler.step(loss) # If you update learning rate each step + # scheduler.step(loss) # If you update learning rate each epoch # train on static graph mode paddle.enable_static() @@ -1225,7 +1246,7 @@ class ReduceOnPlateau(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ @@ -1233,7 +1254,8 @@ class ReduceOnPlateau(LRScheduler): 'y': np.random.randn(3, 4, 5).astype('float32') }, fetch_list=loss.name) - scheduler.step(out[0]) + scheduler.step(out[0]) # If you update learning rate each step + # scheduler.step(out[0]) # If you update learning rate each epoch """ @@ -1268,7 +1290,6 @@ def __init__(self, "The type of 'learning_rate' in 'ReduceOnPlateau' must be 'float', but received %s." 
% type(learning_rate)) - self.verbose = verbose self.patience = patience self.threshold = threshold self.threshold_mode = threshold_mode @@ -1406,7 +1427,7 @@ class CosineAnnealingDecay(LRScheduler): scheduler = paddle.optimizer.lr.CosineAnnealingDecay(learning_rate=0.5, T_max=10, verbose=True) sgd = paddle.optimizer.SGD(learning_rate=scheduler, parameters=linear.parameters()) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): x = paddle.uniform([10, 10]) out = linear(x) loss = paddle.mean(out) @@ -1431,7 +1452,7 @@ class CosineAnnealingDecay(LRScheduler): exe = paddle.static.Executor() exe.run(start_prog) for epoch in range(20): - for batch_id in range(2): + for batch_id in range(5): out = exe.run( main_prog, feed={ From c545b9b6731c8fc25f4283017bdb433a780907d3 Mon Sep 17 00:00:00 2001 From: channings Date: Fri, 13 Nov 2020 13:20:30 +0800 Subject: [PATCH 180/185] Add ONNX Exporter (#27831) * add onnx export module, test=develop * add unit test for paddle.onnx.export * adjust api & doc * fix some typo --- python/paddle/__init__.py | 1 + .../fluid/tests/unittests/test_onnx_export.py | 78 +++++++++++++ python/paddle/onnx/__init__.py | 18 +++ python/paddle/onnx/export.py | 105 ++++++++++++++++++ python/setup.py.in | 1 + python/unittest_py/requirements.txt | 1 + 6 files changed, 204 insertions(+) create mode 100644 python/paddle/fluid/tests/unittests/test_onnx_export.py create mode 100644 python/paddle/onnx/__init__.py create mode 100644 python/paddle/onnx/export.py diff --git a/python/paddle/__init__.py b/python/paddle/__init__.py index f66f52000b233..a650ec4faa17d 100755 --- a/python/paddle/__init__.py +++ b/python/paddle/__init__.py @@ -268,6 +268,7 @@ from . import jit from . import static from . import amp +from . import onnx # high-level api from .hapi import Model diff --git a/python/paddle/fluid/tests/unittests/test_onnx_export.py b/python/paddle/fluid/tests/unittests/test_onnx_export.py new file mode 100644 index 0000000000000..79d36063d77d5 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_onnx_export.py @@ -0,0 +1,78 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
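+
+# Note: paddle.onnx.export delegates the actual conversion to the optional
+# paddle2onnx package (loaded via paddle.utils.try_import), so these tests
+# require paddle2onnx to be installed.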
+
+from __future__ import print_function
+
+import os
+import pickle
+import unittest
+import numpy as np
+import paddle
+from paddle.static import InputSpec
+
+
+class LinearNet(paddle.nn.Layer):
+    def __init__(self):
+        super(LinearNet, self).__init__()
+        self._linear = paddle.nn.Linear(128, 10)
+
+    def forward(self, x):
+        return self._linear(x)
+
+
+class Logic(paddle.nn.Layer):
+    def __init__(self):
+        super(Logic, self).__init__()
+
+    def forward(self, x, y, z):
+        if z:
+            return x
+        else:
+            return y
+
+
+class TestExportWithInputSpec(unittest.TestCase):
+    def setUp(self):
+        self.x_spec = paddle.static.InputSpec(
+            shape=[None, 128], dtype='float32')
+
+    def test_with_input_spec(self):
+        model = LinearNet()
+        paddle.onnx.export(model, 'linear_net', input_spec=[self.x_spec])
+
+
+class TestExportWithTensor(unittest.TestCase):
+    def setUp(self):
+        self.x = paddle.to_tensor(np.random.random((1, 128)))
+
+    def test_with_tensor(self):
+        model = LinearNet()
+        paddle.onnx.export(model, 'linear_net', input_spec=[self.x])
+
+
+class TestExportPrunedGraph(unittest.TestCase):
+    def setUp(self):
+        self.x = paddle.to_tensor(np.array([1]))
+        self.y = paddle.to_tensor(np.array([-1]))
+
+    def test_prune_graph(self):
+        model = Logic()
+        paddle.jit.to_static(model)
+        out = model(self.x, self.y, z=True)
+        paddle.onnx.export(
+            model, 'pruned', input_spec=[self.x], output_spec=[out])
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/onnx/__init__.py b/python/paddle/onnx/__init__.py
new file mode 100644
index 0000000000000..885d1968ce1ae
--- /dev/null
+++ b/python/paddle/onnx/__init__.py
@@ -0,0 +1,18 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+from .export import export
+
+__all__ = ['export']
diff --git a/python/paddle/onnx/export.py b/python/paddle/onnx/export.py
new file mode 100644
index 0000000000000..4b99b42bb0423
--- /dev/null
+++ b/python/paddle/onnx/export.py
@@ -0,0 +1,105 @@
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+from paddle.utils import try_import
+
+__all__ = ['export']
+
+
+def export(layer, path, input_spec=None, opset_version=9, **configs):
+    """
+    Export a Layer to ONNX format, which can be used for inference via ONNX Runtime or other backends.
+    For more details, please refer to `paddle2onnx `_ .
+
+    Args:
+        layer (Layer): The Layer to be exported.
+        path (str): The path prefix for the exported model. The format is ``dirname/file_prefix`` or ``file_prefix`` ,
+            and the exported ONNX file suffix is ``.onnx`` .
+        input_spec (list[InputSpec|Tensor], optional): Describes the input of the exported model's forward
+            method, which can be described by InputSpec or an example Tensor. If None, all input variables of
+            the original Layer's forward method would be the inputs of the exported ``ONNX`` model. Default: None.
+        opset_version(int, optional): Opset version of the exported ONNX model.
+            Currently, the stably supported opset versions are 9, 10, and 11. Default: 9.
+        **configs (dict, optional): Other export configuration options for compatibility. We do not
+            recommend using these configurations; they may be removed in the future. If not necessary,
+            DO NOT use them. Default: None.
+            The following options are currently supported:
+            (1) output_spec (list[Tensor]): Selects the output targets of the exported model.
+            By default, all return variables of the original Layer's forward method are kept as the
+            output of the exported model. If the provided ``output_spec`` list is not all output variables,
+            the exported model will be pruned according to the given ``output_spec`` list.
+    Returns:
+        None
+    Examples:
+        .. code-block:: python
+
+            import paddle
+            import numpy as np
+
+            class LinearNet(paddle.nn.Layer):
+                def __init__(self):
+                    super(LinearNet, self).__init__()
+                    self._linear = paddle.nn.Linear(128, 10)
+
+                def forward(self, x):
+                    return self._linear(x)
+
+            # Export the model with 'InputSpec' to support dynamic input shapes.
+            def export_linear_net():
+                model = LinearNet()
+                x_spec = paddle.static.InputSpec(shape=[None, 128], dtype='float32')
+                paddle.onnx.export(model, 'linear_net', input_spec=[x_spec])
+
+            export_linear_net()
+
+            class Logic(paddle.nn.Layer):
+                def __init__(self):
+                    super(Logic, self).__init__()
+
+                def forward(self, x, y, z):
+                    if z:
+                        return x
+                    else:
+                        return y
+
+            # Export the model with 'Tensor', pruning its outputs via 'output_spec'.
+            def export_logic():
+                model = Logic()
+                x = paddle.to_tensor(np.array([1]))
+                y = paddle.to_tensor(np.array([2]))
+                # Convert to static graph and run the model once.
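+                # Running the traced model once produces the concrete output
+                # tensor that is passed to output_spec below for pruning.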
+ paddle.jit.to_static(model) + out = model(x, y, z=True) + paddle.onnx.export(model, 'pruned', input_spec=[x], output_spec=[out]) + + export_logic() + """ + + p2o = try_import('paddle2onnx') + + file_prefix = os.path.basename(path) + if file_prefix == "": + raise ValueError("The input path MUST be format of dirname/file_prefix " + "[dirname\\file_prefix in Windows system], but " + "the file_prefix is empty in received path: {}".format( + path)) + save_file = path + '.onnx' + + p2o.dygraph2onnx( + layer, + save_file, + input_spec=input_spec, + opset_version=opset_version, + **configs) diff --git a/python/setup.py.in b/python/setup.py.in index a4570c9d19563..df43e4a317117 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -213,6 +213,7 @@ packages=['paddle', 'paddle.static', 'paddle.static.nn', 'paddle.tensor', + 'paddle.onnx', ] with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f: diff --git a/python/unittest_py/requirements.txt b/python/unittest_py/requirements.txt index b61ba138441c9..19748f6f8f71b 100644 --- a/python/unittest_py/requirements.txt +++ b/python/unittest_py/requirements.txt @@ -4,3 +4,4 @@ pycrypto ; platform_system != "Windows" mock opencv-python<=4.2.0.32 visualdl ; python_version>="3.5" +paddle2onnx>=0.4 From c47bfe98cf7a3a3d6fba81e2b5e3eabc226abe5e Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Fri, 13 Nov 2020 13:24:20 +0800 Subject: [PATCH 181/185] Reduce input data size for syncbn unit test by 8x (#28571) this is a preemptive measure, the unit test was failing occasionally with cudnn internal error --- .../fluid/tests/unittests/test_sync_batch_norm_op.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py index bfd22dbe1cee6..9a380c886e915 100644 --- a/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_sync_batch_norm_op.py @@ -50,9 +50,9 @@ def setUp(self): """Setup.""" #self.dtype = np.float32 self.dtype = np.float64 - self.N = 32 + self.N = 8 self.C = 16 - self.H = 64 + self.H = 32 self.W = 32 self.dshape = [self.N, self.C, self.H, self.W] self.atol = 1e-3 @@ -196,9 +196,9 @@ class TestFP16SyncBatchNormOpTraining(TestSyncBatchNormOpTraining): def setUp(self): """Setup.""" self.dtype = np.float16 - self.N = 32 + self.N = 8 self.C = 16 - self.H = 64 + self.H = 32 self.W = 32 self.dshape = [self.N, self.C, self.H, self.W] self.atol = 1e-2 From 5579edfb3c864d11d467325c59e3d65e8a579795 Mon Sep 17 00:00:00 2001 From: LiuChiachi <709153940@qq.com> Date: Fri, 13 Nov 2020 13:32:40 +0800 Subject: [PATCH 182/185] save dtype of inputs (#28301) --- python/paddle/hapi/model.py | 49 +++++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 19 deletions(-) diff --git a/python/paddle/hapi/model.py b/python/paddle/hapi/model.py index 466b6f2e63ec5..d5d2ec70e9906 100644 --- a/python/paddle/hapi/model.py +++ b/python/paddle/hapi/model.py @@ -200,16 +200,22 @@ def _init_context(): return strategy -def _update_input_shapes(inputs): +def _update_input_info(inputs): "Get input shape list by given inputs in Model initialization." 
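+    # Both shapes and dtypes are collected here so that a model saved for
+    # inference keeps the dtype of every input, not just its shape.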
shapes = None + dtypes = None if isinstance(inputs, Input): shapes = [list(inputs.shape)] + dtypes = [inputs.dtype] elif isinstance(inputs, list): shapes = [list(input.shape) for input in inputs] + dtypes = [input.dtype for input in inputs] elif isinstance(inputs, dict): shapes = [list(inputs[name].shape) for name in inputs] - return shapes + dtypes = [inputs[name].dtype for name in inputs] + else: + return None + return shapes, dtypes class StaticGraphAdapter(object): @@ -617,7 +623,7 @@ def __init__(self, model): 'test_batch': 0 } - self._input_shapes = None + self._input_info = None if self._nranks > 1: stradegy = fluid.dygraph.parallel.ParallelStrategy() stradegy.nranks = ParallelEnv().nranks @@ -642,7 +648,7 @@ def train_batch(self, inputs, labels=None): self.model.network.train() self.mode = 'train' inputs = to_list(inputs) - self._input_shapes = _update_input_shapes(inputs) + self._input_info = _update_input_info(inputs) labels = labels or [] labels = [to_variable(l) for l in to_list(labels)] @@ -679,7 +685,7 @@ def eval_batch(self, inputs, labels=None): self.model.network.eval() self.mode = 'eval' inputs = to_list(inputs) - self._input_shapes = _update_input_shapes(inputs) + self._input_info = _update_input_info(inputs) labels = labels or [] labels = [to_variable(l) for l in to_list(labels)] @@ -728,7 +734,7 @@ def predict_batch(self, inputs): self.model.network.eval() self.mode = 'test' inputs = [to_variable(x) for x in to_list(inputs)] - self._input_shapes = _update_input_shapes(inputs) + self._input_info = _update_input_info(inputs) outputs = self.model.network.forward(*inputs) if self._nranks > 1 and isinstance(self.model._place, fluid.CUDAPlace): outputs = [_all_gather(o, self._nranks) for o in to_list(outputs)] @@ -875,7 +881,7 @@ def __init__(self, network, inputs=None, labels=None): self._loss = None self._loss_weights = None self._optimizer = None - self._input_shapes = None + self._input_info = None self._is_shape_inferred = False self._test_dataloader = None @@ -884,7 +890,7 @@ def __init__(self, network, inputs=None, labels=None): raise TypeError( "'inputs' must be list or dict, and couldn't be None.") elif inputs: - self._input_shapes = _update_input_shapes(inputs) + self._input_info = _update_input_info(inputs) self._inputs = self._verify_spec(inputs, is_input=True) self._labels = self._verify_spec(labels) @@ -941,7 +947,7 @@ def train_batch(self, inputs, labels=None): print(loss) """ loss = self._adapter.train_batch(inputs, labels) - if fluid.in_dygraph_mode() and self._input_shapes is None: + if fluid.in_dygraph_mode() and self._input_info is None: self._update_inputs() return loss @@ -992,7 +998,7 @@ def eval_batch(self, inputs, labels=None): print(loss) """ loss = self._adapter.eval_batch(inputs, labels) - if fluid.in_dygraph_mode() and self._input_shapes is None: + if fluid.in_dygraph_mode() and self._input_info is None: self._update_inputs() return loss @@ -1036,7 +1042,7 @@ def predict_batch(self, inputs): print(out) """ loss = self._adapter.predict_batch(inputs) - if fluid.in_dygraph_mode() and self._input_shapes is None: + if fluid.in_dygraph_mode() and self._input_info is None: self._update_inputs() return loss @@ -1750,14 +1756,15 @@ def get_inout_spec(all_vars, return_name=False): if fluid.in_dygraph_mode(): with fluid.framework._dygraph_guard(None): layer = self.network - if self._input_shapes is None: # No provided or inferred + if self._input_info is None: # No provided or inferred raise RuntimeError( "Saving inference model needs 'inputs' or running 
before saving. Please specify 'inputs' in Model initialization or feed input data and run the model once so that input shapes can be derived."
+                    )
+                if self._is_shape_inferred:
+                    warnings.warn(
+                        "'inputs' was not specified in Model initialization, so the input shape to be saved will be the shape derived from the user's actual inputs. The input shape to be saved is %s. For saving correct input shapes, please provide 'inputs' for Model initialization."
+                        % self._input_info[0])
+
             layer.forward = paddle.jit.to_static(
                 layer.forward, input_spec=self._inputs)
@@ -1945,7 +1952,7 @@ def summary(self, input_size=None, dtype=None):
             _input_size = self._inputs
         return summary(self.network, _input_size, dtype)
 
-    def _verify_spec(self, specs, shapes=None, is_input=False):
+    def _verify_spec(self, specs, shapes=None, dtypes=None, is_input=False):
         out_specs = []
 
         if specs is None:
@@ -1954,10 +1961,12 @@ def _verify_spec(self, specs, shapes=None, is_input=False):
             if is_input:
                 arg_names = extract_args(self.network.forward)[1:]
-                if shapes is not None and fluid.in_dygraph_mode():
+                # While saving an inference model in dygraph, inputs may be provided only at run time.
+                if shapes is not None and dtypes is not None and fluid.in_dygraph_mode(
+                ):
                     out_specs = [
                         Input(
-                            name=n, shape=shapes[i])
+                            name=n, dtype=dtypes[i], shape=shapes[i])
                         for i, n in enumerate(arg_names)
                     ]
             else:
@@ -2000,6 +2009,8 @@ def _len_data_loader(self, data_loader):
     def _update_inputs(self):
         "Update self._inputs according to given inputs."
-        self._input_shapes = self._adapter._input_shapes
-        self._is_shape_inferred = True
-        self._inputs = self._verify_spec(None, self._input_shapes, True)
+        self._input_info = self._adapter._input_info
+        if self._input_info is not None and len(self._input_info) == 2:
+            self._inputs = self._verify_spec(None, self._input_info[0],
+                                             self._input_info[1], True)
+            self._is_shape_inferred = True

From a829357e4d2e6f05060455625b4e7cfbb1bff8dd Mon Sep 17 00:00:00 2001
From: Zhong Hui
Date: Fri, 13 Nov 2020 14:55:48 +0800
Subject: [PATCH 183/185] register the op version for some ops

register the op version for some ops
---
 paddle/fluid/operators/clip_op.cc             | 14 ++++++++
 .../fluid/operators/controlflow/compare_op.cc | 35 ++++++++++++-------
 2 files changed, 37 insertions(+), 12 deletions(-)

diff --git a/paddle/fluid/operators/clip_op.cc b/paddle/fluid/operators/clip_op.cc
index f727f63eb61d6..ad61d61d4cc81 100644
--- a/paddle/fluid/operators/clip_op.cc
+++ b/paddle/fluid/operators/clip_op.cc
@@ -14,6 +14,8 @@ limitations under the License. */
 
 #include "paddle/fluid/operators/clip_op.h"
 #include <memory>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/op_version_registry.h"
 
 namespace paddle {
 namespace operators {
@@ -122,3 +124,15 @@ REGISTER_OP_CPU_KERNEL(
     clip, ops::ClipKernel<paddle::platform::CPUDeviceContext, float>,
     ops::ClipKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
     clip_grad, ops::ClipGradKernel<paddle::platform::CPUDeviceContext, float>,
     ops::ClipGradKernel<paddle::platform::CPUDeviceContext, double>);
+
+REGISTER_OP_VERSION(clip)
+    .AddCheckpoint(
+        R"ROC(
+              Upgrade clip, add new inputs [Min] and [Max])ROC",
+        paddle::framework::compatible::OpVersionDesc()
+            .NewInput("Min",
+                      "Pass the min value as an input instead of an "
+                      "attribute. Min is dispensable.")
+            .NewInput("Max",
+                      "Pass the max value as an input instead of an "
+                      "attribute. Max is dispensable."));
diff --git a/paddle/fluid/operators/controlflow/compare_op.cc b/paddle/fluid/operators/controlflow/compare_op.cc
index 4940649c2a326..21c28f9818b51 100644
--- a/paddle/fluid/operators/controlflow/compare_op.cc
+++ b/paddle/fluid/operators/controlflow/compare_op.cc
@@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { @@ -128,18 +129,28 @@ class CompareOp : public framework::OperatorWithKernel { } // namespace operators } // namespace paddle -#define REGISTER_COMPARE_OP(op_type, _equation) \ - struct _##op_type##Comment { \ - static char type[]; \ - static char equation[]; \ - }; \ - char _##op_type##Comment::type[]{#op_type}; \ - char _##op_type##Comment::equation[]{_equation}; \ - REGISTER_OPERATOR( \ - op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ - ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ - ::paddle::framework::EmptyGradOpMaker, \ - ::paddle::framework::EmptyGradOpMaker); +#define REGISTER_COMPARE_OP_VERSION(op_type) \ + REGISTER_OP_VERSION(op_type) \ + .AddCheckpoint( \ + R"ROC(Upgrade compare ops, add a new attribute [force_cpu])ROC", \ + paddle::framework::compatible::OpVersionDesc().NewAttr( \ + "force_cpu", \ + "In order to force fill output variable to cpu memory.", \ + false)); + +#define REGISTER_COMPARE_OP(op_type, _equation) \ + struct _##op_type##Comment { \ + static char type[]; \ + static char equation[]; \ + }; \ + char _##op_type##Comment::type[]{#op_type}; \ + char _##op_type##Comment::equation[]{_equation}; \ + REGISTER_OPERATOR( \ + op_type, ::paddle::operators::CompareOp<_##op_type##Comment>, \ + ::paddle::operators::CompareOpProtoMaker<_##op_type##Comment>, \ + ::paddle::framework::EmptyGradOpMaker, \ + ::paddle::framework::EmptyGradOpMaker); \ + REGISTER_COMPARE_OP_VERSION(op_type); REGISTER_COMPARE_OP(less_than, "Out = X < Y"); REGISTER_COMPARE_KERNEL(less_than, CPU, paddle::operators::LessThanFunctor); From ed9dd7c9f0036c55745b441ea9cae79c3cf602b2 Mon Sep 17 00:00:00 2001 From: lilong12 Date: Fri, 13 Nov 2020 15:49:55 +0800 Subject: [PATCH 184/185] add send and recv ops (#28590) * update, test=develop --- .../fluid/operators/collective/recv_v2_op.cc | 91 +++++++++++++++ .../operators/collective/recv_v2_op.cu.cc | 104 ++++++++++++++++++ .../fluid/operators/collective/recv_v2_op.h | 37 +++++++ .../fluid/operators/collective/send_v2_op.cc | 77 +++++++++++++ .../operators/collective/send_v2_op.cu.cc | 90 +++++++++++++++ .../fluid/operators/collective/send_v2_op.h | 38 +++++++ .../fluid/tests/unittests/CMakeLists.txt | 1 + .../tests/unittests/collective_sendrecv_op.py | 74 +++++++++++++ .../tests/unittests/test_collective_base.py | 6 + .../unittests/test_collective_sendrecv.py | 34 ++++++ 10 files changed, 552 insertions(+) create mode 100644 paddle/fluid/operators/collective/recv_v2_op.cc create mode 100644 paddle/fluid/operators/collective/recv_v2_op.cu.cc create mode 100644 paddle/fluid/operators/collective/recv_v2_op.h create mode 100644 paddle/fluid/operators/collective/send_v2_op.cc create mode 100644 paddle/fluid/operators/collective/send_v2_op.cu.cc create mode 100644 paddle/fluid/operators/collective/send_v2_op.h create mode 100644 python/paddle/fluid/tests/unittests/collective_sendrecv_op.py create mode 100644 python/paddle/fluid/tests/unittests/test_collective_sendrecv.py diff --git a/paddle/fluid/operators/collective/recv_v2_op.cc b/paddle/fluid/operators/collective/recv_v2_op.cc new file mode 100644 index 0000000000000..10408820387b7 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op.cc @@ -0,0 +1,91 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/recv_v2_op.h" +#include + +namespace paddle { +namespace operators { + +class RecvOpV2 : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Recv_V2"); + int peer = ctx->Attrs().Get("peer"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + peer, 0, + platform::errors::InvalidArgument( + "The peer (%d) for recv_v2 op must be non-negative.", peer)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for recv_v2 op must be non-negative.", ring_id)); + auto out_shape = ctx->Attrs().Get>("out_shape"); + PADDLE_ENFORCE_GE(out_shape.size(), 1, + platform::errors::InvalidArgument( + "The size of the output shape must be greater than 0 " + "but the value given is %d.", + out_shape.size())); + ctx->SetOutputDim("Out", framework::make_ddim(out_shape)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + int dtype = ctx.Attr("dtype"); + framework::proto::VarType::Type type = + framework::proto::VarType::Type(dtype); + return framework::OpKernelType(type, ctx.GetPlace()); + } +}; + +class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddOutput("Out", "(Tensor) tensor to receive."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("peer", "(int default 0) rank id for sender.").SetDefault(0); + AddAttr("dtype", "(int default 5('float32')) data type of tensor.") + .SetDefault(5); + AddAttr>("out_shape", "shape of the output tensor.") + .SetDefault(std::vector()); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(R"DOC( +Recv Operator + +Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html#sendrecv +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(recv_v2, ops::RecvOpV2, ops::RecvOpV2Maker); + +REGISTER_OP_CPU_KERNEL(recv_v2, ops::RecvOpV2CPUKernel, + ops::RecvOpV2CPUKernel, + ops::RecvOpV2CPUKernel, + ops::RecvOpV2CPUKernel, + ops::RecvOpV2CPUKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc new file mode 100644 index 0000000000000..f0dd8aee23588 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/collective/recv_v2_op.h" + +#if defined(PADDLE_WITH_NCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +template +class RecvOpV2CUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + int rid = ctx.Attr("ring_id"); + PADDLE_ENFORCE_GE( + rid, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for recv_v2 op must be non-negative.", rid)); + + int peer = ctx.Attr("peer"); + PADDLE_ENFORCE_GE( + peer, 0, + platform::errors::InvalidArgument( + "The peer (%d) for recv_v2 op must be non-negative.", peer)); + + auto out = ctx.Output("Out"); + auto out_dims = out->dims(); + int data_type = ctx.Attr("dtype"); + framework::proto::VarType::Type type = + framework::proto::VarType::Type(data_type); + +#if defined(PADDLE_WITH_NCCL) && NCCL_VERSION_CODE >= 2703 + cudaStream_t stream = nullptr; + auto place = ctx.GetPlace(); + auto comm = platform::NCCLCommContext::Instance().Get(rid, place); + if (ctx.Attr("use_calc_stream")) { + auto dev_ctx = platform::DeviceContextPool::Instance().Get(place); + stream = static_cast(dev_ctx)->stream(); + } else { + stream = comm->stream(); + } + + PADDLE_ENFORCE_LT( + peer, comm->nranks(), + platform::errors::InvalidArgument("The value of peer (%d) you set must " + "be less than comm->nranks (%d).", + peer, comm->nranks())); + ncclDataType_t dtype = platform::ToNCCLDataType(type); + + // Recv the number of elements to receive first + int numel = 0; + int *numel_ptr = nullptr; + PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&numel_ptr, sizeof(int))); + PADDLE_ENFORCE_CUDA_SUCCESS( + platform::dynload::ncclRecv(static_cast(numel_ptr), 1, ncclInt, + peer, comm->comm(), stream)); + PADDLE_ENFORCE_CUDA_SUCCESS( + cudaMemcpy(&numel, numel_ptr, sizeof(int), cudaMemcpyDeviceToHost)); + + int rest_numel = 1; + for (int i = 1; i < out_dims.size(); ++i) { + rest_numel = rest_numel * out_dims[i]; + } + out_dims[0] = numel / rest_numel; + out->mutable_data(out_dims, place); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclRecv( + out->data(), numel, dtype, peer, comm->comm(), stream)); + VLOG(3) << "rank " << comm->rank() << " recv " + << framework::product(out->dims()) << " from " << peer; +#else + PADDLE_THROW(platform::errors::Unavailable( + "PaddlePaddle should be compiled with NCCL and " + "NCCL version >= 2.7.3 is needed.")); +#endif + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(recv_v2, ops::RecvOpV2CUDAKernel, + ops::RecvOpV2CUDAKernel, + ops::RecvOpV2CUDAKernel, + ops::RecvOpV2CUDAKernel, + ops::RecvOpV2CUDAKernel); diff --git a/paddle/fluid/operators/collective/recv_v2_op.h b/paddle/fluid/operators/collective/recv_v2_op.h new file mode 100644 index 0000000000000..f9e21003f8f34 --- /dev/null +++ b/paddle/fluid/operators/collective/recv_v2_op.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +template +class RecvOpV2CPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support recv for cpu kernel now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/collective/send_v2_op.cc b/paddle/fluid/operators/collective/send_v2_op.cc new file mode 100644 index 0000000000000..c5a86b4f08813 --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op.cc @@ -0,0 +1,77 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/collective/send_v2_op.h" + +namespace paddle { +namespace operators { + +class SendOpV2 : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "SendV2"); + int peer = ctx->Attrs().Get("peer"); + int ring_id = ctx->Attrs().Get("ring_id"); + PADDLE_ENFORCE_GE( + peer, 0, + platform::errors::InvalidArgument( + "The peer (%d) for send_v2 op must be non-negative.", peer)); + PADDLE_ENFORCE_GE( + ring_id, 0, + platform::errors::InvalidArgument( + "The ring_id (%d) for send_v2 op must be non-negative.", ring_id)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace()); + } +}; + +class SendOpV2Maker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("X", "(Tensor) tensor to be sent."); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("peer", "(int default 0) rank id for receiver.").SetDefault(0); + AddAttr( + "use_calc_stream", + "(bool default false) eject CUDA operations to calculation stream.") + .SetDefault(false); + AddComment(R"DOC( +Send Operator + +Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html#sendrecv +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_WITHOUT_GRADIENT(send_v2, ops::SendOpV2, ops::SendOpV2Maker); + +REGISTER_OP_CPU_KERNEL(send_v2, ops::SendOpV2CPUKernel, + ops::SendOpV2CPUKernel, + ops::SendOpV2CPUKernel, + ops::SendOpV2CPUKernel, + ops::SendOpV2CPUKernel); diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc new file mode 100644 index 0000000000000..9f925b2eede02 --- /dev/null +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -0,0 +1,90 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+
+#include "paddle/fluid/operators/collective/send_v2_op.h"
+
+#if defined(PADDLE_WITH_NCCL)
+#include "paddle/fluid/platform/collective_helper.h"
+#include "paddle/fluid/platform/nccl_helper.h"
+#endif
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class SendOpV2CUDAKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    auto x = ctx.Input<framework::LoDTensor>("X");
+    int numel = x->numel();
+
+    int rid = ctx.Attr<int>("ring_id");
+    PADDLE_ENFORCE_GE(
+        rid, 0,
+        platform::errors::InvalidArgument(
+            "The ring_id (%d) for send_v2 op must be non-negative.", rid));
+
+    int peer = ctx.Attr<int>("peer");
+    PADDLE_ENFORCE_GE(
+        peer, 0,
+        platform::errors::InvalidArgument(
+            "The peer (%d) for send_v2 op must be non-negative.", peer));
+    cudaStream_t stream = nullptr;
+    auto place = ctx.GetPlace();
+#if defined(PADDLE_WITH_NCCL) && NCCL_VERSION_CODE >= 2703
+    auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
+    if (ctx.Attr<bool>("use_calc_stream")) {
+      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
+      stream = static_cast<platform::CUDADeviceContext*>(dev_ctx)->stream();
+    } else {
+      stream = comm->stream();
+    }
+    PADDLE_ENFORCE_LT(
+        peer, comm->nranks(),
+        platform::errors::InvalidArgument("The value of peer (%d) you set must "
+                                          "be less than comm->nranks (%d).",
+                                          peer, comm->nranks()));
+    ncclDataType_t dtype = platform::ToNCCLDataType(x->type());
+    // Send number of elements to the receiver, as the receiver may have
+    // no information of the Tensor size.
+    int* numel_ptr = nullptr;
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaMalloc(&numel_ptr, sizeof(int)));
+    PADDLE_ENFORCE_CUDA_SUCCESS(
+        cudaMemcpy(numel_ptr, &numel, sizeof(int), cudaMemcpyHostToDevice));
+
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend(
+        numel_ptr, 1, ncclInt, peer, comm->comm(), stream));
+    PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclSend(
+        x->data<T>(), numel, dtype, peer, comm->comm(), stream));
+    VLOG(3) << "rank " << comm->rank() << " send "
+            << framework::product(x->dims()) << " to " << peer;
+#else
+    PADDLE_THROW(platform::errors::Unavailable(
+        "PaddlePaddle should be compiled with NCCL "
+        "and NCCL version >= 2.7.3 is needed."));
+#endif
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+REGISTER_OP_CUDA_KERNEL(send_v2, ops::SendOpV2CUDAKernel<float>,
+                        ops::SendOpV2CUDAKernel<double>,
+                        ops::SendOpV2CUDAKernel<int>,
+                        ops::SendOpV2CUDAKernel<int64_t>,
+                        ops::SendOpV2CUDAKernel<plat::float16>);
diff --git a/paddle/fluid/operators/collective/send_v2_op.h b/paddle/fluid/operators/collective/send_v2_op.h
new file mode 100644
index 0000000000000..6215fb1f3b643
--- /dev/null
+++ b/paddle/fluid/operators/collective/send_v2_op.h
@@ -0,0 +1,38 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "paddle/fluid/framework/data_type.h"
+#include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class SendOpV2CPUKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& ctx) const override {
+    PADDLE_THROW(platform::errors::Unavailable(
+        "Do not support send for cpu kernel now."));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt
index 01c5cfa0aaee3..6fcc8b9691703 100644
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
@@ -64,6 +64,7 @@ if(NOT WITH_GPU OR WIN32)
     LIST(REMOVE_ITEM TEST_OPS test_broadcast)
     LIST(REMOVE_ITEM TEST_OPS test_collective_reduce)
     LIST(REMOVE_ITEM TEST_OPS test_collective_scatter)
+    LIST(REMOVE_ITEM TEST_OPS test_collective_sendrecv)
     LIST(REMOVE_ITEM TEST_OPS test_reducescatter)
     LIST(REMOVE_ITEM TEST_OPS test_reducescatter_api)
     LIST(REMOVE_ITEM TEST_OPS test_collective_reduce_api)
diff --git a/python/paddle/fluid/tests/unittests/collective_sendrecv_op.py b/python/paddle/fluid/tests/unittests/collective_sendrecv_op.py
new file mode 100644
index 0000000000000..0a1967aa658ed
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/collective_sendrecv_op.py
@@ -0,0 +1,74 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import numpy as np
+import argparse
+import os
+import sys
+import signal
+import time
+import socket
+from contextlib import closing
+from six import string_types
+import math
+import paddle
+import paddle.fluid as fluid
+import paddle.fluid.profiler as profiler
+import paddle.fluid.unique_name as nameGen
+from paddle.fluid import core
+import unittest
+from multiprocessing import Process
+import paddle.fluid.layers as layers
+from functools import reduce
+from test_collective_base import TestCollectiveRunnerBase, runtime_main
+
+paddle.enable_static()
+
+
+class TestCollectiveSendRecv(TestCollectiveRunnerBase):
+    def __init__(self):
+        self.global_ring_id = 0
+
+    def get_model(self, main_prog, startup_program):
+        ring_id = self.global_ring_id
+        with fluid.program_guard(main_prog, startup_program):
+            tindata = layers.data(
+                name="tindata", shape=[10, 1000], dtype='float64')
+            if self.rank == 0:
+                main_prog.global_block().append_op(
+                    type="send_v2",
+                    inputs={'X': tindata},
+                    attrs={
+                        'ring_id': ring_id,
+                        'peer': 1,
+                        'use_calc_stream': True
+                    })
+            else:
+                main_prog.global_block().append_op(
+                    type="recv_v2",
+                    outputs={'Out': tindata},
+                    attrs={
+                        'peer': 0,
+                        'ring_id': ring_id,
+                        'dtype': tindata.dtype,
+                        'out_shape': tindata.shape,
+                        'use_calc_stream': True,
+                    })
+            return tindata
+
+
+if __name__ == "__main__":
+    runtime_main(TestCollectiveSendRecv, "sendrecv", 0)
diff --git a/python/paddle/fluid/tests/unittests/test_collective_base.py b/python/paddle/fluid/tests/unittests/test_collective_base.py
index 512b2967e02fd..fc267ed914ec2 100644
--- a/python/paddle/fluid/tests/unittests/test_collective_base.py
+++ b/python/paddle/fluid/tests/unittests/test_collective_base.py
@@ -103,6 +103,7 @@ def run_trainer(self, args):
         nranks = 2
         self.initCommunicator(startup_prog, rank, nranks, True,
                               current_endpoint, endpoints)
+        self.rank = rank
         result = self.get_model(train_prog, startup_prog)
         device_id = int(os.getenv("FLAGS_selected_gpus", "0"))
         place = fluid.CUDAPlace(
@@ -268,6 +269,11 @@ def check_with_place(self,
             self.assertTrue(
                 np.allclose(
                     tr1_out, need_result2, rtol=1e-05, atol=1e-05))
+        elif col_type == "sendrecv":
+            need_result = input1
+            self.assertTrue(
+                np.allclose(
+                    tr1_out, need_result, rtol=1e-05, atol=1e-05))
         elif col_type == "reduce_slicegather":
             slicesize = input1.shape[0] // 2
             tmp10 = input1[0:slicesize]
diff --git a/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py b/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py
new file mode 100644
index 0000000000000..67c84a71bb335
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_collective_sendrecv.py
@@ -0,0 +1,34 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+import unittest
+import numpy as np
+import paddle
+
+from test_collective_base import TestDistBase
+
+paddle.enable_static()
+
+
+class TestSendRecvOp(TestDistBase):
+    def _setup_config(self):
+        pass
+
+    def test_sendrecv(self):
+        self.check_with_place("collective_sendrecv_op.py", "sendrecv")
+
+
+if __name__ == '__main__':
+    unittest.main()

From 1adc09b56ad3b5386cf395741113341d68c79113 Mon Sep 17 00:00:00 2001
From: Huihuang Zheng
Date: Fri, 13 Nov 2020 17:01:32 +0800
Subject: [PATCH 185/185] Make Batch Size Smaller on Mac Because of CI Machine (#28569)

I found the unit test failed because of the batch size. The likely reason is
that our CI machine has limited memory, so I decreased the batch size.
---
 .../paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
index b883f1820c1b1..d8f088019ba46 100644
--- a/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
+++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/yolov3.py
@@ -15,6 +15,9 @@
 from __future__ import division
 from __future__ import print_function
 
+import os
+import sys
+
 import paddle.fluid as fluid
 from paddle.fluid.dygraph import declarative
 from paddle.fluid.dygraph.base import to_variable
@@ -90,7 +93,7 @@ def __setattr__(self, name, value):
 # SOLVER options
 #
 # batch size
-cfg.batch_size = 4
+cfg.batch_size = 2 if sys.platform == 'darwin' or os.name == 'nt' else 4
 # derived learning rate the to get the final learning rate.
 cfg.learning_rate = 0.001
 # maximum number of iterations