From 4de4ef5706321268fcac2448a70ef6b80ca1b86c Mon Sep 17 00:00:00 2001 From: Zheng Date: Wed, 8 Jan 2020 16:07:14 -0800 Subject: [PATCH 01/10] refactor optimizer --- python/mxnet/gluon/trainer.py | 18 +- python/mxnet/optimizer/__init__.py | 45 +- python/mxnet/optimizer/adadelta.py | 112 + python/mxnet/optimizer/adagrad.py | 145 ++ python/mxnet/optimizer/adam.py | 188 ++ python/mxnet/optimizer/adamax.py | 112 + python/mxnet/optimizer/contrib.py | 136 +- python/mxnet/optimizer/dcasgd.py | 117 ++ python/mxnet/optimizer/ftml.py | 160 ++ python/mxnet/optimizer/ftrl.py | 171 ++ python/mxnet/optimizer/lamb.py | 263 +++ python/mxnet/optimizer/lars.py | 282 +++ python/mxnet/optimizer/nadam.py | 125 ++ python/mxnet/optimizer/nag.py | 166 ++ python/mxnet/optimizer/optimizer.py | 1829 ++--------------- python/mxnet/optimizer/rmsprop.py | 185 ++ python/mxnet/optimizer/sgd.py | 247 +++ python/mxnet/optimizer/sgld.py | 89 + python/mxnet/optimizer/signum.py | 162 ++ python/mxnet/optimizer/updater.py | 142 ++ python/mxnet/optimizer/utils.py | 43 + python/mxnet/test_utils.py | 106 +- src/operator/contrib/optimizer_op-inl.h | 2 +- src/operator/contrib/optimizer_op.cc | 2 +- src/operator/optimizer_op-inl.h | 277 +-- src/operator/optimizer_op.cc | 49 +- src/operator/optimizer_op.cu | 18 +- src/optimizer/sgd-inl.h | 4 +- .../python/unittest/test_contrib_optimizer.py | 73 +- tests/python/unittest/test_optimizer.py | 1584 ++++++-------- 30 files changed, 3788 insertions(+), 3064 deletions(-) create mode 100644 python/mxnet/optimizer/adadelta.py create mode 100644 python/mxnet/optimizer/adagrad.py create mode 100644 python/mxnet/optimizer/adam.py create mode 100644 python/mxnet/optimizer/adamax.py create mode 100644 python/mxnet/optimizer/dcasgd.py create mode 100644 python/mxnet/optimizer/ftml.py create mode 100644 python/mxnet/optimizer/ftrl.py create mode 100644 python/mxnet/optimizer/lamb.py create mode 100644 python/mxnet/optimizer/lars.py create mode 100644 python/mxnet/optimizer/nadam.py create mode 100644 python/mxnet/optimizer/nag.py create mode 100644 python/mxnet/optimizer/rmsprop.py create mode 100644 python/mxnet/optimizer/sgd.py create mode 100644 python/mxnet/optimizer/sgld.py create mode 100644 python/mxnet/optimizer/signum.py create mode 100644 python/mxnet/optimizer/updater.py create mode 100644 python/mxnet/optimizer/utils.py diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index a27c951c01b9..85e7409cde92 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -60,9 +60,11 @@ class Trainer(object): Arguments would then be {'type':'2bit', 'threshold':0.5} See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. update_on_kvstore : bool, default None - Whether to perform parameter updates on kvstore. If None, then trainer will choose the more - suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is - provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. + Whether to perform parameter updates on kvstore. If None and optimizer.aggregate_num <= 1, + then trainer will choose the more suitable option depending on the type of kvstore. + If None and optimizer.aggregate_num > 1, `update_on_kvstore` is set to False. + If the `update_on_kvstore` argument is provided, + environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. 
Properties ---------- @@ -103,6 +105,12 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', optimizer_params = optimizer_params if optimizer_params else {} self._init_optimizer(optimizer, optimizer_params) self._scale = self._optimizer.rescale_grad + if self._optimizer.aggregate_num > 1 and update_on_kvstore is not None: + if update_on_kvstore: + raise ValueError("Cannot set update_on_kvstore=True " + "when optimizer.aggregate_num > 1.") + if update_on_kvstore is None and self._optimizer.aggregate_num > 1: + update_on_kvstore = False self._kvstore_params = {'kvstore': kvstore, 'update_on_kvstore': update_on_kvstore} self._kv_initialized = False self._kvstore = None @@ -457,8 +465,8 @@ def _update(self, ignore_stale_grad=False): if not (self._kvstore and self._update_on_kvstore): for updater, upd in zip(self._updaters, updates): if upd: - i, w, g = zip(*upd) - updater(i, w, g) + i, g, w = zip(*upd) + updater(i, g, w) def save_states(self, fname): """Saves trainer states (e.g. optimizer, momentum) to a file. diff --git a/python/mxnet/optimizer/__init__.py b/python/mxnet/optimizer/__init__.py index 72eb5a741520..89b37de1c873 100644 --- a/python/mxnet/optimizer/__init__.py +++ b/python/mxnet/optimizer/__init__.py @@ -16,9 +16,48 @@ # under the License. """Optimizer API of MXNet.""" -from . import optimizer, contrib +from . import (optimizer, contrib, updater, utils, sgd, + sgld, signum, dcasgd, nag, adagrad, + adadelta, adam, adamax, nadam, ftrl, + ftml, lars, lamb, rmsprop) # pylint: disable=wildcard-import from .optimizer import * -# pylint: enable=wildcard-import -__all__ = optimizer.__all__ + ['contrib'] +from .updater import * + +from .utils import * + +from .sgd import * + +from .sgld import * + +from .signum import * + +from .dcasgd import * + +from .nag import * + +from .adagrad import * + +from .adadelta import * + +from .adam import * + +from .adamax import * + +from .nadam import * + +from .ftrl import * + +from .ftml import * + +from .lars import * + +from .lamb import * + +from .rmsprop import * + +__all__ = optimizer.__all__ + updater.__all__ + ['contrib'] + sgd.__all__ + sgld.__all__ \ + + signum.__all__ + dcasgd.__all__ + nag.__all__ + adagrad.__all__ + adadelta.__all__ \ + + adam.__all__ + adamax.__all__ + nadam.__all__ + ftrl.__all__ + ftml.__all__ \ + + lars.__all__ + lamb.__all__ + rmsprop.__all__ diff --git a/python/mxnet/optimizer/adadelta.py b/python/mxnet/optimizer/adadelta.py new file mode 100644 index 000000000000..0c6fdd02aaca --- /dev/null +++ b/python/mxnet/optimizer/adadelta.py @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
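A minimal sketch, assuming an MXNet build with this patch applied, of how the new `aggregate_num` handling in `Trainer.__init__` behaves; the tiny layer and the optimizer choice here are only for illustration::

    from mxnet.gluon import nn, Trainer

    net = nn.Dense(2, in_units=2)
    net.initialize()

    # LAMB defaults to aggregate_num=4 in this refactor, so leaving
    # update_on_kvstore unset makes the Trainer fall back to False.
    trainer = Trainer(net.collect_params(), 'lamb', {'learning_rate': 1e-3})

    # Explicitly requesting update_on_kvstore=True while aggregate_num > 1
    # hits the new ValueError raised in Trainer.__init__.
    try:
        Trainer(net.collect_params(), 'lamb', update_on_kvstore=True)
    except ValueError as err:
        print(err)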
+ +# pylint: disable=too-many-lines +"""AdaDelta optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from .optimizer import Optimizer, register + +__all__ = ['AdaDelta'] + + +@register +class AdaDelta(Optimizer): + """The AdaDelta optimizer. + + This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive + learning rate method*, available at https://arxiv.org/abs/1212.5701. + + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + acc_grad = rho * acc_grad + (1. - rho) * grad * grad + delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad + acc_delta = rho * acc_delta + (1. - rho) * delta * delta + weight -= learning_rate * delta + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 1.0 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + rho: float, default 0.9 + Decay rate for both squared gradients and delta. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=1.0, rho=0.9, epsilon=1e-6, use_fused_step=False, **kwargs): + super(AdaDelta, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.rho = rho + self.epsilon = epsilon + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context), # accumulated g + zeros(weight.shape, weight.context)) # accumulated delta + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + acc_g, acc_delta = state + + # update g, delta + acc_g[:] *= self.rho + acc_g[:] += (1. - self.rho) * square(grad) + current_delta = sqrt(acc_delta + self.epsilon) + current_delta /= sqrt(acc_g + self.epsilon) + current_delta *= grad + acc_delta[:] *= self.rho + acc_delta[:] += (1. 
- self.rho) * square(current_delta) + + # update weight + weight[:] -= lr * current_delta diff --git a/python/mxnet/optimizer/adagrad.py b/python/mxnet/optimizer/adagrad.py new file mode 100644 index 000000000000..8e181fda90a6 --- /dev/null +++ b/python/mxnet/optimizer/adagrad.py @@ -0,0 +1,145 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""AdaGrad optimizer""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import sparse +from .optimizer import Optimizer, register + +__all__ = ['AdaGrad'] + + +@register +class AdaGrad(Optimizer): + """AdaGrad optimizer. + + This class implements the AdaGrad optimizer described in *Adaptive Subgradient + Methods for Online Learning and Stochastic Optimization*, and available at + http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. + + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + history += square(grad) + weight -= learning_rate * grad / (sqrt(history) + epsilon) + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + See Also + ---------- + :meth:`mxnet.ndarray.sparse.adagrad_update`. + + Parameters + ---------- + learning_rate : float, default 0.01 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False or grad is not sparse, step is called, + otherwise, fused_step is called. + + """ + def __init__(self, learning_rate=0.01, epsilon=1e-6, use_fused_step=True, **kwargs): + super(AdaGrad, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.epsilon = epsilon + + def create_state(self, index, weight): + return zeros(weight.shape, weight.context, stype=weight.stype) # history + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. 
+ states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update history + history = state + history[:] += square(grad) + d = grad / (sqrt(history) + self.epsilon) + + # update weight + weight[:] -= lr * d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + is_sparse = grad.stype == 'row_sparse' + + if is_sparse: + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + kwargs = {'epsilon': self.epsilon, 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + history = state + + # When grad is sparse, update weight with fused kernel + sparse.adagrad_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs) + else: + # When the grad is not sparse, the func step is called to update weight and state + self.step([index], [weight], [grad], [state]) diff --git a/python/mxnet/optimizer/adam.py b/python/mxnet/optimizer/adam.py new file mode 100644 index 000000000000..a08c5c73e6fe --- /dev/null +++ b/python/mxnet/optimizer/adam.py @@ -0,0 +1,188 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Adam optimizer.""" +from __future__ import absolute_import +import math +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import adam_update +from .optimizer import Optimizer, register + +__all__ = ['Adam'] + + +@register +class Adam(Optimizer): + """The Adam optimizer. + + This class implements the optimizer described in *Adam: A Method for + Stochastic Optimization*, available at http://arxiv.org/abs/1412.6980. 
+ + If the storage types of grad is ``row_sparse``, and ``lazy_update`` is True, \ + **lazy updates** at step t are applied by:: + + for row in grad.indices: + rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient) + wd * weight[row] + m[row] = beta1 * m[row] + (1 - beta1) * rescaled_grad[row] + v[row] = beta2 * v[row] + (1 - beta2) * (rescaled_grad[row]**2) + lr = learning_rate * sqrt(1 - beta2**t) / (1 - beta1**t) + w[row] = w[row] - lr * m[row] / (sqrt(v[row]) + epsilon) + + The lazy update only updates the mean and var for the weights whose row_sparse + gradient indices appear in the current batch, rather than updating it for all indices. + Compared with the original update, it can provide large improvements in model training + throughput for some applications. However, it provides slightly different semantics than + the original update, and may lead to different empirical results. + + Otherwise, **standard updates** at step t are applied by:: + + rescaled_grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + m = beta1 * m + (1 - beta1) * rescaled_grad + v = beta2 * v + (1 - beta2) * (rescaled_grad**2) + lr = learning_rate * sqrt(1 - beta2**t) / (1 - beta1**t) + w = w - lr * m / (sqrt(v) + epsilon) + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + For details of the update algorithm, see :class:`~mxnet.ndarray.adam_update`. + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + lazy_update : bool, default False + Default is False. If True, lazy updates are applied \ + if the storage types of weight and grad are both ``row_sparse``. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, + lazy_update=False, use_fused_step=True, **kwargs): + super(Adam, self).__init__(use_fused_step=use_fused_step, + learning_rate=learning_rate, + **kwargs) + if not self.use_fused_step: + assert not lazy_update,\ + 'When use_fused_step is set to False, lazy_update has to be turned off.' + self.lazy_update = lazy_update + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.lazy_update = lazy_update + + def create_state(self, index, weight): + stype = weight.stype if self.lazy_update else 'default' + return (zeros(weight.shape, weight.context, dtype=weight.dtype, + stype=stype), # mean + zeros(weight.shape, weight.context, dtype=weight.dtype, + stype=stype)) # variance + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. 
+ grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + coef1 = 1. - self.beta1**t + coef2 = 1. - self.beta2**t + lr *= math.sqrt(coef2) / coef1 + + # update mean and var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. - self.beta1) * grad + var[:] *= self.beta2 + var[:] += (1. - self.beta2) * square(grad) + + # update weight + d = mean / (sqrt(var) + self.epsilon) + weight[:] -= lr * d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + coef1 = 1. - self.beta1**t + coef2 = 1. - self.beta2**t + + lr *= math.sqrt(coef2)/coef1 + + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + mean, var = state + + # update weight with fused kernel + adam_update(weight, grad, mean, var, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/adamax.py b/python/mxnet/optimizer/adamax.py new file mode 100644 index 000000000000..50af82138f43 --- /dev/null +++ b/python/mxnet/optimizer/adamax.py @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Adamax optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, maximum, abs as NDabs) +from .optimizer import Optimizer, register + +__all__ = ['Adamax'] + + +# pylint: enable=line-too-long +@register +class Adamax(Optimizer): + """The AdaMax optimizer. 
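A NumPy-only sketch of the Adam `step` math above, showing the bias correction folded into the learning rate as in the non-fused path; the helper name and toy inputs are illustrative, not part of the patch::

    import numpy as np

    def adam_step(weight, grad, mean, var, t, lr=1e-3, beta1=0.9,
                  beta2=0.999, epsilon=1e-8, wd=0.0, rescale_grad=1.0):
        grad = grad * rescale_grad + wd * weight                # preprocess grad
        lr_t = lr * np.sqrt(1. - beta2**t) / (1. - beta1**t)    # bias correction
        mean[:] = beta1 * mean + (1. - beta1) * grad
        var[:] = beta2 * var + (1. - beta2) * np.square(grad)
        weight -= lr_t * mean / (np.sqrt(var) + epsilon)
        return weight

    w, g = np.ones(3), np.full(3, 0.1)
    m, v = np.zeros(3), np.zeros(3)
    print(adam_step(w, g, m, v, t=1))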
+ + It is a variant of Adam based on the infinity norm + available at http://arxiv.org/abs/1412.6980 Section 7. + + The optimizer updates the weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + m = beta1 * m_t + (1 - beta1) * grad + u = maximum(beta2 * u, abs(grad)) + weight -= lr / (1 - beta1**t) * m / u + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.002 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, + use_fused_step=False, **kwargs): + super(Adamax, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean + zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + lr /= (1. - self.beta1**t) + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mean and var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. 
- self.beta1) * grad + var[:] = maximum(self.beta2 * var, NDabs(grad)) + + # update weight + d = mean / var + weight[:] -= lr * d + diff --git a/python/mxnet/optimizer/contrib.py b/python/mxnet/optimizer/contrib.py index d269aa1bd069..1092db3979ea 100644 --- a/python/mxnet/optimizer/contrib.py +++ b/python/mxnet/optimizer/contrib.py @@ -18,11 +18,8 @@ # pylint: disable=too-many-lines """Contrib optimizers.""" -from ..ndarray import (NDArray, clip, contrib, mean, sqrt, square, zeros) -from .optimizer import Optimizer - -# convenience wrapper for Optimizer.Register -register = Optimizer.register # pylint: disable=invalid-name +from ..ndarray import (clip, contrib, mean, sqrt, square, zeros) +from .optimizer import Optimizer, register __all__ = ['GroupAdaGrad'] @@ -40,8 +37,7 @@ class GroupAdaGrad(Optimizer): grad = clip(grad * rescale_grad, clip_gradient) history += mean(square(grad), axis=1, keepdims=True) - div = grad / sqrt(history + float_stable_eps) - weight -= div * lr + weight -= lr * grad / (sqrt(history) + epsilon) Weights are updated lazily if the gradient is sparse. @@ -53,14 +49,24 @@ class GroupAdaGrad(Optimizer): Parameters ---------- - eps: float, optional - Initial value of the history accumulator. Avoids division by 0. - + learning_rate : float, default 0.01 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False or grad is not sparse, step is called, + otherwise, fused_step is called. """ - def __init__(self, eps=1e-5, **kwargs): - super(GroupAdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps + def __init__(self, learning_rate=0.01, epsilon=1e-6, use_fused_step=True, **kwargs): + super(GroupAdaGrad, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.epsilon = epsilon def create_state(self, index, weight): assert len(weight.shape) == 2 @@ -68,33 +74,83 @@ def create_state(self, index, weight): (weight.shape[0], 1), weight.context, stype=weight.stype) return history - def update(self, index, weight, grad, state): - assert (isinstance(weight, NDArray)) - assert (isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' - - is_sparse = grad.stype == 'row_sparse' - if is_sparse: - kwargs = { - 'epsilon': self.float_stable_eps, - 'rescale_grad': self.rescale_grad - } - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - contrib.group_adagrad_update( - weight, - grad, - state, - out=weight, - lr=lr, - **kwargs) - else: + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. 
+ states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' + + # preprocess grad grad = grad * self.rescale_grad if self.clip_gradient is not None: grad = clip(grad, -self.clip_gradient, self.clip_gradient) - state[:] += mean(square(grad), axis=1, keepdims=True) - div = lr * grad / sqrt(state + self.float_stable_eps) - weight[:] -= div + + # update history + history = state + history[:] += mean(square(grad), axis=1, keepdims=True) + + # update weight + d = grad / (sqrt(history) + self.epsilon) + weight[:] -= lr * d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + is_sparse = grad.stype == 'row_sparse' + + if is_sparse: + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' + + kwargs = {'epsilon': self.epsilon, 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + history = state + + # When grad is sparse, update weight with fused kernel + contrib.group_adagrad_update( + weight, + grad, + history, + out=weight, + lr=lr, + **kwargs) + else: + # When the grad is not sparse, the func step is called to update weight and state + self.step([index], [weight], [grad], [state]) \ No newline at end of file diff --git a/python/mxnet/optimizer/dcasgd.py b/python/mxnet/optimizer/dcasgd.py new file mode 100644 index 000000000000..f9ef2624c982 --- /dev/null +++ b/python/mxnet/optimizer/dcasgd.py @@ -0,0 +1,117 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
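A NumPy-only sketch of the dense GroupAdaGrad `step` above, keeping one accumulator value per output row and no weight decay; the function name and toy shapes are illustrative::

    import numpy as np

    def group_adagrad_step(weight, grad, history, lr=0.01, epsilon=1e-6,
                           rescale_grad=1.0, clip_gradient=None):
        grad = grad * rescale_grad
        if clip_gradient is not None:
            grad = np.clip(grad, -clip_gradient, clip_gradient)
        # one accumulator per row, matching create_state's (shape[0], 1) history
        history += np.mean(np.square(grad), axis=1, keepdims=True)
        weight -= lr * grad / (np.sqrt(history) + epsilon)
        return weight

    w = np.ones((4, 3))
    g = np.random.randn(4, 3)
    h = np.zeros((4, 1))
    group_adagrad_step(w, g, h)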
+ +# pylint: disable=too-many-lines +"""DCASGD optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from .optimizer import Optimizer, register + +__all__ = ['DCASGD'] + + +@register +class DCASGD(Optimizer): + """The DCASGD optimizer. + + This class implements the optimizer described in *Asynchronous Stochastic Gradient Descent + with Delay Compensation for Distributed Deep Learning*, + available at https://arxiv.org/abs/1609.08326. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, optional + The momentum value. + lamda : float, optional + Scale DC value. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.1, momentum=0.0, lamda=0.04, + use_fused_step=False, **kwargs): + super(DCASGD, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.momentum = momentum + self.weight_previous = {} + self.lamda = lamda + + def create_state(self, index, weight): + if self.momentum == 0.0: + return None, weight.copy() # previous weight + else: + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # momentum + weight.copy()) # previous weight + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom, previous_weight + mom, previous_weight = state + + d = square(grad) + d *= weight - previous_weight + d *= self.lamda + d += grad + + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * d + else: + assert (self.momentum == 0.0) + mom = d + mom *= -lr + previous_weight[:] = weight + + # update weight + weight[:] += mom diff --git a/python/mxnet/optimizer/ftml.py b/python/mxnet/optimizer/ftml.py new file mode 100644 index 000000000000..9b5aec3054d4 --- /dev/null +++ b/python/mxnet/optimizer/ftml.py @@ -0,0 +1,160 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""FTML optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import ftml_update +from .optimizer import Optimizer, register + +__all__ = ['FTML'] + + +@register +class FTML(Optimizer): + """The FTML optimizer. + + This class implements the optimizer described in + *FTML - Follow the Moving Leader in Deep Learning*, + available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf. + + Denote time step by t. The optimizer updates the weight by:: + + rescaled_grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + v = beta2 * v + (1 - beta2) * square(rescaled_grad) + d_t = (1 - power(beta1, t)) / lr * (square_root(v / (1 - power(beta2, t))) + epsilon) + z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight + weight = - z / d_t + + For details of the update algorithm, see :class:`~mxnet.ndarray.ftml_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.0025 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.6 + 0 < beta1 < 1. Generally close to 0.5. + beta2 : float, default 0.999 + 0 < beta2 < 1. Generally close to 1. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.0025, beta1=0.6, beta2=0.999, epsilon=1e-8, + use_fused_step=True, **kwargs): + super(FTML, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # d_0 + zeros(weight.shape, weight.context, dtype=weight.dtype), # v_0 + zeros(weight.shape, weight.context, dtype=weight.dtype)) # z_0 + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + coef1 = 1. - self.beta1**t + coef2 = 1. - self.beta2**t + + # update d, v, z + d, v, z = state + + v[:] *= self.beta2 + v[:] += (1. - self.beta2) * square(grad) + sigma = - self.beta1 * d + d[:] = sqrt(v / coef2) + self.epsilon + d[:] *= coef1 / lr + sigma += d + z[:] *= self.beta1 + z[:] += (1. - self.beta1) * grad + z[:] -= sigma * weight + + # update weight + weight[:] = - z / d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'rescale_grad': self.rescale_grad, 't': t} + if self.clip_gradient: + kwargs['clip_grad'] = self.clip_gradient + + d, v, z = state + + # update weight with fused kernel + ftml_update(weight, grad, d, v, z, out=weight, lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/ftrl.py b/python/mxnet/optimizer/ftrl.py new file mode 100644 index 000000000000..b0e484b8f971 --- /dev/null +++ b/python/mxnet/optimizer/ftrl.py @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""FTRL optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square, sign, maximum, abs as NDabs) +from ..ndarray import ftrl_update +from .optimizer import Optimizer, register + +__all__ = ['Ftrl'] + + +#pylint: disable=invalid-name +#pylint: disable=line-too-long +@register +class Ftrl(Optimizer): + """The Ftrl optimizer. + + Referenced from *Ad Click Prediction: a View from the Trenches*, available at + http://dl.acm.org/citation.cfm?id=2488200. + + eta : + .. 
math:: + \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^tg_{s,i}^2}} + + The optimizer updates the weight by:: + + rescaled_grad = clip(grad * rescale_grad, clip_gradient) + z += rescaled_grad - (sqrt(n + rescaled_grad**2) - sqrt(n)) * weight / learning_rate + n += rescaled_grad**2 + w = (sign(z) * lamda1 - z) / ((beta + sqrt(n)) / learning_rate + wd) * (abs(z) > lamda1) + + If the storage types of weight, state and grad are all ``row_sparse``, \ + **sparse updates** are applied by:: + + for row in grad.indices: + rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient) + z[row] += rescaled_grad[row] - (sqrt(n[row] + rescaled_grad[row]**2) - sqrt(n[row])) * weight[row] / learning_rate + n[row] += rescaled_grad[row]**2 + w[row] = (sign(z[row]) * lamda1 - z[row]) / ((beta + sqrt(n[row])) / learning_rate + wd) * (abs(z[row]) > lamda1) + + The sparse update only updates the z and n for the weights whose row_sparse + gradient indices appear in the current batch, rather than updating it for all + indices. Compared with the original update, it can provide large + improvements in model training throughput for some applications. However, it + provides slightly different semantics than the original update, and + may lead to different empirical results. + + For details of the update algorithm, see :class:`~mxnet.ndarray.ftrl_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + lamda1 : float, default 0.01 + L1 regularization coefficient. + beta : float, default 1.0 + Per-coordinate learning rate correlation parameter. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + + def __init__(self, learning_rate=0.1, lamda1=0.01, beta=1., + use_fused_step=True, **kwargs): + super(Ftrl, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.lamda1 = lamda1 + self.beta = beta + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, stype=weight.stype), # z + zeros(weight.shape, weight.context, stype=weight.stype)) # n + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
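A NumPy-only sketch of the dense Ftrl update, written directly from the pseudocode in the class docstring above and algebraically equivalent to the `step` body that follows; the helper name and toy inputs are illustrative::

    import numpy as np

    def ftrl_step(weight, grad, z, n, lr=0.1, lamda1=0.01, beta=1.0, wd=0.0,
                  rescale_grad=1.0, clip_gradient=None):
        grad = grad * rescale_grad
        if clip_gradient is not None:
            grad = np.clip(grad, -clip_gradient, clip_gradient)
        z += grad - (np.sqrt(n + grad ** 2) - np.sqrt(n)) * weight / lr
        n += grad ** 2
        # weights stay exactly zero until |z| exceeds the L1 strength lamda1
        weight[:] = ((np.sign(z) * lamda1 - z)
                     / ((beta + np.sqrt(n)) / lr + wd)
                     * (np.abs(z) > lamda1))
        return weight

    w, g = np.zeros(5), np.random.randn(5)
    z, n = np.zeros(5), np.zeros(5)
    ftrl_step(w, g, z, n)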
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + + # update z, n + z, n = state + + sigma = - sqrt(n) + n[:] += square(grad) + denom = sqrt(n) + sigma += denom + sigma /= lr + z[:] += grad - sigma * weight + + # update weight + denom += self.beta + denom /= lr + denom += wd + d = sign(z) * maximum(NDabs(z) - self.lamda1, 0) + weight[:] = - d / denom + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + kwargs = {'lamda1': self.lamda1, 'beta': self.beta, 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + # update weight with fused kernel + z, n = state + ftrl_update(weight, grad, z, n, out=weight, lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/lamb.py b/python/mxnet/optimizer/lamb.py new file mode 100644 index 000000000000..11b7e18c0bf3 --- /dev/null +++ b/python/mxnet/optimizer/lamb.py @@ -0,0 +1,263 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Lamb optimizer.""" +from __future__ import absolute_import +import numpy +from ..ndarray import (zeros, clip, sqrt, where, square, ones_like, + maximum, minimum) +from ..ndarray import (lamb_update_phase1, lamb_update_phase2, + mp_lamb_update_phase1, mp_lamb_update_phase2) +from ..ndarray.contrib import (multi_lamb_update, multi_mp_lamb_update) +from .optimizer import Optimizer, register + +__all__ = ['LAMB'] + + +@register +class LAMB(Optimizer): + """LAMB Optimizer. + + Referenced from 'Large Batch Optimization for Deep Learning: Training BERT in 76 minutes' + (https://arxiv.org/pdf/1904.00962.pdf) + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. 
If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + lower_bound : float, default None + Lower limit of norm of weight + upper_bound : float, default None + Upper limit of norm of weight + bias_correction : bool, default True + Whether or not to apply bias correction + aggregate_num : int, default 4 + Number of weights to be aggregated in a list. + They are passed to the optimizer for a single optimization step. + In default, all the weights are aggregated. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, + lower_bound=None, upper_bound=None, bias_correction=True, + aggregate_num=4, use_fused_step=True, **kwargs): + assert aggregate_num <= 45,\ + 'When use_fused_step is True, LAMB only supports aggregate_num <= 45,' \ + ' and receives {}'.format(aggregate_num) + super(LAMB, self).__init__(learning_rate=learning_rate, + aggregate_num=aggregate_num, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.lower_bound = lower_bound + self.upper_bound = upper_bound + self.bias_correction = bias_correction + + def create_state(self, index, weight): + stype = weight.stype + return (zeros(weight.shape, weight.context, dtype=numpy.float32, stype=stype), # mean + zeros(weight.shape, weight.context, dtype=numpy.float32, stype=stype)) # var + + def step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + + # update mean, var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. - self.beta1) * grad + var[:] *= self.beta2 + var[:] += (1. - self.beta2) * square(grad) + + r1 = weight.norm() + if self.lower_bound is not None: + r1 = maximum(r1, self.lower_bound) + if self.upper_bound is not None: + r1 = minimum(r1, self.upper_bound) + + if self.bias_correction: + # apply bias correction + coef1 = 1. - self.beta1**t + coef2 = 1. 
- self.beta2**t + mean_hat = mean / coef1 + var_hat = var / coef2 + sqrt(var_hat, out=var_hat) + var_hat += self.epsilon + mean_hat /= var_hat + mean_hat += wd * weight + else: + mean_hat = sqrt(var) + mean_hat += self.epsilon + mean_hat[:] = mean / mean_hat + mean_hat += wd * weight + + g = mean_hat + r2 = g.norm() + + # calculate lamb_trust_ratio + ratio = r1 / r2 + # becomes NaN if ratio == NaN or 0, otherwise 0 + nan_or_zero = 1 - ratio / ratio + r = where(nan_or_zero, ones_like(ratio), ratio) + lr *= r + + # update weight + g *= lr + weight[:] -= g + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + aggregate = self.aggregate_num > 1 + for weight, grad in zip(weights, grads): + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) + + if aggregate: + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'bias_correction': self.bias_correction, + 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if self.lower_bound: + kwargs['lower_bound'] = self.lower_bound + if self.upper_bound: + kwargs['upper_bound'] = self.upper_bound + + step_counts = [] + for index in indices: + step_counts.append(self._index_update_count[index]) + + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + + if not multi_precision: + mean, var = list(zip(*states)) + multi_lamb_update(weights, grads, mean, var, + out=weights, step_count=step_counts, + lrs=lrs, wds=wds, **kwargs) + else: + weights32, mean_var = list(zip(*states)) + mean, var = list(zip(*mean_var)) + multi_mp_lamb_update(weights, grads, + mean, var, weights32, + out=weights, step_count=step_counts, + lrs=lrs, wds=wds, **kwargs) + else: + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'bias_correction': self.bias_correction, + 'rescale_grad': self.rescale_grad, 't': t} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + multi_precision = self.multi_precision and weight.dtype == numpy.float16 + + if multi_precision: + weight32 = state[0] + mean, var = state[1] + g = mp_lamb_update_phase1(weight, grad, mean, var, weight32, wd=wd, **kwargs) + + kwargs = {} + if self.lower_bound: + kwargs['lower_bound'] = self.lower_bound + if self.upper_bound: + kwargs['upper_bound'] = self.upper_bound + r_1 = weight32.norm() + r_2 = g.norm() + mp_lamb_update_phase2(weight, g, r_1, r_2, weight32, lr=lr, out=weight, **kwargs) + else: + mean, var = state + g = lamb_update_phase1(weight, grad, mean, var, wd=wd, **kwargs) + + kwargs = {} + if self.lower_bound: + kwargs['lower_bound'] = self.lower_bound + if 
self.upper_bound: + kwargs['upper_bound'] = self.upper_bound + r_1 = weight.norm() + r_2 = g.norm() + lamb_update_phase2(weight, g, r_1, r_2, lr=lr, out=weight, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override update_multi_precision. + """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(LAMB, self).update_multi_precision(indices, weights, grads, states) + diff --git a/python/mxnet/optimizer/lars.py b/python/mxnet/optimizer/lars.py new file mode 100644 index 000000000000..1cd746c6dd32 --- /dev/null +++ b/python/mxnet/optimizer/lars.py @@ -0,0 +1,282 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""LARS optimizer.""" +from __future__ import absolute_import +import os +import numpy +from ..ndarray import (zeros, clip, sqrt, array, + multi_sum_sq, multi_lars, norm as NDnorm, + where, ones_like) +from ..ndarray import (sgd_update, sgd_mom_update, + mp_sgd_update, mp_sgd_mom_update, + preloaded_multi_sgd_update, preloaded_multi_sgd_mom_update, + preloaded_multi_mp_sgd_update, preloaded_multi_mp_sgd_mom_update) +from .optimizer import Optimizer, register +from .utils import _flatten_list + +__all__ = ['LARS'] + + +@register +class LARS(Optimizer): + """the LARS optimizer from 'Large Batch Training of Convolution Networks' \ + (https://arxiv.org/abs/1708.03888) + + Behave mostly like SGD with momentum and weight decay but is scaling \ + adaptively the learning for each layer: + w_norm = L2norm(weights) + g_norm = L2norm(gradients) + if w_norm > 0 and g_norm > 0: + lr_layer = lr * w_norm / (g_norm + weight_decay * w_norm + epsilon) + else: + lr_layer = lr + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, default 0. + The momentum value. + eta : float, default 0.001 + LARS coefficient used to scale the learning rate. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + lazy_update : bool, default False + Default is False. If True, lazy updates are applied \ + if the storage types of weight and grad are both ``row_sparse``. + aggregate_num : int, default 1 + Number of weights to be aggregated in a list. + They are passed to the optimizer for a single optimization step. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. 
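As a reading aid for the hunk above: a minimal NumPy sketch of the layer-wise scaling that LARS applies. The helper name and plain-array setting are illustrative only; in the patch this logic lives in `_get_lars` below, which takes the gradient norm after rescaling.

    import numpy as np

    def lars_scale(weight, grad, wd, eta=0.001, eps=1e-8):
        # Trust ratio from 'Large Batch Training of Convolution Networks':
        # eta * ||w|| / (||g|| + wd * ||w|| + eps); fall back to 1.0 when either
        # norm is zero, mirroring the where/ones_like guard in the patch.
        w_norm = np.linalg.norm(weight)
        g_norm = np.linalg.norm(grad)
        if w_norm == 0.0 or g_norm == 0.0:
            return 1.0
        return eta * w_norm / (g_norm + wd * w_norm + eps)

    # Effective per-layer learning rate, as used in LARS.step:
    lr_layer = 0.1 * lars_scale(np.ones(8), np.full(8, 0.01), wd=1e-4)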
+ """ + def __init__(self, learning_rate=0.1, momentum=0.0, eta=0.001, + epsilon=1e-8, lazy_update=False, use_fused_step=True, + aggregate_num=1, **kwargs): + super(LARS, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + aggregate_num=aggregate_num, + **kwargs) + if not self.use_fused_step: + assert not lazy_update,\ + 'When use_fused_step is set to False, lazy_update has to be turned off.' + if lazy_update: + assert not self.multi_precision, \ + 'When lazy_update is set to True, multi_precision has be turned off.' + self.lazy_update = lazy_update + self.momentum = momentum + self.eta = eta + self.epsilon = epsilon + self.lazy_update = lazy_update + + def create_state(self, index, weight): + momentum = None + if self.momentum != 0.0: + stype = weight.stype if self.lazy_update else 'default' + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) + return momentum + + def _l2norm(self, v, rescale=False): + """L2 Norm implementation""" + v = v.astype('float32') + if rescale: + v *= self.rescale_grad + norm = NDnorm(v) + return norm + + def _get_lars(self, index, weight, grad, wd): + """Returns a scaling factor for the learning rate for this layer""" + lars = 1.0 + name = self.idx2name[index] if index in self.idx2name else str(index) + if name.endswith('gamma') or name.endswith('beta') or name.endswith('bias'): + return lars + + w_norm = self._l2norm(weight) + g_norm = self._l2norm(grad, rescale=True) + + # calculate lars_trust_ratio + ratio = w_norm / g_norm + # becomes NaN if ratio == NaN or 0, otherwise 0 + nan_or_zero = 1 - ratio / ratio + lars = self.eta * w_norm / (g_norm + wd * w_norm + self.epsilon) + lars = where(nan_or_zero, ones_like(lars), lars) + + return lars.asscalar() + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # compute lars + # clip grad + wd * weight is performed after computing lars + lars = self._get_lars(index, weight, grad, wd) + lr *= lars + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * grad + else: + mom = -lr * grad + + # update weight + weight[:] += mom + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. 
+ grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + aggregate = self.aggregate_num > 1 + for weight, grad in zip(weights, grads): + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient is not None: + kwargs['clip_gradient'] = self.clip_gradient + + if aggregate: + nb_params = len(indices) + names = [self.idx2name[i] if i in self.idx2name else str(i) for i in indices] + lars_idx = [i for i in range(nb_params) if + not(names[i].endswith('gamma') or names[i].endswith('beta') or + names[i].endswith('bias'))] + nb_lars = len(lars_idx) + no_lars_idx = [i for i in range(nb_params) if + (names[i].endswith('gamma') or names[i].endswith('beta') or + names[i].endswith('bias'))] + cur_ctx = weights[0].context + full_idx = lars_idx + no_lars_idx + new_lrs = array([lrs[i] for i in full_idx], ctx=cur_ctx, dtype='float32') + new_wds = array([wds[i] for i in full_idx], ctx=cur_ctx, dtype='float32') + new_weights = [weights[i] for i in full_idx] + new_grads = [grads[i] for i in full_idx] + new_states = [states[i] for i in full_idx] + if nb_lars > 0: + w_sum_sq = multi_sum_sq(*new_weights[:nb_lars], num_arrays=nb_lars) + g_sum_sq = multi_sum_sq(*new_grads[:nb_lars], num_arrays=nb_lars) + multi_lars(new_lrs[:nb_lars], w_sum_sq, g_sum_sq, new_wds[:nb_lars], + eta=self.eta, eps=self.epsilon, rescale_grad=self.rescale_grad, + out=new_lrs[:nb_lars]) + # Same than usual using preloaded sgd functions + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + if not multi_precision: + if self.momentum > 0: + preloaded_multi_sgd_mom_update( + *(_flatten_list(zip(new_weights, new_grads, new_states)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + preloaded_multi_sgd_update( + *(_flatten_list(zip(new_weights, new_grads)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + states = list(zip(*states)) + weights32, moms = states + if self.momentum > 0: + preloaded_multi_mp_sgd_mom_update( + *(_flatten_list(zip(new_weights, new_grads, moms, weights32)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + preloaded_multi_mp_sgd_update( + *(_flatten_list(zip(new_weights, new_grads, weights32)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + for i, (index, weight, grad, state) in enumerate(zip(indices, weights, grads, states)): + wd = wds[i] + lr = lrs[i] + lr *= self._get_lars(index, weight, grad, wd) + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + if not multi_precision: + mom = state + if state is not None: + sgd_mom_update(weight, grad, mom, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) + else: + weight32, mom = state + if mom is not None: + mp_sgd_mom_update(weight, grad, mom, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override 
update_multi_precision. + """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(LARS, self).update_multi_precision(indices, weights, grads, states) diff --git a/python/mxnet/optimizer/nadam.py b/python/mxnet/optimizer/nadam.py new file mode 100644 index 000000000000..483a44a8cc46 --- /dev/null +++ b/python/mxnet/optimizer/nadam.py @@ -0,0 +1,125 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Nadam optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from .optimizer import Optimizer, register + +__all__ = ['Nadam'] + + +@register +class Nadam(Optimizer): + """The Nesterov Adam optimizer. + + Much like Adam is essentially RMSprop with momentum, + Nadam is Adam RMSprop with Nesterov momentum available + at http://cs229.stanford.edu/proj2015/054_report.pdf. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + schedule_decay : float, default 0.004 + Exponential decay rate for the momentum schedule + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, + schedule_decay=0.004, use_fused_step=False, **kwargs): + super(Nadam, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.schedule_decay = schedule_decay + self.m_schedule = 1. + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean + zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. 
+ weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + coef2 = 1. - self.beta2**t + + # warming momentum schedule + momentum_t = self.beta1 * (1. - 0.5 * (pow(0.96, t * self.schedule_decay))) + momentum_t_1 = self.beta1 * (1. - 0.5 * (pow(0.96, (t + 1) * self.schedule_decay))) + self.m_schedule = self.m_schedule * momentum_t + m_schedule_next = self.m_schedule * momentum_t_1 + + # update mean and var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. - self.beta1) * grad + var[:] *= self.beta2 + var[:] += (1. - self.beta2) * square(grad) + + grad_prime = grad / (1. - self.m_schedule) + mean_prime = mean / (1. - m_schedule_next) + var_prime = var / coef2 + mean_bar = momentum_t_1 * mean_prime + (1. - momentum_t) * grad_prime + + # update weight + d = mean_bar / (sqrt(var_prime) + self.epsilon) + weight[:] -= lr * d + diff --git a/python/mxnet/optimizer/nag.py b/python/mxnet/optimizer/nag.py new file mode 100644 index 000000000000..463f7949a9e9 --- /dev/null +++ b/python/mxnet/optimizer/nag.py @@ -0,0 +1,166 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""NAG optimizer.""" +from __future__ import absolute_import +import numpy +from ..ndarray import (zeros, clip) +from ..ndarray import (sgd_update, mp_sgd_update, nag_mom_update, mp_nag_mom_update) +from .optimizer import Optimizer, register + +__all__ = ['NAG'] + + +@register +class NAG(Optimizer): + """Nesterov accelerated gradient. + + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + state = momentum * state + lr * grad + weight = weight - (momentum * state + lr * grad) + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, default 0.9 + The momentum value. + multi_precision: bool, default False + Flag to control the internal precision of the optimizer. 
+ False: results in using the same precision as the weights (default), + True: makes internal 32-bit copy of the weights and applies gradients + in 32-bit precision even if actual weights used in the model have lower precision. + Turning this on can improve convergence and accuracy when training with float16. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.1, momentum=0.9, multi_precision=False, + use_fused_step=True, **kwargs): + super(NAG, self).__init__(learning_rate=learning_rate, + multi_precision=multi_precision, + use_fused_step=use_fused_step, + **kwargs) + self.momentum = momentum + + def create_state(self, index, weight): + momentum = None + if self.momentum != 0.0: + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype) + return momentum + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * grad + d = self.momentum * mom - lr * grad + else: + d = -lr * grad + + # update weight + weight[:] += d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
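For quick reference while reading this hunk, a plain NumPy sketch of what `NAG.step` above computes (array and function names are illustrative; the fused path below uses the `nag_mom_update` kernel instead).

    import numpy as np

    def nag_step(weight, grad, mom, lr, momentum=0.9, wd=0.0, rescale=1.0):
        # Same order of operations as NAG.step: rescale and weight-decay the
        # gradient, refresh the momentum buffer, then take a Nesterov
        # look-ahead step.
        grad = grad * rescale + wd * weight
        mom[:] = momentum * mom - lr * grad
        weight[:] += momentum * mom - lr * grad

    w, g, m = np.ones(4), np.full(4, 0.1), np.zeros(4)
    nag_step(w, g, m, lr=0.1)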
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + multi_precision = self.multi_precision and weight.dtype == numpy.float16 + + if not multi_precision: + mom = state + if mom is not None: + nag_mom_update(weight, grad, mom, out=weight, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) + else: + weight32, mom = state + if mom is not None: + mp_nag_mom_update(weight, grad, mom, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override update_multi_precision. + """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(NAG, self).update_multi_precision(indices, weights, grads, states) diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index 09e881ebfca6..b5e8c2468304 100755 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -17,38 +17,17 @@ # under the License. # pylint: disable=too-many-lines -"""Weight updating functions.""" +"""Base Optimizer class.""" from __future__ import absolute_import -import logging -import math -import pickle import warnings -import os import numpy -from ..base import py_str -from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply, - multi_sum_sq, multi_lars, norm as NDnorm) -from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, - mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update, nag_mom_update, mp_nag_mom_update, - multi_sgd_update, multi_sgd_mom_update, multi_mp_sgd_update, - multi_mp_sgd_mom_update, preloaded_multi_sgd_update, - preloaded_multi_sgd_mom_update, preloaded_multi_mp_sgd_update, - preloaded_multi_mp_sgd_mom_update, lamb_update_phase1, lamb_update_phase2, - mp_lamb_update_phase1, mp_lamb_update_phase2) -from ..ndarray.contrib import (multi_lamb_update, multi_mp_lamb_update) -from ..ndarray import sparse -from ..random import normal +from ..ndarray import (NDArray, zeros, cast) from ..util import is_np_array __all__ = [ - 'AdaDelta', 'AdaGrad', 'Adam', 'Adamax', 'DCASGD', 'FTML', 'Ftrl', 'LARS', 'LBSGD', - 'NAG', 'NDabs', 'Nadam', 'Optimizer', 'RMSProp', 'SGD', 'SGLD', 'Signum', 'LAMB', - 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' + 'Optimizer', 'Test', 'create', 'register' ] -def _flatten_list(nested_list): - return [item for sublist in nested_list for item in sublist] class Optimizer(object): """The base class inherited by all optimizers. @@ -95,6 +74,17 @@ class Optimizer(object): Dictionary of parameter index to gluon.Parameter, used to lookup parameter attributes such as lr_mult, wd_mult, etc. param_dict shall not be deep copied. + aggregate_num : int, optional, default None + Number of weights to be aggregated in a list. + They are passed to the optimizer for a single optimization step. + In default, only one weight is aggregated. + When `aggregate_num` is set to numpy.inf, all the weights are aggregated. 
+ + use_fused_step : bool, optional, default None + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + Properties ---------- learning_rate : float @@ -104,7 +94,9 @@ class Optimizer(object): def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., clip_gradient=None, learning_rate=None, lr_scheduler=None, sym=None, begin_num_update=0, - multi_precision=False, param_dict=None): + multi_precision=False, param_dict=None, aggregate_num=None, + use_fused_step=None, **kwargs): + super(Optimizer, self).__init__(**kwargs) self.rescale_grad = rescale_grad self.lr_scheduler = lr_scheduler if self.lr_scheduler is None and learning_rate is None: @@ -125,7 +117,11 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self._index_update_count = self._all_index_update_counts[0] self.clip_gradient = clip_gradient self.multi_precision = multi_precision - self.aggregate_num = 0 + + if aggregate_num is None: + self.aggregate_num = 1 + else: + self.aggregate_num = aggregate_num if param_idx2name is None: param_idx2name = {} @@ -135,6 +131,8 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self.sym_info = (sym.attr_dict(), sym.list_arguments()) if sym is not None else () self.param_dict = param_dict if param_dict else {} self.allow_np_array = is_np_array() + self.use_fused_step = use_fused_step \ + if use_fused_step is not None else False self.set_lr_mult({}) self.set_wd_mult({}) @@ -250,7 +248,6 @@ def create_state_multi_precision(self, index, weight): state : any obj The state associated with the weight. """ - weight_master_copy = None if self.multi_precision and weight.dtype == numpy.float16: weight_master_copy = weight.astype(numpy.float32) return (weight_master_copy,) + (self.create_state(index, weight_master_copy),) @@ -261,50 +258,101 @@ def create_state_multi_precision(self, index, weight): "optimizer") return self.create_state(index, weight) - def update(self, index, weight, grad, state): - """Updates the given parameter using the corresponding gradient and state. + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. Parameters ---------- - index : int - The unique index of the parameter into the individual learning - rates and weight decays. Learning rates and weight decay - may be set via `set_lr_mult()` and `set_wd_mult()`, respectively. - weight : NDArray - The parameter to be updated. - grad : NDArray - The gradient of the objective with respect to this parameter. - state : any obj - The state returned by `create_state()`. + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. """ - raise NotImplementedError() + raise NotImplementedError - def update_multi_precision(self, index, weight, grad, state): - """Updates the given parameter using the corresponding gradient and state. - Mixed precision version. + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + New operators that fuses optimizer's update should be put in this function. 
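To illustrate the contract defined here: under the refactor a subclass only needs to provide `create_state` and `step` (and optionally `fused_step`), and the base class dispatches between them in `update` below. A minimal hypothetical subclass, not part of the patch:

    from mxnet import ndarray as nd
    from mxnet.optimizer import Optimizer, register

    @register
    class ToySGD(Optimizer):
        """Bare-bones SGD against the new list-based API (illustration only)."""
        def __init__(self, learning_rate=0.01, use_fused_step=False, **kwargs):
            super(ToySGD, self).__init__(learning_rate=learning_rate,
                                         use_fused_step=use_fused_step, **kwargs)

        def create_state(self, index, weight):
            return None  # stateless

        def step(self, indices, weights, grads, states):
            for index, weight, grad, _ in zip(indices, weights, grads, states):
                self._update_count(index)
                lr = self._get_lr(index)
                wd = self._get_wd(index)
                grad = grad * self.rescale_grad
                if self.clip_gradient is not None:
                    grad = nd.clip(grad, -self.clip_gradient, self.clip_gradient)
                weight[:] -= lr * (grad + wd * weight)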
Parameters ---------- - index : int - The unique index of the parameter into the individual learning - rates and weight decays. Learning rates and weight decay - may be set via `set_lr_mult()` and `set_wd_mult()`, respectively. - weight : NDArray - The parameter to be updated. - grad : NDArray - The gradient of the objective with respect to this parameter. - state : any obj - The state returned by `create_state()`. + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. """ - if self.multi_precision and weight.dtype == numpy.float16: - # Wrapper for mixed precision - weight_master_copy = state[0] - original_state = state[1] - grad32 = grad.astype(numpy.float32) - self.update(index, weight_master_copy, grad32, original_state) - cast(weight_master_copy, dtype=weight.dtype, out=weight) + raise NotImplementedError + + def update(self, indices, weights, grads, states): + """Call step to perform a single optimization update if use_fused_step is False, + otherwise fused_step is called. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for weight, grad in zip(weights, grads): + assert(isinstance(weight, NDArray)) + assert(isinstance(grad, NDArray)) + if not self.use_fused_step: + self.step(indices, weights, grads, states) else: - self.update(index, weight, grad, state) + self.fused_step(indices, weights, grads, states) + + def update_multi_precision(self, indices, weights, grads, states): + """Call step to perform a single optimization update if use_fused_step is False, + otherwise fused_step is called. Mixed precision version. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
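A small usage sketch of the list-based multi-precision path whose implementation follows; values and shapes are illustrative, and NAG is used only because its hunk appears earlier in this patch.

    import numpy as np
    import mxnet as mx

    opt = mx.optimizer.NAG(learning_rate=0.1, momentum=0.9,
                           multi_precision=True, use_fused_step=False)
    indices = [0, 1]
    weights = [mx.nd.ones((2, 2), dtype=np.float16), mx.nd.ones((4,), dtype=np.float16)]
    grads = [w * 0.01 for w in weights]
    # With multi_precision and float16 weights, each state is a tuple of
    # (float32 master copy, optimizer state).
    states = [opt.create_state_multi_precision(i, w) for i, w in zip(indices, weights)]
    opt.update_multi_precision(indices, weights, grads, states)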
+ """ + weights_master_copy = [] + original_states = [] + grads32 = [] + for index, weight, grad, state in zip(indices, weights, grads, states): + if self.multi_precision and weight.dtype == numpy.float16: + weights_master_copy.append(state[0]) + original_states.append(state[1]) + grads32.append(grad.astype(numpy.float32)) + else: + weights_master_copy.append(weight) + original_states.append(state) + grads32.append(grad) + self.update(indices, weights_master_copy, grads32, original_states) + for weight_master_copy, weight in zip(weights_master_copy, weights): + if self.multi_precision and weight.dtype == numpy.float16: + cast(weight_master_copy, dtype=weight.dtype, out=weight) def set_learning_rate(self, lr): """Sets a new learning rate of the optimizer. @@ -323,10 +371,6 @@ def set_learning_rate(self, lr): else: self.lr = lr - def set_lr_scale(self, args_lrscale): # pylint: disable=unused-argument - """[DEPRECATED] Sets lr scale. Use set_lr_mult instead.""" - raise DeprecationWarning - def set_lr_mult(self, args_lr_mult): """Sets an individual learning rate multiplier for each parameter. @@ -363,11 +407,6 @@ def set_lr_mult(self, args_lr_mult): def set_wd_mult(self, args_wd_mult): """Sets an individual weight decay multiplier for each parameter. - By default, if `param_idx2name` was provided in the - constructor, the weight decay multipler is set as 0 for all - parameters whose name don't end with ``_weight`` or - ``_gamma``. - .. note:: The default weight decay multiplier for a `Variable` can be set with its `wd_mult` argument in the constructor. @@ -387,9 +426,6 @@ def set_wd_mult(self, args_wd_mult): compatibility, and we recommend to use the name instead. """ self.wd_mult = {} - for n in self.idx2name.values(): - if not (n.endswith('_weight') or n.endswith('_gamma')): - self.wd_mult[n] = 0.0 if self.sym_info: attr, arg_names = self.sym_info for name in arg_names: @@ -519,1514 +555,10 @@ def __setstate__(self, state): # param_dict needs to be explicitly set by the trainer self.param_dict = {} + # convenience wrapper for Optimizer.Register register = Optimizer.register # pylint: disable=invalid-name -# pylint: disable=line-too-long -@register -class SGD(Optimizer): - """The SGD optimizer with momentum and weight decay. - - If the storage types of grad is ``row_sparse`` and ``lazy_update`` is True, \ - **lazy updates** are applied by:: - - for row in grad.indices: - rescaled_grad[row] = lr * (rescale_grad * clip(grad[row], clip_gradient) + wd * weight[row]) - state[row] = momentum[row] * state[row] + rescaled_grad[row] - weight[row] = weight[row] - state[row] - - The sparse update only updates the momentum for the weights whose row_sparse - gradient indices appear in the current batch, rather than updating it for all - indices. Compared with the original update, it can provide large - improvements in model training throughput for some applications. However, it - provides slightly different semantics than the original update, and - may lead to different empirical results. - - In the case when ``update_on_kvstore`` is set to False (either globally via - MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in - :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update - of parameters, which may lead to improved performance. The aggregation size - is controlled by MXNET_OPTIMIZER_AGGREGATION_SIZE environment variable and - defaults to 4. 
- - Otherwise, **standard updates** are applied by:: - - rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight) - state = momentum * state + rescaled_grad - weight = weight - state - - For details of the update algorithm see - :class:`~mxnet.ndarray.sgd_update` and :class:`~mxnet.ndarray.sgd_mom_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - lazy_update : bool, optional - Default is True. If True, lazy updates are applied \ - if the storage types of weight and grad are both ``row_sparse``. - multi_precision: bool, optional - Flag to control the internal precision of the optimizer. - False: results in using the same precision as the weights (default), - True: makes internal 32-bit copy of the weights and applies gradients - in 32-bit precision even if actual weights used in the model have lower precision. - Turning this on can improve convergence and accuracy when training with float16. - """ - def __init__(self, momentum=0.0, lazy_update=True, **kwargs): - super(SGD, self).__init__(**kwargs) - self.momentum = momentum - self.lazy_update = lazy_update - self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) - - def create_state_multi_precision(self, index, weight): - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = weight.astype(numpy.float32) - return (self.create_state(index, weight_master_copy), weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. " - "Consider using multi_precision=True option of the " - "SGD optimizer") - return self.create_state(index, weight) - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - stype = weight.stype if self.lazy_update else 'default' - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) - return momentum - - def _update_impl(self, indices, weights, grads, states, multi_precision=False): - aggregate = True - if not isinstance(indices, (tuple, list)): - indices = [indices] - weights = [weights] - grads = [grads] - states = [states] - for weight, grad in zip(weights, grads): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - aggregate = (aggregate and - weight.stype == 'default' and - grad.stype == 'default') - self._update_count(indices) - lrs = self._get_lrs(indices) - wds = self._get_wds(indices) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if aggregate: - if not multi_precision: - if self.momentum > 0: - multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) - else: - multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) - else: - if self.momentum > 0: - multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, *zip(*states))), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) - else: - multi_mp_sgd_update(*_flatten_list(zip(weights, grads, - list(zip(*states))[1])), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) - else: - for weight, grad, 
state, lr, wd in zip(weights, grads, states, lrs, wds): - if not multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - -@register -class Signum(Optimizer): - r"""The Signum optimizer that takes the sign of gradient or momentum. - - The optimizer updates the weight by:: - - rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight - state = momentum * state + (1-momentum)*rescaled_grad - weight = (1 - lr * wd_lh) * weight - lr * sign(state) - - References - ---------- - Jeremy Bernstein, Yu-Xiang Wang, Kamyar Azizzadenesheli & Anima Anandkumar. (2018). - signSGD: Compressed Optimisation for Non-Convex Problems. In ICML'18. - - See: https://arxiv.org/abs/1802.04434 - - For details of the update algorithm see - :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - wd_lh : float, optional - The amount of decoupled weight decay regularization, see details in the original paper at:\ - https://arxiv.org/abs/1711.05101 - """ - def __init__(self, learning_rate=0.01, momentum=0.9, wd_lh=0.0, **kwargs): - super(Signum, self).__init__(learning_rate=learning_rate, **kwargs) - self.momentum = momentum - self.wd_lh = wd_lh - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) - return momentum - - def _update_impl(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - if self.wd_lh: - kwargs['wd_lh'] = self.wd_lh - - if state is not None: - signum_update(weight, grad, state, out=weight, - lr=lr, wd=wd, **kwargs) - else: - signsgd_update(weight, grad, out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state) - -@register -class FTML(Optimizer): - """The FTML optimizer. - - This class implements the optimizer described in - *FTML - Follow the Moving Leader in Deep Learning*, - available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf. - - Denote time step by t. 
The optimizer updates the weight by:: - - rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - v = beta2 * v + (1 - beta2) * square(rescaled_grad) - d_t = (1 - power(beta1, t)) / lr * square_root(v / (1 - power(beta2, t))) + epsilon) - z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight - weight = - z / d_t - - For details of the update algorithm, see :class:`~mxnet.ndarray.ftml_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - 0 < beta1 < 1. Generally close to 0.5. - beta2 : float, optional - 0 < beta2 < 1. Generally close to 1. - epsilon : float, optional - Small value to avoid division by 0. - """ - def __init__(self, beta1=0.6, beta2=0.999, epsilon=1e-8, **kwargs): - super(FTML, self).__init__(**kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # d_0 - zeros(weight.shape, weight.context, dtype=weight.dtype), # v_0 - zeros(weight.shape, weight.context, dtype=weight.dtype)) # z_0 - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - - kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, - 'rescale_grad': self.rescale_grad, 't': t} - if self.clip_gradient: - kwargs['clip_grad'] = self.clip_gradient - - prev_d, prev_v, prev_z = state - ftml_update(weight, grad, prev_d, prev_v, prev_z, out=weight, - lr=lr, wd=wd, **kwargs) - -@register -class LARS(Optimizer): - """the LARS optimizer from 'Large Batch Training of Convolution Networks' \ - (https://arxiv.org/abs/1708.03888) - - Behave mostly like SGD with momentum and weight decay but is scaling \ - adaptively the learning for each layer (except bias and batch norm parameters): - w_norm = L2norm(weights) - g_norm = L2norm(gradients) - if w_norm > 0 and g_norm > 0: - lr_layer = lr * lr_mult * eta * w_norm / (g_norm + weight_decay * w_norm + eps) - else: - lr_layer = lr * lr_mult - - Parameters - ---------- - momentum : float, optional - The momentum value. - lazy_update : bool, optional - Default is True. If True, lazy updates are applied \ - if the storage types of weight and grad are both ``row_sparse``. - lars_eta : float, optional - LARS coefficient used to scale the learning rate. Default set to 0.001. - lars_epsilon : float, optional - Optional epsilon in case of very small gradients. Default set to 0. - momentum_correction : bool, optional - If True scale momentum w.r.t global learning rate change (with an lr_scheduler) \ - as indicated in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour` \ - (https://arxiv.org/pdf/1706.02677.pdf) - Default set to True. - """ - def __init__(self, momentum=0.0, lazy_update=True, eta=0.001, eps=0, - momentum_correction=True, **kwargs): - super(LARS, self).__init__(**kwargs) - self.momentum = momentum - self.momentum_correction = momentum_correction - self.lazy_update = lazy_update - self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) - self.eta = eta - self.eps = eps - self.skip = 0 - self.last_lr = None - self.cur_lr = None - - - def _get_lrs(self, indices): - """Gets the learning rates given the indices of the weights. 
- - Parameters - ---------- - indices : list of int - Indices corresponding to weights. - - Returns - ------- - lrs : list of float - Learning rates for those indices. - """ - if self.cur_lr is not None: - self.last_lr = self.cur_lr - - if self.lr_scheduler is not None: - lr = self.lr_scheduler(self.num_update) - else: - lr = self.lr - - if self.cur_lr is None: - self.last_lr = lr - self.cur_lr = lr - - lrs = [lr for _ in indices] - for i, index in enumerate(indices): - if index in self.param_dict: - lrs[i] *= self.param_dict[index].lr_mult - elif index in self.lr_mult: - lrs[i] *= self.lr_mult[index] - elif index in self.idx2name: - lrs[i] *= self.lr_mult.get(self.idx2name[index], 1.0) - return lrs - - def set_wd_mult(self, args_wd_mult): - self.wd_mult = {} - for n in self.idx2name.values(): - is_weight = n.endswith('_weight') - - if not is_weight: - self.wd_mult[n] = 0.0 - - if self.sym_info: - attr, arg_names = self.sym_info - for name in arg_names: - if name in attr and '__wd_mult__' in attr[name]: - self.wd_mult[name] = float(attr[name]['__wd_mult__']) - self.wd_mult.update(args_wd_mult) - - def create_state_multi_precision(self, index, weight): - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = weight.astype(numpy.float32) - return (self.create_state(index, weight_master_copy), weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. " - "Consider using multi_precision=True option of the " - "SGD optimizer") - return self.create_state(index, weight) - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - stype = weight.stype if self.lazy_update else 'default' - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) - return momentum - - def _l2norm(self, v, rescale=False): - """L2 Norm implementation""" - v = v.astype('float32') - if rescale: - v *= self.rescale_grad - norm = NDnorm(v).asnumpy()[0] - return norm - - def _get_lars(self, i, weight, g, lr, wd): - """Returns a scaling factor for the learning rate for this layer""" - name = self.idx2name[i] if i in self.idx2name else str(i) - if name.endswith('gamma') or name.endswith('beta') or name.endswith('bias'): - return lr - - w_norm = self._l2norm(weight) - g_norm = self._l2norm(g, rescale=True) - - if w_norm > 0.0 and g_norm > 0.0: - lars = self.eta * w_norm/(g_norm + wd * w_norm + self.eps) - else: - lars = 1.0 - return lars * lr - - def _update_impl(self, indices, weights, grads, states, multi_precision=False): - aggregate = True - if not isinstance(indices, (tuple, list)): - indices = [indices] - weights = [weights] - grads = [grads] - states = [states] - for weight, grad in zip(weights, grads): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - aggregate = (aggregate and - weight.stype == 'default' and - grad.stype == 'default') - self._update_count(indices) - lrs = self._get_lrs(indices) - wds = self._get_wds(indices) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = (self.momentum * (self.cur_lr / self.last_lr)) \ - if (self.momentum_correction and self.last_lr != 0) else \ - self.momentum - - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if aggregate: - nb_params = len(indices) - names = [self.idx2name[i] if i in self.idx2name else str(i) for i in indices] - lars_idx = [i 
for i in range(nb_params) if - not(names[i].endswith('gamma') or names[i].endswith('beta') or - names[i].endswith('bias'))] - nb_lars = len(lars_idx) - no_lars_idx = [i for i in range(nb_params) if - (names[i].endswith('gamma') or names[i].endswith('beta') or - names[i].endswith('bias'))] - cur_ctx = weights[0].context - full_idx = lars_idx + no_lars_idx - new_lrs = array([lrs[i] for i in full_idx], ctx=cur_ctx, dtype='float32') - new_wds = array([wds[i] for i in full_idx], ctx=cur_ctx, dtype='float32') - new_weights = [weights[i] for i in full_idx] - new_grads = [grads[i] for i in full_idx] - new_states = [states[i] for i in full_idx] - if nb_lars > 0: - w_sum_sq = multi_sum_sq(*new_weights[:nb_lars], num_arrays=nb_lars) - g_sum_sq = multi_sum_sq(*new_grads[:nb_lars], num_arrays=nb_lars) - multi_lars(new_lrs[:nb_lars], w_sum_sq, g_sum_sq, new_wds[:nb_lars], - eta=self.eta, eps=self.eps, rescale_grad=self.rescale_grad, - out=new_lrs[:nb_lars]) - # Same than usual using preloaded sgd functions - sidx = 0 - while sidx < len(indices): - eidx = sidx + len(new_weights[sidx:sidx+self.aggregate_num]) - if not multi_precision: - if self.momentum > 0: - preloaded_multi_sgd_mom_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx], - new_states[sidx:eidx])) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - else: - preloaded_multi_sgd_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx])) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - else: - if self.momentum > 0: - preloaded_multi_mp_sgd_mom_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx], - *zip(*new_states[sidx:eidx]))) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - else: - preloaded_multi_mp_sgd_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx], - list(zip(*new_states[sidx:eidx]))[1])) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - sidx += self.aggregate_num - else: - lrs = [self._get_lars(i, w, g, lr, wd) for (i, w, g, lr, wd) in - zip(indices, weights, grads, lrs, wds)] - - for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): - if not multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - -# -@register -class LBSGD(Optimizer): - """The Large Batch SGD optimizer with momentum and weight decay. 
- - The optimizer updates the weight by:: - - state = momentum * state + lr * rescale_grad * clip(grad, clip_gradient) + wd * weight - weight = weight - state - - For details of the update algorithm see :class:`~mxnet.ndarray.sgd_update` - and :class:`~mxnet.ndarray.sgd_mom_update`. - In addition to the SGD updates the LBSGD optimizer uses the LARS, Layer-wise - Adaptive Rate Scaling, algorithm to have a separate learning rate for each - layer of the network, which leads to better stability over large batch sizes. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - multi_precision: bool, optional - Flag to control the internal precision of the optimizer. - False: results in using the same precision as the weights (default), - True: makes internal 32-bit copy of the weights and applies gradients - in 32-bit precision even if actual weights used in the model have lower precision. - Turning this on can improve convergence and accuracy when training with float16. - - warmup_strategy: string ('linear', 'power2', 'sqrt'. , 'lars' default : 'linear') - warmup_epochs: unsigned, default: 5 - batch_scale: unsigned, default: 1 (same as batch size * numworkers) - updates_per_epoch: updates_per_epoch (default: 32, Default might not reflect true number batches per epoch. Used for warmup.) - begin_epoch: unsigned, default 0, starting epoch. - """ - def __init__(self, momentum=0.0, multi_precision=False, warmup_strategy='linear', - warmup_epochs=5, batch_scale=1, updates_per_epoch=32, begin_epoch=0, num_epochs=60, - **kwargs): - super(LBSGD, self).__init__(**kwargs) - logging.info('Running Large-Batch SGD Algorithm') - logging.info('(Batch_scale=%f, warmup_epochs=%d, warmup_strategy=%s, updates_per_epoch=%d)', - batch_scale, warmup_epochs, warmup_strategy, updates_per_epoch) - self.momentum = momentum - self.multi_precision = multi_precision - # new user parameters for large batch - self.warmup_strategy = warmup_strategy - self.warmup_epochs = warmup_epochs - self.batch_scale = batch_scale - self.updates_per_epoch = updates_per_epoch - self.init_updates = begin_epoch * updates_per_epoch - self.num_epochs = num_epochs - # addl internal usage parameters and storage - self.lbmult = 1 - self.cumgrads = {} - # for adaptive lr - self.adaptive = False - self.admult = 1 # adaptation constant - - def create_state(self, index, weight): - momentum = None - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = array(weight, ctx=weight.context, dtype=numpy.float32) - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=numpy.float32, - stype=weight.stype) - return (momentum, weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. 
" - "Consider using multi_precision=True option of the " - "SGD optimizer") - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) - return momentum - - def _get_lbmult(self, nup): - """Returns lr scaling factor for large batch according to warmup schedule - (to be implemented) - """ - nwup = self.warmup_epochs * self.updates_per_epoch - strategy = self.warmup_strategy - maxmult = float(self.batch_scale) - if nup >= nwup: - mult = maxmult - elif nwup <= 1: - mult = 1.0 - else: - if (strategy == 'linear'): - mult = 1.0 + (maxmult - 1) * nup / nwup - elif (strategy == 'power2'): - mult = 1.0 + (maxmult-1) * (nup*nup)/(nwup*nwup) - elif (strategy == 'sqrt'): - mult = 1.0 + (maxmult - 1) * math.sqrt(float(nup) / nwup) - else: - mult = 1.0 - return mult - - def _get_lars(self, weight, g, wd): - """Returns a scaling factor for the learning rate for this layer - default is 1 - """ - weight2 = self._l2norm(weight) - grad2 = self._l2norm(g) - lars = math.sqrt(weight2 / (grad2 + wd * weight2 + 1e-18)) - if lars < 0.01: - lars = 0.01 - elif lars > 100: - lars = 100 - return lars - - def _l2norm(self, v): - "inner product implementation" - norm = multiply(v, v).asnumpy().sum() - return norm - - def _reset_cum_gradient(self, index): - "called every macro-batch to reset cumulated gradients to 0 for a given index" - self.cumgrads[index]['cum_grad'] = 0 - - def _get_cum_gradient(self, index): - "get the cumulated gradient for index" - if index in self.cumgrads: - return self.cumgrads[index] - else: - return {} - - def _put_cum_gradient(self, index, cgrad): - "store cumulated gradient for index" - self.cumgrads[index] = cgrad - - def _cumulate_gradient(self, grad, index): - "Cumulate gradients for large-batch emulation. 
Cumulated by index (layer)" - cgrad = self._get_cum_gradient(index) - if cgrad: - num_cums = cgrad['num_cums'] - if num_cums > 0: - cum_grad = cgrad['cum_grad'] + grad - num_cums += 1 - else: - cum_grad = grad - num_cums = self.init_updates + 1 - else: - cum_grad = grad - num_cums = self.init_updates + 1 - cgrad = {'cum_grad': cum_grad, 'num_cums': num_cums} - self._put_cum_gradient(index, cgrad) - return cgrad - - def update(self, index, weight, grad, state): - assert (isinstance(weight, NDArray)) - assert (isinstance(grad, NDArray)) - - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - - # new stuff for large batch - cgrad = self._cumulate_gradient(grad, index) - if (cgrad['num_cums'] % self.batch_scale) == 0: - grad = cgrad['cum_grad'] / self.batch_scale - if self.warmup_strategy == 'lars': - lbmult = self._get_lars(weight, grad, wd) - else: - lbmult = self._get_lbmult(cgrad['num_cums']) - lr = lr * lbmult - # do the regular sgd update flow - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - use_multi_precision = isinstance(state, (list, tuple)) - - if not use_multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, lr=lr, wd=wd, - **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, lr=lr, wd=wd, **kwargs) - # reset update count and cumulated gradient per large batch - self._reset_cum_gradient(index) - else: - lr = 0.0 - kwargs = {} - sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) - - -@register -class LAMB(Optimizer): - """LAMB Optimizer. 
- """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, - lower_bound=None, upper_bound=None, bias_correction=True, **kwargs): - super(LAMB, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.lower_bound = lower_bound - self.upper_bound = upper_bound - self.bias_correction = bias_correction - self.aggregate_num = max(1, min(45, int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "45")))) - - def create_state(self, index, weight): - stype = weight.stype - dtype = weight.dtype - return (zeros(weight.shape, weight.context, dtype=dtype, stype=stype), - zeros(weight.shape, weight.context, dtype=dtype, stype=stype)) - - def _update_impl(self, index, weight, grad, state, multi_precision=False): - kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, - 'bias_correction': self.bias_correction, - 'rescale_grad': self.rescale_grad} - - if self.aggregate_num <= 1 or not isinstance(index, (tuple, list)): - if isinstance(index, (tuple, list)): - assert(len(index) == self.aggregate_num) - index, weight, grad, state = index[0], weight[0], grad[0], state[0] - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - weight_ptr = weight - grad_ptr = grad - if multi_precision: - mean, var = state[1] - weight32 = state[0] - else: - mean, var = state - kwargs['t'] = t - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if multi_precision: - g = mp_lamb_update_phase1(weight_ptr, grad_ptr, mean, var, weight32, wd=wd, **kwargs) - kwargs = {} - if self.lower_bound: - kwargs['lower_bound'] = self.lower_bound - if self.upper_bound: - kwargs['upper_bound'] = self.upper_bound - r_1 = weight32.norm() - r_2 = g.norm() - mp_lamb_update_phase2(weight_ptr, g, r_1, r_2, weight32, lr=lr, out=weight_ptr, **kwargs) - else: - g = lamb_update_phase1(weight_ptr, grad_ptr, mean, var, wd=wd, **kwargs) - kwargs = {} - if self.lower_bound: - kwargs['lower_bound'] = self.lower_bound - if self.upper_bound: - kwargs['upper_bound'] = self.upper_bound - r_1 = weight_ptr.norm() - r_2 = g.norm() - lamb_update_phase2(weight_ptr, g, r_1, r_2, lr=lr, out=weight_ptr, **kwargs) - else: - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - if self.lower_bound: - kwargs['lower_bound'] = self.lower_bound - if self.upper_bound: - kwargs['upper_bound'] = self.upper_bound - - step_count, lrs, wds = [], [], [] - for i, w_i, g_i in zip(index, weight, grad): - assert(isinstance(w_i, NDArray)) - assert(isinstance(g_i, NDArray)) - self._update_count(i) - step_count.append(self._index_update_count[i]) - lrs.append(self._get_lr(i)) - wds.append(self._get_wd(i)) - - updated_tensors = 0 - while updated_tensors < len(weight): - sidx = updated_tensors - eidx = min(updated_tensors + self.aggregate_num, len(weight)) - if not multi_precision: - mean, var = list(zip(*state[sidx:eidx])) - multi_lamb_update(weight[sidx:eidx], - grad[sidx:eidx], - mean, var, - out=weight[sidx:eidx], - step_count=step_count[sidx:eidx], - lrs=lrs[sidx:eidx], - wds=wds[sidx:eidx], - **kwargs) - else: - mean_var = list(zip(*state[sidx:eidx]))[1] - temp = list(zip(*mean_var)) - mean = temp[0] - var = temp[1] - multi_mp_lamb_update(weight[sidx:eidx], - grad[sidx:eidx], - mean, var, - list(zip(*state[sidx:eidx]))[0], - out=weight[sidx:eidx], - step_count=step_count[sidx:eidx], - 
lrs=lrs[sidx:eidx], - wds=wds[sidx:eidx], - **kwargs) - updated_tensors += self.aggregate_num - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - -# pylint: enable=line-too-long -@register -class DCASGD(Optimizer): - """The DCASGD optimizer. - - This class implements the optimizer described in *Asynchronous Stochastic Gradient Descent - with Delay Compensation for Distributed Deep Learning*, - available at https://arxiv.org/abs/1609.08326. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - - lamda : float, optional - Scale DC value. - """ - def __init__(self, momentum=0.0, lamda=0.04, **kwargs): - super(DCASGD, self).__init__(**kwargs) - self.momentum = momentum - self.weight_previous = {} - self.lamda = lamda - - def create_state(self, index, weight): - if self.momentum == 0.0: - return (None, - weight.copy()) # previous weight - else: - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # momentum - weight.copy()) # previous weight - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - - mom, previous_weight = state - if mom: - mom[:] *= self.momentum - mom[:] += -lr * (grad + wd * weight + self.lamda \ - * grad * grad * (weight - previous_weight)) - else: - assert(self.momentum == 0.0) - mom = -lr * (grad + wd * weight + self.lamda \ - * grad * grad * (weight - previous_weight)) - previous_weight[:] = weight - weight[:] += mom - -@register -class NAG(Optimizer): - """Nesterov accelerated gradient. - - This optimizer updates each weight by:: - - state = momentum * state + grad + wd * weight - weight = weight - (lr * (grad + momentum * state)) - - Parameters - ---------- - momentum : float, optional - The momentum value. - multi_precision: bool, optional - Flag to control the internal precision of the optimizer. - False: results in using the same precision as the weights (default), - True: makes internal 32-bit copy of the weights and applies gradients - in 32-bit precision even if actual weights used in the model have lower precision. - Turning this on can improve convergence and accuracy when training with float16. - """ - def __init__(self, momentum=0.0, **kwargs): - super(NAG, self).__init__(**kwargs) - self.momentum = momentum - - def create_state_multi_precision(self, index, weight): - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = weight.astype(numpy.float32) - return (self.create_state(index, weight_master_copy), weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. 
" - "Consider using multi_precision=True option of the " - "NAG optimizer") - return self.create_state(index, weight) - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype) - return momentum - - def _update_impl(self, index, weight, grad, state, multi_precision=False): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if not multi_precision: - if state is not None: - nag_mom_update(weight, grad, state, out=weight, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_nag_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 \ - and isinstance(state, (tuple, list)) - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - - -@register -class SGLD(Optimizer): - """Stochastic Gradient Riemannian Langevin Dynamics. - - This class implements the optimizer described in the paper *Stochastic Gradient - Riemannian Langevin Dynamics on the Probability Simplex*, available at - https://papers.nips.cc/paper/4883-stochastic-gradient-riemannian-langevin-dynamics-on-the-probability-simplex.pdf. - - """ - def __init__(self, **kwargs): - super(SGLD, self).__init__(**kwargs) - - def create_state(self, index, weight): - return None - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - weight[:] += - lr/2 * (grad + wd * weight) - weight[:] += normal(0, math.sqrt(lr), shape=weight.shape, - dtype=weight.dtype, ctx=weight.context) - - - -@register # pylint: disable=invalid-name -class ccSGD(SGD): - """[DEPRECATED] Same as `SGD`. Left here for backward compatibility.""" - def __init__(self, *args, **kwargs): - super(ccSGD, self).__init__(*args, **kwargs) - -@register -class Adam(Optimizer): - """The Adam optimizer. - - This class implements the optimizer described in *Adam: A Method for - Stochastic Optimization*, available at http://arxiv.org/abs/1412.6980. 
- - If the storage types of grad is ``row_sparse``, and ``lazy_update`` is True, \ - **lazy updates** at step t are applied by:: - - for row in grad.indices: - rescaled_grad[row] = clip(grad[row] * rescale_grad + wd * weight[row], clip_gradient) - m[row] = beta1 * m[row] + (1 - beta1) * rescaled_grad[row] - v[row] = beta2 * v[row] + (1 - beta2) * (rescaled_grad[row]**2) - lr = learning_rate * sqrt(1 - beta1**t) / (1 - beta2**t) - w[row] = w[row] - lr * m[row] / (sqrt(v[row]) + epsilon) - - The lazy update only updates the mean and var for the weights whose row_sparse - gradient indices appear in the current batch, rather than updating it for all indices. - Compared with the original update, it can provide large improvements in model training - throughput for some applications. However, it provides slightly different semantics than - the original update, and may lead to different empirical results. - - Otherwise, **standard updates** at step t are applied by:: - - rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - m = beta1 * m + (1 - beta1) * rescaled_grad - v = beta2 * v + (1 - beta2) * (rescaled_grad**2) - lr = learning_rate * sqrt(1 - beta1**t) / (1 - beta2**t) - w = w - lr * m / (sqrt(v) + epsilon) - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - For details of the update algorithm, see :class:`~mxnet.ndarray.adam_update`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. - epsilon : float, optional - Small value to avoid division by 0. - lazy_update : bool, optional - Default is True. If True, lazy updates are applied \ - if the storage types of weight and grad are both ``row_sparse``. - """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - lazy_update=True, **kwargs): - super(Adam, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.lazy_update = lazy_update - - def create_state(self, index, weight): - stype = weight.stype if self.lazy_update else 'default' - return (zeros(weight.shape, weight.context, dtype=weight.dtype, - stype=stype), # mean - zeros(weight.shape, weight.context, dtype=weight.dtype, - stype=stype)) # variance - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - coef1 = 1. - self.beta1**t - coef2 = 1. - self.beta2**t - lr *= math.sqrt(coef2)/coef1 - - kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, - 'rescale_grad': self.rescale_grad} - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - mean, var = state - adam_update(weight, grad, mean, var, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - -@register -class AdaGrad(Optimizer): - """AdaGrad optimizer. - - This class implements the AdaGrad optimizer described in *Adaptive Subgradient - Methods for Online Learning and Stochastic Optimization*, and available at - http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. 
- - This optimizer updates each weight by:: - - grad = clip(grad * rescale_grad, clip_gradient) - history += square(grad) - div = grad / sqrt(history + float_stable_eps) - weight += (div + weight * wd) * -lr - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - See Also - ---------- - :meth:`mxnet.ndarray.sparse.adagrad_update`. - - Parameters - ---------- - eps: float, optional - Initial value of the history accumulator. Avoids division by 0. - - """ - def __init__(self, eps=1e-7, **kwargs): - super(AdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps - - def create_state(self, index, weight): - return zeros(weight.shape, weight.context, stype=weight.stype) # history - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - is_sparse = grad.stype == 'row_sparse' - history = state - - if is_sparse: - kwargs = {'epsilon': self.float_stable_eps, - 'rescale_grad': self.rescale_grad} - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - sparse.adagrad_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs) - else: - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += square(grad) - div = grad / sqrt(history + self.float_stable_eps) - weight[:] += (div + weight * wd) * -lr - -@register -class RMSProp(Optimizer): - """The RMSProp optimizer. - - Two versions of RMSProp are implemented: - - If ``centered=False``, we follow - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by - Tieleman & Hinton, 2012. - For details of the update algorithm see :class:`~mxnet.ndarray.rmsprop_update`. - - If ``centered=True``, we follow http://arxiv.org/pdf/1308.0850v5.pdf (38)-(45) - by Alex Graves, 2013. - For details of the update algorithm see :class:`~mxnet.ndarray.rmspropalex_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - gamma1: float, optional - A decay factor of moving average over past squared gradient. - gamma2: float, optional - A "momentum" factor. Only used if `centered`=``True``. - epsilon : float, optional - Small value to avoid division by 0. - centered : bool, optional - Flag to control which version of RMSProp to use.:: - - True: will use Graves's version of `RMSProp`, - False: will use Tieleman & Hinton's version of `RMSProp`. - - clip_weights : float, optional - Clips weights into range ``[-clip_weights, clip_weights]``. 
- """ - def __init__(self, learning_rate=0.001, gamma1=0.9, gamma2=0.9, - epsilon=1e-8, centered=False, clip_weights=None, **kwargs): - super(RMSProp, self).__init__(learning_rate=learning_rate, **kwargs) - self.gamma1 = gamma1 - self.gamma2 = gamma2 - self.centered = centered - self.epsilon = epsilon - self.clip_weights = clip_weights - - def create_state(self, index, weight): - if self.centered: - return ( - zeros(weight.shape, weight.context, stype=weight.stype), # n - zeros(weight.shape, weight.context, stype=weight.stype), # g - zeros(weight.shape, weight.context, stype=weight.stype)) # delta - else: - return (zeros(weight.shape, weight.context, stype=weight.stype),) # n - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - kwargs = {'gamma1': self.gamma1, 'epsilon': self.epsilon, - 'rescale_grad': self.rescale_grad} - if self.centered: - kwargs['gamma2'] = self.gamma2 - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - if self.clip_weights: - kwargs['clip_weights'] = self.clip_weights - - if not self.centered: - (n, ) = state - rmsprop_update( - weight, grad, n, out=weight, lr=lr, wd=wd, **kwargs) - else: - n, g, delta = state - rmspropalex_update(weight, grad, n, g, delta, out=weight, - lr=lr, wd=wd, **kwargs) - -@register -class AdaDelta(Optimizer): - """The AdaDelta optimizer. - - This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive - learning rate method*, available at https://arxiv.org/abs/1212.5701. - - This optimizer updates each weight by:: - - grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - acc_grad = rho * acc_grad + (1. - rho) * grad * grad - delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad - acc_delta = rho * acc_delta + (1. - rho) * delta * delta - weight -= (delta + wd * weight) - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - rho: float - Decay rate for both squared gradients and delta. - epsilon : float - Small value to avoid division by 0. - """ - def __init__(self, rho=0.90, epsilon=1e-5, **kwargs): - super(AdaDelta, self).__init__(**kwargs) - self.rho = rho - self.epsilon = epsilon - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context), # accumulated g - zeros(weight.shape, weight.context)) # accumulated delta - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - wd = self._get_wd(index) - self._update_count(index) - - # preprocess grad - grad *= self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, - self.clip_gradient, self.clip_gradient) - - # accumulated g and delta initlization - acc_g, acc_delta = state - - # update g, delta - acc_g[:] *= self.rho - acc_g[:] += (1. - self.rho) * grad * grad - current_delta = sqrt(acc_delta + self.epsilon) / sqrt(acc_g + self.epsilon) * grad - acc_delta[:] *= self.rho - acc_delta[:] += (1. - self.rho) * current_delta * current_delta - - # update weight - weight[:] -= current_delta + wd * weight - -#pylint: disable=invalid-name -#pylint: disable=line-too-long -@register -class Ftrl(Optimizer): - """The Ftrl optimizer. - - Referenced from *Ad Click Prediction: a View from the Trenches*, available at - http://dl.acm.org/citation.cfm?id=2488200. - - eta : - .. 
math:: - \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^tg_{s,i}^2}} - - The optimizer updates the weight by:: - - rescaled_grad = clip(grad * rescale_grad, clip_gradient) - z += rescaled_grad - (sqrt(n + rescaled_grad**2) - sqrt(n)) * weight / learning_rate - n += rescaled_grad**2 - w = (sign(z) * lamda1 - z) / ((beta + sqrt(n)) / learning_rate + wd) * (abs(z) > lamda1) - - If the storage types of weight, state and grad are all ``row_sparse``, \ - **sparse updates** are applied by:: - - for row in grad.indices: - rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient) - z[row] += rescaled_grad[row] - (sqrt(n[row] + rescaled_grad[row]**2) - sqrt(n[row])) * weight[row] / learning_rate - n[row] += rescaled_grad[row]**2 - w[row] = (sign(z[row]) * lamda1 - z[row]) / ((beta + sqrt(n[row])) / learning_rate + wd) * (abs(z[row]) > lamda1) - - The sparse update only updates the z and n for the weights whose row_sparse - gradient indices appear in the current batch, rather than updating it for all - indices. Compared with the original update, it can provide large - improvements in model training throughput for some applications. However, it - provides slightly different semantics than the original update, and - may lead to different empirical results. - - For details of the update algorithm, see :class:`~mxnet.ndarray.ftrl_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - lamda1 : float, optional - L1 regularization coefficient. - learning_rate : float, optional - The initial learning rate. - beta : float, optional - Per-coordinate learning rate correlation parameter. - """ - - def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, **kwargs): - super(Ftrl, self).__init__(**kwargs) - self.lamda1 = lamda1 - self.beta = beta - self.lr = learning_rate - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, stype=weight.stype), # z - zeros(weight.shape, weight.context, stype=weight.stype)) # n - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - wd = self._get_wd(index) - lr = self._get_lr(index) - - kwargs = {'lamda1': self.lamda1, 'beta': self.beta, 'rescale_grad': self.rescale_grad} - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - # accumulated g and delta initialization - z, n = state - ftrl_update(weight, grad, z, n, out=weight, - lr=lr, wd=wd, **kwargs) - -# pylint: enable=line-too-long -@register -class Adamax(Optimizer): - """The AdaMax optimizer. - - It is a variant of Adam based on the infinity norm - available at http://arxiv.org/abs/1412.6980 Section 7. - - The optimizer updates the weight by:: - - grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - m = beta1 * m_t + (1 - beta1) * grad - u = maximum(beta2 * u, abs(grad)) - weight -= lr / (1 - beta1**t) * m / u - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. 
- """ - def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, **kwargs): - super(Adamax, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean - zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - lr /= (1. - self.beta1**t) - - # preprocess grad - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - - # update m_t and u_t - m_t, u_t = state - m_t[:] *= self.beta1 - m_t[:] += (1. - self.beta1) * grad - u_t[:] = maximum(self.beta2 * u_t, NDabs(grad)) - - # update weight - weight[:] -= lr * m_t / u_t - -@register -class Nadam(Optimizer): - """The Nesterov Adam optimizer. - - Much like Adam is essentially RMSprop with momentum, - Nadam is Adam RMSprop with Nesterov momentum available - at http://cs229.stanford.edu/proj2015/054_report.pdf. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. - epsilon : float, optional - Small value to avoid division by 0. - schedule_decay : float, optional - Exponential decay rate for the momentum schedule - """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - schedule_decay=0.004, **kwargs): - super(Nadam, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.schedule_decay = schedule_decay - self.m_schedule = 1. - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean - zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - - # preprocess grad - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - - # warming momentum schedule - momentum_t = self.beta1 * (1. - 0.5 * (pow(0.96, t * self.schedule_decay))) - momentum_t_1 = self.beta1 * (1. - 0.5 * (pow(0.96, (t + 1) * self.schedule_decay))) - self.m_schedule = self.m_schedule * momentum_t - m_schedule_next = self.m_schedule * momentum_t_1 - - # update m_t and v_t - m_t, v_t = state - m_t[:] *= self.beta1 - m_t[:] += (1. - self.beta1) * grad - v_t[:] *= self.beta2 - v_t[:] += (1. - self.beta2) * grad * grad - - grad_prime = grad / (1. - self.m_schedule) - m_t_prime = m_t / (1. - m_schedule_next) - v_t_prime = v_t / (1. - pow(self.beta2, t)) - m_t_bar = (1. 
- momentum_t) * grad_prime + momentum_t_1 * m_t_prime - - # update weight - weight[:] -= lr * m_t_bar / (sqrt(v_t_prime) + self.epsilon) @register class Test(Optimizer): @@ -2038,139 +570,16 @@ def create_state(self, index, weight): """Creates a state to duplicate weight.""" return zeros(weight.shape, weight.context) - def update(self, index, weight, grad, state): + def step(self, indices, weights, grads, states): """Performs w += rescale_grad * grad.""" - weight[:] += grad * self.rescale_grad - state[:] = weight - -# backward compatibility wrapper for Optimizer.CreateOptimizer -create = Optimizer.create_optimizer # pylint: disable=invalid-name - - -def _as_classic(a, allow_np): - # TODO(junwu): This is a temp solution for allowing converting - # np.ndarray to mx.nd.NDArray to be fed into the optimizer since - # users may have custom optimizers implemented using mx.nd.NDArray ops. - from ..numpy import ndarray as np_ndarray - if isinstance(a, (tuple, list)): - if any(isinstance(x, np_ndarray) for x in a): - if allow_np: - return [x.as_nd_ndarray() for x in a] - else: - raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') - else: - if isinstance(a, np_ndarray): - if allow_np: - return a.as_nd_ndarray() - else: - raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') - return a - - - -class Updater(object): - """Updater for kvstore.""" - def __init__(self, optimizer): - self.optimizer = optimizer - self.states = {} - self.states_synced = {} - self.aggregate_updates = optimizer.aggregate_num > 0 - - def __call__(self, index, grad, weight): - """Updates weight given gradient and index.""" - allow_np = self.optimizer.allow_np_array if hasattr(self.optimizer, "allow_np_array") else is_np_array() - if not isinstance(index, (list, tuple)): - indices = [index] - grads = [_as_classic(grad, allow_np)] - weights = [_as_classic(weight, allow_np)] - else: - indices = index - grads = _as_classic(grad, allow_np) - weights = _as_classic(weight, allow_np) - if weights: - self.optimizer._set_current_context(weights[0].context.device_id) - for i, idx in enumerate(indices): - # convert ctypes.char_p.value back to python str if needed - if isinstance(idx, bytes): - indices[i] = py_str(idx) - idx = indices[i] - if idx not in self.states: - self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) - self.states_synced[idx] = True - elif not self.states_synced[idx]: - self.states[idx] = \ - self.sync_state_context(self.states[idx], weights[i].context) - self.states_synced[idx] = True - if self.aggregate_updates: - # segregate values based on type - type_map = {} - for i, w, g in zip(indices, weights, grads): - if w.dtype in type_map: - type_map[w.dtype].append((i, w, g)) - else: - type_map[w.dtype] = [(i, w, g)] - for idx in type_map: - current_index = 0 - indices, weights, grads = zip(*type_map[idx]) - while current_index < len(indices): - states = [] - step = min(self.optimizer.aggregate_num, len(indices) - current_index) - for j in range(step): - states.append(self.states[indices[current_index + j]]) - self.optimizer.update_multi_precision( - indices[current_index:current_index + self.optimizer.aggregate_num], - weights[current_index:current_index + self.optimizer.aggregate_num], - grads[current_index:current_index + self.optimizer.aggregate_num], - states) - current_index += self.optimizer.aggregate_num - else: - for i, w, g in zip(indices, weights, grads): - self.optimizer.update_multi_precision(i, w, g, self.states[i]) - - def 
sync_state_context(self, state, context): - """sync state context.""" - if isinstance(state, NDArray): - return state.as_in_context(context) - elif isinstance(state, (tuple, list)): - synced_state = (self.sync_state_context(i, context) for i in state) - if isinstance(state, tuple): - return tuple(synced_state) - else: - return list(synced_state) - else: - return state - - def set_states(self, states): - """Sets updater states.""" - states = pickle.loads(states) - if isinstance(states, tuple) and len(states) == 2: - self.states, self.optimizer = states - else: - self.states = states - self.states_synced = dict.fromkeys(self.states.keys(), False) - - def get_states(self, dump_optimizer=False): - """Gets updater states. - - Parameters - ---------- - dump_optimizer : bool, default False - Whether to also save the optimizer itself. This would also save optimizer - information such as learning rate and weight decay schedules. - """ - return pickle.dumps((self.states, self.optimizer) if dump_optimizer else self.states) + for index, weight, grad in zip(indices, weights, grads): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + grad = self.rescale_grad * grad + weight[:] -= lr * (grad + wd * weight) -def get_updater(optimizer): - """Returns a closure of the updater needed for kvstore. - Parameters - ---------- - optimizer: Optimizer - The optimizer. +create = Optimizer.create_optimizer # pylint: disable=invalid-name - Returns - ------- - updater: function - The closure of the updater. - """ - return Updater(optimizer) diff --git a/python/mxnet/optimizer/rmsprop.py b/python/mxnet/optimizer/rmsprop.py new file mode 100644 index 000000000000..b57c82130b4e --- /dev/null +++ b/python/mxnet/optimizer/rmsprop.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""RMSProp optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import (rmsprop_update, rmspropalex_update) +from .optimizer import Optimizer, register + +__all__ = ['RMSProp'] + + +@register +class RMSProp(Optimizer): + """The RMSProp optimizer. + + Two versions of RMSProp are implemented: + + If ``centered=False``, we follow + http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by + Tieleman & Hinton, 2012. + For details of the update algorithm see :class:`~mxnet.ndarray.rmsprop_update`. + + If ``centered=True``, we follow http://arxiv.org/pdf/1308.0850v5.pdf (38)-(45) + by Alex Graves, 2013. + For details of the update algorithm see :class:`~mxnet.ndarray.rmspropalex_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. 
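+
+    As a reference sketch (matching the ``step`` implementation below, after the
+    gradient has been rescaled, clipped and weight-decayed), the non-centered
+    update is::
+
+        var = rho * var + (1 - rho) * grad**2
+        weight = weight - lr * grad / (sqrt(var) + epsilon)
+
+    and the centered update is::
+
+        mean = rho * mean + (1 - rho) * grad
+        var = rho * var + (1 - rho) * grad**2
+        mom = momentum * mom - lr * grad / sqrt(var - mean**2 + epsilon)
+        weight = weight + mom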
+ + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + rho: float, default 0.9 + A decay factor of moving average over past squared gradient. + momentum: float, default 0.9 + Heavy ball momentum factor. Only used if `centered`=``True``. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + centered : bool, default False + Flag to control which version of RMSProp to use.:: + + True: will use Graves's version of `RMSProp`, + False: will use Tieleman & Hinton's version of `RMSProp`. + + clip_weights : float, optional + Clips weights into range ``[-clip_weights, clip_weights]``. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, rho=0.9, momentum=0.9, + epsilon=1e-8, centered=False, clip_weights=None, + use_fused_step=True, **kwargs): + super(RMSProp, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.rho = rho + self.momentum = momentum + self.centered = centered + self.epsilon = epsilon + self.clip_weights = clip_weights + + def create_state(self, index, weight): + if self.centered: + return ( + zeros(weight.shape, weight.context, stype=weight.stype), # mean + zeros(weight.shape, weight.context, stype=weight.stype), # var + zeros(weight.shape, weight.context, stype=weight.stype)) # mom + else: + return zeros(weight.shape, weight.context, stype=weight.stype) # var + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + if not self.centered: + # update var + var = state + var[:] *= self.rho + var[:] += (1 - self.rho) * square(grad) + + # update weight + d = grad / (sqrt(var) + self.epsilon) + weight[:] -= lr * d + else: + # update mean, var, mom + mean, var, mom = state + mean[:] *= self.rho + mean[:] += (1 - self.rho) * grad + var[:] *= self.rho + var[:] += (1 - self.rho) * square(grad) + mom[:] *= self.momentum + mom[:] -= lr * grad / sqrt(var - square(mean) + self.epsilon) + + # update weight + weight[:] += mom + + if self.clip_weights: + clip(weight, -self.clip_weights, self.clip_weights, out=weight) + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. 
+ + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + kwargs = {'rho': self.rho, 'epsilon': self.epsilon, + 'rescale_grad': self.rescale_grad} + if self.centered: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if self.clip_weights: + kwargs['clip_weights'] = self.clip_weights + + # update weight with fused kernel + if not self.centered: + var = state + rmsprop_update(weight, grad, var, out=weight, lr=lr, wd=wd, **kwargs) + else: + mean, var, mom = state + rmspropalex_update(weight, grad, mean, var, mom, out=weight, + lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/sgd.py b/python/mxnet/optimizer/sgd.py new file mode 100644 index 000000000000..3e0f74928182 --- /dev/null +++ b/python/mxnet/optimizer/sgd.py @@ -0,0 +1,247 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""SGD optimizer""" +from __future__ import absolute_import +import os +import numpy +from ..ndarray import (zeros, clip) +from ..ndarray import (sgd_update, sgd_mom_update, + mp_sgd_update, mp_sgd_mom_update, + multi_sgd_update, multi_sgd_mom_update, + multi_mp_sgd_update, multi_mp_sgd_mom_update) +from .optimizer import Optimizer, register +from .utils import _flatten_list + +__all__ = ['SGD'] + + +@register +class SGD(Optimizer): + """The SGD optimizer with momentum and weight decay. + + If the storage types of grad is ``row_sparse`` and ``lazy_update`` is True, \ + **lazy updates** are applied by:: + + for row in grad.indices: + rescaled_grad[row] = clip(rescale_grad * grad[row] + wd * weight[row], clip_gradient) + state[row] = momentum[row] * state[row] + lr * rescaled_grad[row] + weight[row] = weight[row] - state[row] + + The sparse update only updates the momentum for the weights whose row_sparse + gradient indices appear in the current batch, rather than updating it for all + indices. Compared with the original update, it can provide large + improvements in model training throughput for some applications. However, it + provides slightly different semantics than the original update, and + may lead to different empirical results. 
+
+    In the case when ``update_on_kvstore`` is set to False (either globally via
+    MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in
+    :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update
+    of parameters, which may lead to improved performance. The aggregation size
+    is controlled by ``aggregate_num`` and defaults to 1.
+
+    Otherwise, **standard updates** are applied by::
+
+        rescaled_grad = clip(rescale_grad * grad, clip_gradient) + wd * weight
+        state = momentum * state + lr * rescaled_grad
+        weight = weight - state
+
+    For details of the update algorithm see
+    :class:`~mxnet.ndarray.sgd_update` and :class:`~mxnet.ndarray.sgd_mom_update`.
+
+    This optimizer accepts the following parameters in addition to those accepted
+    by :class:`.Optimizer`.
+
+    Parameters
+    ----------
+    learning_rate : float, default 0.1
+        The initial learning rate. If None, the optimization will use the
+        learning rate from ``lr_scheduler``. If not None, it will overwrite
+        the learning rate in ``lr_scheduler``. If None and ``lr_scheduler``
+        is also None, then it will be set to 0.01 by default.
+    momentum : float, default 0.
+        The momentum value.
+    lazy_update : bool, default False
+        If True, lazy updates are applied \
+        if the storage types of weight and grad are both ``row_sparse``.
+    multi_precision: bool, default False
+        Flag to control the internal precision of the optimizer.
+        False: results in using the same precision as the weights (default),
+        True: makes internal 32-bit copy of the weights and applies gradients
+        in 32-bit precision even if actual weights used in the model have lower precision.
+        Turning this on can improve convergence and accuracy when training with float16.
+    aggregate_num : int, default 1
+        Number of weights to be aggregated in a list.
+        They are passed to the optimizer for a single optimization step.
+    use_fused_step : bool, default True
+        Whether or not to use fused kernels for optimizer.
+        When use_fused_step=False, step is called,
+        otherwise, fused_step is called.
+    """
+    def __init__(self, learning_rate=0.1, momentum=0.0, lazy_update=False,
+                 multi_precision=False, use_fused_step=True, aggregate_num=1, **kwargs):
+        super(SGD, self).__init__(learning_rate=learning_rate,
+                                  multi_precision=multi_precision,
+                                  aggregate_num=aggregate_num,
+                                  use_fused_step=use_fused_step,
+                                  **kwargs)
+        if not self.use_fused_step:
+            assert not lazy_update, \
+                'When use_fused_step is set to False, lazy_update has to be turned off.'
+        if lazy_update:
+            assert not multi_precision, \
+                'When lazy_update is set to True, multi_precision has to be turned off.'
+        self.momentum = momentum
+        self.lazy_update = lazy_update
+
+    def create_state(self, index, weight):
+        momentum = None
+        if self.momentum != 0.0:
+            stype = weight.stype if self.lazy_update else 'default'
+            momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype)
+        return momentum
+
+    def step(self, indices, weights, grads, states):
+        """Perform an optimization step using gradients and states.
+
+        Parameters
+        ----------
+        indices : list of int
+            List of unique indices of the parameters into the individual learning rates
+            and weight decays. Learning rates and weight decay may be set via `set_lr_mult()`
+            and `set_wd_mult()`, respectively.
+        weights : list of NDArray
+            List of parameters to be updated.
+        grads : list of NDArray
+            List of gradients of the objective with respect to this parameter.
+        states : List of any obj
+            List of state returned by `create_state()`.
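+
+        Example (an illustrative sketch; assumes ``import mxnet as mx``)::
+
+            opt = mx.optimizer.SGD(learning_rate=0.1, momentum=0.9, use_fused_step=False)
+            weight = mx.nd.ones((2,))
+            grad = mx.nd.ones((2,)) * 0.1
+            state = opt.create_state(0, weight)
+            opt.step([0], [weight], [grad], [state])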
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * grad + else: + mom = -lr * grad + + # update weight + weight[:] += mom + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + # When either weight or gradient is sparse, aggregate is False. + aggregate = self.aggregate_num > 1 + for weight, grad in zip(weights, grads): + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + if aggregate: + # update `aggregate_num` number of weights in a single kernel. + # this does not support sparse weight or gradient. + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + if not multi_precision: + if self.momentum > 0: + multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + else: + multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + else: + states = list(zip(*states)) + weights32, moms = states + if self.momentum > 0: + multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, + moms, weights32)), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) + else: + multi_mp_sgd_update(*_flatten_list(zip(weights, grads, + weights32)), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) + else: + for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): + multi_precision = self.multi_precision and weight.dtype == numpy.float16 + if not multi_precision: + mom = state + if mom is not None: + sgd_mom_update(weight, grad, mom, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) + else: + # weight32 is a float32 copy of weight. + # in the kernel, we firstly update weight32, + # and then cast the result to float16 and save it to weight. + weight32, mom = state + if mom is not None: + mp_sgd_mom_update(weight, grad, mom, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override update_multi_precision. 
+ """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(SGD, self).update_multi_precision(indices, weights, grads, states) + diff --git a/python/mxnet/optimizer/sgld.py b/python/mxnet/optimizer/sgld.py new file mode 100644 index 000000000000..8a99d8f977d7 --- /dev/null +++ b/python/mxnet/optimizer/sgld.py @@ -0,0 +1,89 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""SGLD optimizer.""" +from __future__ import absolute_import +import math +from ..ndarray import clip +from ..random import normal +from .optimizer import Optimizer, register + +__all__ = ['SGLD'] + + +@register +class SGLD(Optimizer): + """Stochastic Gradient Riemannian Langevin Dynamics. + + This class implements the optimizer described in the paper *Stochastic Gradient + Riemannian Langevin Dynamics on the Probability Simplex*, available at + https://papers.nips.cc/paper/4883-stochastic-gradient-riemannian-langevin-dynamics-on-the-probability-simplex.pdf. + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.1, use_fused_step=False, **kwargs): + super(SGLD, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + + def create_state(self, index, weight): + return None + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update weight + weight[:] -= lr / 2 * grad + weight[:] += normal(0, math.sqrt(lr), shape=weight.shape, + dtype=weight.dtype, ctx=weight.context) diff --git a/python/mxnet/optimizer/signum.py b/python/mxnet/optimizer/signum.py new file mode 100644 index 000000000000..16188ccd2fb8 --- /dev/null +++ b/python/mxnet/optimizer/signum.py @@ -0,0 +1,162 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Signum optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sign) +from ..ndarray import (signsgd_update, signum_update) +from .optimizer import Optimizer, register + +__all__ = ['Signum'] + + +@register +class Signum(Optimizer): + r"""The Signum optimizer that takes the sign of gradient or momentum. + + The optimizer updates the weight by:: + + rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight + state = momentum * state + (1-momentum)*rescaled_grad + weight = (1 - lr * wd_lh) * weight - lr * sign(state) + + References + ---------- + Jeremy Bernstein, Yu-Xiang Wang, Kamyar Azizzadenesheli & Anima Anandkumar. (2018). + signSGD: Compressed Optimisation for Non-Convex Problems. In ICML'18. + + See: https://arxiv.org/abs/1802.04434 + + For details of the update algorithm see + :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.01 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, optional + The momentum value. + wd_lh : float, optional + The amount of decoupled weight decay regularization, see details in the original paper at:\ + https://arxiv.org/abs/1711.05101 + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. 
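+
+    Example (an illustrative sketch; assumes ``import mxnet as mx``)::
+
+        opt = mx.optimizer.Signum(learning_rate=0.01, momentum=0.9, use_fused_step=False)
+        weight = mx.nd.ones((2,))
+        grad = mx.nd.ones((2,)) * 0.1
+        state = opt.create_state(0, weight)
+        opt.step([0], [weight], [grad], [state])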
+ """ + def __init__(self, learning_rate=0.01, momentum=0.9, wd_lh=0.0, use_fused_step=True, **kwargs): + super(Signum, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.momentum = momentum + self.wd_lh = wd_lh + + def create_state(self, index, weight): + momentum = None + if self.momentum != 0.0: + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) + return momentum + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + if state is not None: + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + mom[:] *= self.momentum + mom[:] -= (1 - self.momentum) * grad + weight[:] *= 1 - lr * self.wd_lh + + # update weight + weight[:] += lr * sign(mom) + else: + # update weight + weight[:] *= 1 - lr * (wd + self.wd_lh) + weight[:] -= lr * sign(grad) + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if self.wd_lh: + kwargs['wd_lh'] = self.wd_lh + + # update weight with fused kernel + if state is not None: + signum_update(weight, grad, state, out=weight, + lr=lr, wd=wd, **kwargs) + else: + signsgd_update(weight, grad, out=weight, + lr=lr, wd=wd, **kwargs) + diff --git a/python/mxnet/optimizer/updater.py b/python/mxnet/optimizer/updater.py new file mode 100644 index 000000000000..03398396c449 --- /dev/null +++ b/python/mxnet/optimizer/updater.py @@ -0,0 +1,142 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Updater class.""" +from __future__ import absolute_import +import pickle +import numpy +from ..base import py_str +from ..ndarray import NDArray +from ..util import is_np_array +from .utils import _as_classic + +__all__ = ['Updater', 'get_updater'] + + +class Updater(object): + """Updater for kvstore.""" + def __init__(self, optimizer): + self.optimizer = optimizer + self.states = {} + self.states_synced = {} + self.aggregate_updates = optimizer.aggregate_num > 1 + + def __call__(self, index, grad, weight): + """Updates weight given gradient and index.""" + allow_np = self.optimizer.allow_np_array if hasattr(self.optimizer, "allow_np_array") else is_np_array() + if not isinstance(index, (list, tuple)): + indices = [index] + grads = [_as_classic(grad, allow_np)] + weights = [_as_classic(weight, allow_np)] + else: + indices = index + grads = _as_classic(grad, allow_np) + weights = _as_classic(weight, allow_np) + if weights: + self.optimizer._set_current_context(weights[0].context.device_id) + for i, idx in enumerate(indices): + # convert ctypes.char_p.value back to python str if needed + if isinstance(idx, bytes): + indices[i] = py_str(idx) + idx = indices[i] + if idx not in self.states: + self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) + self.states_synced[idx] = True + elif not self.states_synced[idx]: + self.states[idx] = \ + self.sync_state_context(self.states[idx], weights[i].context) + self.states_synced[idx] = True + if self.aggregate_updates: + # segregate values based on type + if self.optimizer.aggregate_num is not numpy.inf: + type_map = {} + for i, w, g in zip(indices, weights, grads): + if w.dtype in type_map: + type_map[w.dtype].append((i, w, g)) + else: + type_map[w.dtype] = [(i, w, g)] + for idx in type_map: + current_index = 0 + indices, weights, grads = zip(*type_map[idx]) + while current_index < len(indices): + states = [] + step = min(self.optimizer.aggregate_num, len(indices) - current_index) + for j in range(step): + states.append(self.states[indices[current_index + j]]) + self.optimizer.update_multi_precision( + indices[current_index:current_index + self.optimizer.aggregate_num], + weights[current_index:current_index + self.optimizer.aggregate_num], + grads[current_index:current_index + self.optimizer.aggregate_num], + states) + current_index += self.optimizer.aggregate_num + else: + states = [self.states[index] for index in indices] + self.optimizer.update_multi_precision(indices, weights, grads, states) + else: + for index, weight, grad in zip(indices, weights, grads): + self.optimizer.update_multi_precision([index], [weight], [grad], [self.states[index]]) + + def sync_state_context(self, state, context): + """sync state context.""" + if isinstance(state, NDArray): + return state.as_in_context(context) + elif isinstance(state, (tuple, list)): + synced_state = (self.sync_state_context(i, context) for i in state) + if 
isinstance(state, tuple): + return tuple(synced_state) + else: + return list(synced_state) + else: + return state + + def set_states(self, states): + """Sets updater states.""" + states = pickle.loads(states) + if isinstance(states, tuple) and len(states) == 2: + self.states, self.optimizer = states + else: + self.states = states + self.states_synced = dict.fromkeys(self.states.keys(), False) + + def get_states(self, dump_optimizer=False): + """Gets updater states. + + Parameters + ---------- + dump_optimizer : bool, default False + Whether to also save the optimizer itself. This would also save optimizer + information such as learning rate and weight decay schedules. + """ + return pickle.dumps((self.states, self.optimizer) if dump_optimizer else self.states) + + +def get_updater(optimizer): + """Returns a closure of the updater needed for kvstore. + + Parameters + ---------- + optimizer: Optimizer + The optimizer. + + Returns + ------- + updater: function + The closure of the updater. + """ + return Updater(optimizer) diff --git a/python/mxnet/optimizer/utils.py b/python/mxnet/optimizer/utils.py new file mode 100644 index 000000000000..af95a53ccae5 --- /dev/null +++ b/python/mxnet/optimizer/utils.py @@ -0,0 +1,43 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Optimizer utility functions.""" +from __future__ import absolute_import + + +def _flatten_list(nested_list): + return [item for sublist in nested_list for item in sublist] + + +def _as_classic(a, allow_np): + # TODO(junwu): This is a temp solution for allowing converting + # np.ndarray to mx.nd.NDArray to be fed into the optimizer since + # users may have custom optimizers implemented using mx.nd.NDArray ops. 
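The refactored Updater above keeps the kvstore-facing behaviour of the previous monolithic optimizer.py: it is still obtained through get_updater and still called as updater(index, grad, weight), matching the argument order the Trainer now passes. A minimal sketch of driving it by hand, assuming this patch is applied and with illustrative values:

    import mxnet as mx

    opt = mx.optimizer.SGD(learning_rate=0.1, momentum=0.9)
    updater = mx.optimizer.get_updater(opt)

    weight = mx.nd.ones((2, 3))
    grad = mx.nd.full((2, 3), 0.5)

    # state for index 0 is created lazily on the first call;
    # note the (index, grad, weight) argument order
    updater(0, grad, weight)

    # states (and optionally the optimizer itself) can be round-tripped for checkpointing
    blob = updater.get_states(dump_optimizer=False)
    updater.set_states(blob)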
+ from ..numpy import ndarray as np_ndarray + if isinstance(a, (tuple, list)): + if any(isinstance(x, np_ndarray) for x in a): + if allow_np: + return [x.as_nd_ndarray() for x in a] + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + else: + if isinstance(a, np_ndarray): + if allow_np: + return a.as_nd_ndarray() + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + return a \ No newline at end of file diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index c60e5bc22201..d3b056c6efdc 100755 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -2244,6 +2244,7 @@ def verify_generator(generator, buckets, probs, nsamples=1000000, nrepeat=5, suc str(buckets), str(probs))) return cs_ret_l + def compare_ndarray_tuple(t1, t2, rtol=None, atol=None): """Compare ndarray tuple.""" if t1 is None or t2 is None: @@ -2256,11 +2257,14 @@ def compare_ndarray_tuple(t1, t2, rtol=None, atol=None): assert_almost_equal(t1, t2, rtol=rtol, atol=atol) -def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='default', - rtol=1e-4, atol=1e-5, compare_states=True, ntensors=1): +def compare_optimizer(opt1, opt2, shapes, dtype, w_stype='default', g_stype='default', + rtol=1e-4, atol=1e-5, compare_states=True): """Compare opt1 and opt2.""" - if not isinstance(shape, list): - assert(ntensors == 1) + + w1_list, w2_list = [], [] + g1_list, g2_list = [], [] + s1_list, s2_list = [], [] + for i, shape in enumerate(shapes): if w_stype == 'default': w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) w1 = w2.copyto(default_context()) @@ -2277,37 +2281,77 @@ def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='defa g1 = g2.copyto(default_context()).tostype('default') else: raise Exception("type not supported yet") + s1 = opt1.create_state_multi_precision(i, w1) + s2 = opt2.create_state_multi_precision(i, w2) - state1 = opt1.create_state_multi_precision(0, w1) - state2 = opt2.create_state_multi_precision(0, w2) if compare_states: - compare_ndarray_tuple(state1, state2) + compare_ndarray_tuple(s1, s2) + + w1_list.append(w1) + w2_list.append(w2) + g1_list.append(g1) + g2_list.append(g2) + s1_list.append(s1) + s2_list.append(s2) + + indices = list(range(len(shapes))) + opt1.update_multi_precision(indices, w1_list, g1_list, s1_list) + opt2.update_multi_precision(indices, w2_list, g2_list, s2_list) + if compare_states: + compare_ndarray_tuple(tuple(s1_list), tuple(s2_list), rtol=rtol, atol=atol) + compare_ndarray_tuple(tuple(w1_list), tuple(w2_list), rtol=rtol, atol=atol) + + +def compare_optimizer_noise_seeded(opt1, opt2, shapes, dtype, noise_seed, + w_stype='default', g_stype='default', + rtol=1e-4, atol=1e-5, compare_states=True): + """Compare opt1 and opt2 with the added functionality that the seed for generating random noise + in the SGLD optimizer update is set so that the same noise is used in opt1 and opt2. 
+ + """ + w1_list, w2_list = [], [] + g1_list, g2_list = [], [] + s1_list, s2_list = [], [] + for i, shape in enumerate(shapes): + if w_stype == 'default': + w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + w1 = w2.copyto(default_context()) + elif w_stype == 'row_sparse' or w_stype == 'csr': + w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) + w1 = w2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + if g_stype == 'default': + g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + g1 = g2.copyto(default_context()) + elif g_stype == 'row_sparse' or g_stype == 'csr': + g2 = rand_ndarray(shape, g_stype, dtype=dtype) + g1 = g2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + s1 = opt1.create_state_multi_precision(i, w1) + s2 = opt2.create_state_multi_precision(i, w2) - opt1.update_multi_precision(0, w1, g1, state1) - opt2.update_multi_precision(0, w2, g2, state2) if compare_states: - compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) - assert_almost_equal(w1, w2, rtol=rtol, atol=atol) - else: - # test multi-tensor: Opt1 single-tensor reference, Opt2 multi-tensor - from copy import deepcopy - w1, g1 = [], [] - for s in shape: - w1.append(mx.random.uniform(shape=s, ctx=default_context(), dtype=dtype)) - g1.append(mx.random.uniform(shape=s, ctx=default_context(), dtype=dtype)) - w1 = tuple(w1) - w2 = deepcopy(w1) - g1 = tuple(g1) - g2 = deepcopy(g1) - state2 = [opt2.create_state_multi_precision(0, w2[i]) for i in range(ntensors)] - - opt2.update_multi_precision(list(range(ntensors)), w2, g2, state2) - for i in range(ntensors): - state1 = opt1.create_state_multi_precision(i, w1[i]) - opt1.update_multi_precision(i, w1[i], g1[i], state1) - if compare_states: - compare_ndarray_tuple(state1, state2[i], rtol, atol) - compare_ndarray_tuple(w1[i], w2[i], rtol, atol) + compare_ndarray_tuple(s1, s2) + + w1_list.append(w1) + w2_list.append(w2) + g1_list.append(g1) + g2_list.append(g2) + s1_list.append(s1) + s2_list.append(s2) + + indices = list(range(len(shapes))) + # set seed for Gaussian noise replication + mx.random.seed(noise_seed) + opt1.update_multi_precision(indices, w1_list, g1_list, s1_list) + mx.random.seed(noise_seed) + opt2.update_multi_precision(indices, w2_list, g2_list, s2_list) + if compare_states: + compare_ndarray_tuple(tuple(s1_list), tuple(s2_list), rtol=rtol, atol=atol) + compare_ndarray_tuple(tuple(w1_list), tuple(w2_list), rtol=rtol, atol=atol) + def same_symbol_structure(sym1, sym2): """Compare two symbols to check if they have the same computation graph structure. 
diff --git a/src/operator/contrib/optimizer_op-inl.h b/src/operator/contrib/optimizer_op-inl.h index fd556a4231cb..2276b9375012 100644 --- a/src/operator/contrib/optimizer_op-inl.h +++ b/src/operator/contrib/optimizer_op-inl.h @@ -130,7 +130,7 @@ template struct GroupAdagradDnsRspKernel { // clang-format off const DType grad_rescaled = get_grad_rescaled(j); index_t data_j = get_data_j(j); - const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); + const DType div = lr * grad_rescaled / (square_root::Map(state_data[grad_idx[i]]) + eps); out_data[data_j] = weight_data[data_j] - div; // clang-format on } diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc index 83bbcdab833d..be6d30587368 100644 --- a/src/operator/contrib/optimizer_op.cc +++ b/src/operator/contrib/optimizer_op.cc @@ -61,7 +61,7 @@ Updates are applied by:: grad = clip(grad * rescale_grad, clip_gradient) history += mean(square(grad), axis=1, keepdims=True) - div = grad / sqrt(history + float_stable_eps) + div = grad / (sqrt(history) + epsilon) weight -= div * lr Weights are updated lazily if the gradient is sparse. diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 2df574c46909..b7dea1015bdb 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -231,23 +231,18 @@ struct MultiSGDKernel { if ((size_t)i < param.sizes[index]) { MPDType w = has_mixed_precision ? param.weights32[index][i] : MPDType(param.weights[index][i]); - MPDType mom = has_momentum ? param.mom[index][i] : MPDType(0); + MPDType rescale_grad = param.rescale_grad * static_cast(param.grads[index][i]); if (param.clip_gradient >= 0.0f) { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index] - *mshadow_op::clip::Map(param.rescale_grad * - static_cast(param.grads[index][i]), - param.clip_gradient); - } else { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index]*param.rescale_grad*static_cast(param.grads[index][i]); + rescale_grad = mshadow_op::clip::Map(rescale_grad, param.clip_gradient); } + rescale_grad += param.wds[index] * w; if (has_momentum) { - param.mom[index][i] = mom; + param.mom[index][i] *= param.momentum; + param.mom[index][i] -= param.lrs[index] * rescale_grad; + w = w + param.mom[index][i]; + } else { + w -= param.lrs[index] * rescale_grad; } - w = w + mom; if (has_mixed_precision) { param.weights32[index][i] = w; } @@ -385,16 +380,12 @@ struct SGDKernel { const DType* grad_data, const DType param_clip_gradient, const DType param_lr, const DType param_wd, const DType param_rescale_grad, const OpReqType req) { + DType rescale_grad = param_rescale_grad * grad_data[i]; if (param_clip_gradient >= 0.0f) { - KERNEL_ASSIGN(out_data[i], req, - (1.f-param_lr*param_wd)*weight_data[i] - - (param_lr) - * mshadow_op::clip::Map(param_rescale_grad*grad_data[i], param_clip_gradient)); - } else { - KERNEL_ASSIGN(out_data[i], req, - (1.f-param_lr*param_wd)*weight_data[i] - - (param_lr*param_rescale_grad)*grad_data[i]); + rescale_grad = mshadow_op::clip::Map(rescale_grad, param_clip_gradient); } + rescale_grad += param_wd * weight_data[i]; + KERNEL_ASSIGN(out_data[i], req, weight_data[i] - (param_lr * rescale_grad)); } }; @@ -439,13 +430,12 @@ struct SGDDnsRspKernel { const dim_t col_id = i % row_length; const dim_t row_offset = grad_idx[row_id] * row_length; const dim_t data_i = row_offset + col_id; + DType grad_rescaled = rescale_grad * grad_val[i]; if (clip_gradient >= 
0.0f) { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr) * mshadow_op::clip::Map(rescale_grad * grad_val[i], clip_gradient)); - } else { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr * rescale_grad) * grad_val[i]); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight[data_i]; + KERNEL_ASSIGN(out[data_i], req, weight[data_i] - (lr * grad_rescaled)); } }; @@ -464,13 +454,12 @@ struct SGDDnsRspKernel { for (index_t j = 0; j < row_length; j++) { index_t data_i = grad_idx[i] * row_length + j; index_t grad_i = i * row_length + j; + DType grad_rescaled = rescale_grad * grad_val[grad_i]; if (clip_gradient >= 0.0f) { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr) * mshadow_op::clip::Map(rescale_grad * grad_val[grad_i], clip_gradient)); - } else { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr * rescale_grad) * grad_val[grad_i]); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight[data_i]; + KERNEL_ASSIGN(out[data_i], req, weight[data_i] - (lr * grad_rescaled)); } } }; @@ -505,7 +494,7 @@ inline void SGDUpdateDnsRspImpl(const SGDParam& param, // apply standard weight decay if not lazy update if (!param.lazy_update) { Kernel, xpu>::Launch(s, weight.Size(), - weight_data, weight_data, static_cast(1 - param.lr * param.wd)); + weight_data, weight_data, static_cast(1 - param.lr * param.wd)); wd = 0; } if (!grad.storage_initialized()) return; @@ -604,16 +593,13 @@ struct SGDMomKernel { const DType* grad_data, const DType param_clip_gradient, const DType param_momentum, const DType param_lr, const DType param_wd, const DType param_rescale_grad, const OpReqType req) { + DType rescale_grad = param_rescale_grad * grad_data[i]; if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i] - - param_lr*param_wd*weight_data[i] - - param_lr - *mshadow_op::clip::Map(param_rescale_grad*grad_data[i], param_clip_gradient); - } else { - mom_data[i] = param_momentum*mom_data[i] - - param_lr*param_wd*weight_data[i] - - param_lr*param_rescale_grad*grad_data[i]; + rescale_grad = mshadow_op::clip::Map(rescale_grad, param_clip_gradient); } + rescale_grad += param_wd * weight_data[i]; + mom_data[i] *= param_momentum; + mom_data[i] -= param_lr * rescale_grad; KERNEL_ASSIGN(out_data[i], req, weight_data[i] + mom_data[i]); } }; @@ -658,20 +644,15 @@ struct MP_SGDKernel { const DType* grad_data, float* weight32, const float param_clip_gradient, const float param_lr, const float param_wd, const float param_rescale_grad, const OpReqType req) { + float w = weight32[i]; + float rescale_grad = param_rescale_grad * static_cast(grad_data[i]); if (param_clip_gradient >= 0.0f) { - float w = weight32[i]; - w = (1.f - param_lr*param_wd)*w - - (param_lr) * mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, (DType)w); - } else { - float w = weight32[i]; - w = (1.f-param_lr*param_wd)*w - - (param_lr*param_rescale_grad)*static_cast(grad_data[i]); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, (DType)w); + rescale_grad = mshadow_op::clip::Map(rescale_grad, param_clip_gradient); } + rescale_grad += param_wd * w; + w -= param_lr * rescale_grad; + weight32[i] = w; + KERNEL_ASSIGN(out_data[i], req, (DType)w); } }; @@ -704,17 +685,13 @@ struct MP_SGDMomKernel { const float param_wd, const float param_rescale_grad, const 
OpReqType req) { float w = weight32[i]; float mom = mom_data[i]; + float grad_rescaled = param_rescale_grad*static_cast(grad_data[i]); if (param_clip_gradient >= 0.0f) { - mom = param_momentum*mom - - param_lr*param_wd*w - - param_lr - *mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient); - } else { - mom = param_momentum*mom - - param_lr*param_wd*w - - param_lr*param_rescale_grad*static_cast(grad_data[i]); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, param_clip_gradient); } + grad_rescaled += param_wd * w; + mom *= param_momentum; + mom -= param_lr * grad_rescaled; mom_data[i] = mom; w = w + mom; weight32[i] = w; @@ -753,21 +730,16 @@ struct SGDMomDnsRspDnsKernel { DType* mom_data, const DType* weight_data, const IType* grad_idx, const DType* grad_data, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { - const DType rate = lr * wd; for (index_t j = 0; j < row_length; j++) { index_t data_i = grad_idx[i] * row_length + j; index_t grad_i = i * row_length + j; + DType grad_rescaled = rescale_grad * grad_data[grad_i]; if (clip_gradient >= 0.0f) { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * - mshadow_op::clip::Map(rescale_grad * grad_data[grad_i], - clip_gradient); - } else { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * rescale_grad * grad_data[grad_i]; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[data_i]; + mom_data[data_i] *= momentum; + mom_data[data_i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]); } } @@ -781,21 +753,16 @@ struct SGDMomDnsRspDnsKernel { const DType* grad_data, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { using nnvm::dim_t; - const DType rate = lr * wd; const dim_t row_id = i / row_length; const dim_t col_id = i % row_length; const dim_t data_i = grad_idx[row_id] * row_length + col_id; + DType grad_rescaled = rescale_grad * grad_data[i]; if (clip_gradient >= 0.0f) { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * - mshadow_op::clip::Map(rescale_grad * grad_data[i], - clip_gradient); - } else { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * rescale_grad * grad_data[i]; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[data_i]; + mom_data[data_i] *= momentum; + mom_data[data_i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]); } }; @@ -1065,20 +1032,15 @@ struct NAGMomKernel { const DType param_clip_gradient, const DType param_momentum, const DType param_lr, const DType param_wd, const DType param_rescale_grad, const OpReqType req) { + DType grad_rescaled = param_rescale_grad * grad_data[i]; if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i]; - KERNEL_ASSIGN(out_data[i], req, weight_data[i]-mom_data[i]+(param_momentum+1) - *(mom_data[i]-(param_lr*(mshadow_op::clip::Map(param_rescale_grad - *grad_data[i], param_clip_gradient)+(param_wd*weight_data[i]))))); - mom_data[i] = mom_data[i] - (param_lr*((mshadow_op::clip::Map(param_rescale_grad*grad_data[i], - param_clip_gradient))+(param_wd*weight_data[i]))); - } else { - mom_data[i] = param_momentum*mom_data[i]; - KERNEL_ASSIGN(out_data[i], 
req, weight_data[i]-mom_data[i]+(param_momentum+1) - *(mom_data[i]-(param_lr*(param_rescale_grad*grad_data[i]+param_wd*weight_data[i])))); - mom_data[i] = mom_data[i] - param_lr*((param_rescale_grad*grad_data[i]) - +(param_wd*weight_data[i])); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, param_clip_gradient); } + grad_rescaled += param_wd * weight_data[i]; + mom_data[i] *= param_momentum; + mom_data[i] -= param_lr * grad_rescaled; + KERNEL_ASSIGN(out_data[i], req, weight_data[i] + (param_momentum * mom_data[i]) + - (param_lr * grad_rescaled)); } }; @@ -1115,25 +1077,16 @@ struct MP_NAGMomKernel { const float param_wd, const float param_rescale_grad, const OpReqType req) { float w = weight32[i]; + float grad_rescaled = param_rescale_grad * static_cast(grad_data[i]); if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i]; - w = w-mom_data[i]+(param_momentum+1)*(mom_data[i]-param_lr - *(mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient)+(param_wd*w))); - mom_data[i] = mom_data[i] - param_lr - *((mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient))+(param_wd*w)); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, w); - } else { - mom_data[i] = param_momentum*mom_data[i]; - w = w-mom_data[i]+(param_momentum+1)*(mom_data[i]-param_lr - *(param_rescale_grad*static_cast(grad_data[i])+(param_wd*w))); - mom_data[i] = mom_data[i] - param_lr - *((param_rescale_grad*static_cast(grad_data[i]))+(param_wd*w)); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, w); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, param_clip_gradient); } + grad_rescaled += param_wd * w; + mom_data[i] *= param_momentum; + mom_data[i] -= param_lr * grad_rescaled; + w += (param_momentum * mom_data[i]) - (param_lr * grad_rescaled); + weight32[i] = w; + KERNEL_ASSIGN(out_data[i], req, w); } }; @@ -1211,7 +1164,7 @@ struct FTMLKernel { const OpReqType req) { using namespace mshadow_op; const DType grad_i = clip_grad >= 0.0f - ? clip::Map(rescale_grad * grad[i] + wd * weight[i], clip_grad) + ? 
clip::Map(rescale_grad * grad[i], clip_grad) + wd * weight[i] : (rescale_grad * grad[i] + wd * weight[i]); v[i] = beta2 * v[i] + (1 - beta2) * square::Map(grad_i); const DType d_t = (1 - power::Map(beta1, t)) / lr * @@ -1299,10 +1252,11 @@ struct AdamUpdateKernel { const DType epsilon, const OpReqType req) { using namespace mshadow_op; - DType grad_rescaled = grad_data[i] * rescale_grad + weight_data[i] * wd; + DType grad_rescaled = grad_data[i] * rescale_grad; if (clip_gradient >= 0.f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[i] * wd; mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; var_data[i] = beta2 * var_data[i] + @@ -1362,17 +1316,13 @@ struct AdamDnsRspDnsKernel { const dim_t data_i = row_offset + j; // index in grad const dim_t grad_i = i * row_length + j; - const DType grad_rescaled = grad_data[grad_i] * rescale_grad + weight_data[data_i] * wd; + DType grad_rescaled = grad_data[grad_i] * rescale_grad; if (clip_gradient >= 0.0f) { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * - clip::Map(grad_rescaled, clip_gradient); - var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * square::Map( - clip::Map(grad_rescaled, clip_gradient)); - } else { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; - var_data[data_i] = beta2 * var_data[data_i] + - (1.f - beta2) * grad_rescaled * grad_rescaled; + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[data_i] * wd; + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; + var_data[data_i] = beta2 * var_data[data_i] +(1.f - beta2) * grad_rescaled * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] - lr * mean_data[data_i] / (square_root::Map(var_data[data_i]) + epsilon)); } @@ -1395,10 +1345,11 @@ struct AdamDnsRspDnsKernel { // index in data/mean/var const dim_t data_i = row_offset + col_id; // index in grad - DType grad_rescaled = grad_data[i] * rescale_grad + weight_data[data_i] * wd; + DType grad_rescaled = grad_data[i] * rescale_grad; if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[data_i] * wd; mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * grad_rescaled * grad_rescaled; @@ -1914,8 +1865,8 @@ inline void MPLambUpdatePhaseTwo(const nnvm::NodeAttrs& attrs, // by Alex Graves, 2013. 
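A pattern worth calling out in the kernel rewrites above (MultiSGD, SGD, SGD momentum, NAG, FTML, Adam and their mixed-precision and sparse variants): the gradient is now always rescaled first, then clipped, and only then has the weight-decay term added, instead of folding wd into the clipped value or into a (1 - lr*wd) factor on the weight. In NumPy-style pseudocode the shared preprocessing is roughly the following sketch (illustrative, not part of the patch):

    import numpy as np

    def preprocess_grad(grad, weight, rescale_grad, clip_gradient, wd):
        # rescale -> clip -> add weight decay, in that order
        g = rescale_grad * grad
        if clip_gradient is not None and clip_gradient >= 0:
            g = np.clip(g, -clip_gradient, clip_gradient)
        return g + wd * weight

    # e.g. the rewritten SGD kernel then reduces to:
    #   weight -= lr * preprocess_grad(grad, weight, rescale_grad, clip_gradient, wd)

A related change in the same files moves epsilon outside the square root for the AdaGrad-family and RMSProp kernels, i.e. grad / (sqrt(state) + epsilon) rather than grad / sqrt(state + epsilon).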
struct RMSPropAlexParam : public dmlc::Parameter { float lr; - float gamma1; - float gamma2; + float rho; + float momentum; float epsilon; float wd; float rescale_grad; @@ -1924,9 +1875,9 @@ struct RMSPropAlexParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(RMSPropAlexParam) { DMLC_DECLARE_FIELD(lr) .describe("Learning rate"); - DMLC_DECLARE_FIELD(gamma1).set_default(0.95f) + DMLC_DECLARE_FIELD(rho).set_default(0.95f) .describe("Decay rate."); - DMLC_DECLARE_FIELD(gamma2).set_default(0.9f) + DMLC_DECLARE_FIELD(momentum).set_default(0.9f) .describe("Decay rate."); DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f) .describe("A small constant for numerical stability."); @@ -1956,25 +1907,26 @@ struct RMSPropAlexUpdateKernel { DType* state_n_data, DType* state_g_data, DType* delta_data, const DType* weight_data, const DType* grad_data, const DType clip_gradient, const DType rescale_grad, - const DType gamma1, const DType gamma2, + const DType rho, const DType momentum, const DType lr, const DType wd, const DType clip_weights, const DType epsilon, const OpReqType req) { using namespace mshadow_op; - DType grad_rescaled = rescale_grad * grad_data[i] + wd * weight_data[i]; + DType grad_rescaled = rescale_grad * grad_data[i]; if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[i]; - state_n_data[i] = (1.f - gamma1) * grad_rescaled * grad_rescaled + - gamma1 * state_n_data[i]; - state_g_data[i] = (1.f - gamma1) * grad_rescaled + - gamma1 * state_g_data[i]; - delta_data[i] = gamma2 * delta_data[i] - + state_n_data[i] = (1.f - rho) * square::Map(grad_rescaled) + + rho * state_n_data[i]; + state_g_data[i] = (1.f - rho) * grad_rescaled + + rho * state_g_data[i]; + delta_data[i] = momentum * delta_data[i] - (lr * (grad_rescaled) / (square_root::Map(state_n_data[i] - - state_g_data[i] * state_g_data[i] + epsilon))); + square::Map(state_g_data[i]) + epsilon))); if (clip_weights >= 0.0f) { const DType clipped_weight = clip::Map(weight_data[i] + delta_data[i], clip_weights); @@ -1997,15 +1949,15 @@ inline void RMSPropAlexUpdate(const nnvm::NodeAttrs &attrs, MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { DType* weight_data = inputs[0].dptr(); DType* grad_data = inputs[1].dptr(); - DType* state_n_data = inputs[2].dptr(); - DType* state_g_data = inputs[3].dptr(); + DType* state_g_data = inputs[2].dptr(); + DType* state_n_data = inputs[3].dptr(); DType* delta_data = inputs[4].dptr(); DType* out_data = outputs[0].dptr(); Kernel::Launch(s, inputs[0].shape_.Size(), out_data, state_n_data, state_g_data, delta_data, weight_data, grad_data, static_cast(param.clip_gradient), static_cast(param.rescale_grad), - static_cast(param.gamma1), static_cast(param.gamma2), + static_cast(param.rho), static_cast(param.momentum), static_cast(param.lr), static_cast(param.wd), static_cast(param.clip_weights), static_cast(param.epsilon), req[0]); }); @@ -2016,7 +1968,7 @@ inline void RMSPropAlexUpdate(const nnvm::NodeAttrs &attrs, // by Tieleman & Hinton, 2012 struct RMSPropParam : public dmlc::Parameter { float lr; - float gamma1; + float rho; float epsilon; float wd; float rescale_grad; @@ -2025,7 +1977,7 @@ struct RMSPropParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(RMSPropParam) { DMLC_DECLARE_FIELD(lr) .describe("Learning rate"); - DMLC_DECLARE_FIELD(gamma1).set_default(0.95f) + DMLC_DECLARE_FIELD(rho).set_default(0.95f) .describe("The decay rate of momentum estimates."); DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f) .describe("A small 
constant for numerical stability."); @@ -2055,20 +2007,21 @@ struct RMSPropUpdateKernel { DType* out_data, DType* state_n_data, const DType* weight_data, const DType* grad_data, const DType clip_gradient, const DType rescale_grad, - const DType gamma1, const DType lr, const DType wd, + const DType rho, const DType lr, const DType wd, const DType clip_weights, const DType epsilon, const OpReqType req) { using namespace mshadow_op; - DType grad_rescaled = rescale_grad * grad_data[i] + wd * weight_data[i]; + DType grad_rescaled = rescale_grad * grad_data[i]; if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[i]; - state_n_data[i] = (1.f - gamma1) * (grad_rescaled * grad_rescaled) + gamma1 * state_n_data[i]; + state_n_data[i] = (1.f - rho) * square::Map(grad_rescaled) + rho * state_n_data[i]; DType weight = weight_data[i] - - lr * (grad_rescaled / square_root::Map(state_n_data[i] + epsilon)); + lr * (grad_rescaled) / (square_root::Map(state_n_data[i]) + epsilon); if (clip_weights >= 0.0f) { weight = clip::Map(weight, clip_weights); } @@ -2093,7 +2046,7 @@ inline void RMSPropUpdate(const nnvm::NodeAttrs &attrs, const OpContext &ctx, Kernel::Launch(s, inputs[0].shape_.Size(), out_data, state_n_data, weight_data, grad_data, static_cast(param.clip_gradient), static_cast(param.rescale_grad), - static_cast(param.gamma1), static_cast(param.lr), static_cast(param.wd), + static_cast(param.rho), static_cast(param.lr), static_cast(param.wd), static_cast(param.clip_weights), static_cast(param.epsilon), req[0]); }); } @@ -2150,10 +2103,9 @@ struct FtrlUpdateKernel { weight_data[i] / lr; n_data[i] += square::Map(grad_rescaled); - KERNEL_ASSIGN(out_data[i], req, - (sign::Map(z_data[i]) * lamda1 - z_data[i]) / - ((beta + square_root::Map(n_data[i])) / lr + wd) * - gt::Map(abs::Map(z_data[i]), lamda1)); + DType d = - sign::Map(z_data[i]) * maximum::Map(abs::Map(z_data[i]) - lamda1, + static_cast(0)); + KERNEL_ASSIGN(out_data[i], req, d / ((beta + square_root::Map(n_data[i])) / lr + wd)); } }; @@ -2197,23 +2149,18 @@ struct FtrlDnsRspDnsKernel { const dim_t data_i = row_offset + j; // index in grad const dim_t grad_i = i * row_length + j; - const DType grad_rescaled = grad_data[grad_i] * rescale_grad; + DType grad_rescaled = grad_data[grad_i] * rescale_grad; if (clip_gradient >= 0.0f) { - z_data[data_i] += clip::Map(grad_rescaled, clip_gradient) - - (square_root::Map(n_data[data_i] + - square::Map(clip::Map(grad_rescaled, clip_gradient))) - - square_root::Map(n_data[data_i])) * weight_data[data_i] / lr; - n_data[data_i] += square::Map(clip::Map(grad_rescaled, clip_gradient)); - } else { - z_data[data_i] += grad_rescaled - (square_root::Map(n_data[data_i] + - square::Map(grad_rescaled)) - square_root::Map(n_data[data_i])) * - weight_data[data_i] / lr; - n_data[data_i] += square::Map(grad_rescaled); + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } - KERNEL_ASSIGN(out_data[data_i], req, - (sign::Map(z_data[data_i]) * lamda1 - z_data[data_i]) / - ((beta + square_root::Map(n_data[data_i])) / lr + wd) * - gt::Map(abs::Map(z_data[data_i]), lamda1)); + z_data[data_i] += grad_rescaled - (square_root::Map(n_data[data_i] + + square::Map(grad_rescaled)) - square_root::Map(n_data[data_i])) * + weight_data[data_i] / lr; + n_data[data_i] += square::Map(grad_rescaled); + + DType d = - sign::Map(z_data[data_i]) * maximum::Map(abs::Map(z_data[data_i]) - lamda1, + static_cast(0)); + KERNEL_ASSIGN(out_data[data_i], req, d / ((beta + 
square_root::Map(n_data[data_i])) / lr + wd)); } } }; @@ -2523,7 +2470,7 @@ struct AdagradDnsRspDnsKernel { } const DType grad_squared = grad_rescaled * grad_rescaled; state_data[data_j] += grad_squared; - const DType div = grad_rescaled / square_root::Map(state_data[data_j] + epsilon); + const DType div = grad_rescaled / (square_root::Map(state_data[data_j]) + epsilon); // No need to use KERNEL_ASSIGN, as we already checked req is kWriteInplace out_data[data_j] = weight_data[data_j] - div * lr; } @@ -2548,7 +2495,7 @@ struct AdagradDnsRspDnsKernel { } const DType grad_squared = grad_rescaled * grad_rescaled; state_data[data_i] += grad_squared; - const DType div = grad_rescaled / square_root::Map(state_data[data_i] + epsilon); + const DType div = grad_rescaled / (square_root::Map(state_data[data_i]) + epsilon); // No need to use KERNEL_ASSIGN, as we already checked req is kWriteInplace out_data[data_i] = weight_data[data_i] - div * lr; } diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index 93e1267cc8c7..2ac3673e4a09 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -112,7 +112,6 @@ struct SGDMomStdDnsRspDnsKernel { DType* mom_data, const DType* weight_data, const IType* grad_idx, const DType* grad_data, const RType* prefix_sum, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { - const DType rate = lr * wd; const bool non_zero = (i == 0) ? prefix_sum[0] > 0 : prefix_sum[i] > prefix_sum[i-1]; @@ -122,17 +121,13 @@ struct SGDMomStdDnsRspDnsKernel { const index_t data_i = row_i + j; const DType grad = non_zero ? grad_data[grad_i + j] : static_cast(0); + DType grad_rescaled = rescale_grad * grad; if (clip_gradient >= 0.0f) { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * - mshadow_op::clip::Map(rescale_grad * grad, - clip_gradient); - } else { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * rescale_grad * grad; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[data_i]; + mom_data[data_i] *= momentum; + mom_data[data_i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]); } } @@ -208,20 +203,16 @@ struct AdamStdDnsRspDnsKernel { const RType grad_i = (prefix_sum[i]-1) * row_length; for (index_t j = 0; j < row_length; j++) { const index_t data_i = row_i + j; - const DType grad_rescaled = non_zero ? static_cast( - grad_data[grad_i + j] * rescale_grad + - weight_data[data_i] * wd) - : static_cast(weight_data[data_i] * wd); + DType grad_rescaled = non_zero ? 
static_cast( + grad_data[grad_i + j] * rescale_grad) + : static_cast(0); if (clip_gradient >= 0.0f) { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * - clip::Map(grad_rescaled, clip_gradient); - var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * square::Map( - clip::Map(grad_rescaled, clip_gradient)); - } else { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; - var_data[data_i] = beta2 * var_data[data_i] + - (1.f - beta2) * square::Map(grad_rescaled); + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[data_i] * wd; + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; + var_data[data_i] = beta2 * var_data[data_i] + + (1.f - beta2) * square::Map(grad_rescaled); KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] - lr * mean_data[data_i] / (square_root::Map(var_data[data_i]) + epsilon)); } @@ -780,7 +771,7 @@ gradient and :math:`E[g^2]_t` is the decaying average over past squared gradient The :math:`E[g^2]_t` is given by: .. math:: - E[g^2]_t = \gamma * E[g^2]_{t-1} + (1-\gamma) * g_t^2 + E[g^2]_t = \rho * E[g^2]_{t-1} + (1-\rho) * g_t^2 The update step is @@ -791,7 +782,7 @@ The RMSProp code follows the version in http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf Tieleman & Hinton, 2012. -Hinton suggests the momentum term :math:`\gamma` to be 0.9 and the learning rate +Hinton suggests the momentum term :math:`\rho` to be 0.9 and the learning rate :math:`\eta` to be 0.001. )code" ADD_FILELINE) @@ -819,19 +810,19 @@ Define :math:`E[g^2]_t` is the decaying average over past squared gradient and :math:`E[g]_t` is the decaying average over past gradient. .. math:: - E[g^2]_t = \gamma_1 * E[g^2]_{t-1} + (1 - \gamma_1) * g_t^2\\ - E[g]_t = \gamma_1 * E[g]_{t-1} + (1 - \gamma_1) * g_t\\ - \Delta_t = \gamma_2 * \Delta_{t-1} - \frac{\eta}{\sqrt{E[g^2]_t - E[g]_t^2 + \epsilon}} g_t\\ + E[g^2]_t = \rho * E[g^2]_{t-1} + (1 - \rho) * g_t^2\\ + E[g]_t = \rho * E[g]_{t-1} + (1 - \rho) * g_t\\ + momentum_t = \gamma * momentum_{t-1} - \frac{\eta}{\sqrt{E[g^2]_t - E[g]_t^2 + \epsilon}} g_t\\ The update step is .. math:: - \theta_{t+1} = \theta_t + \Delta_t + \theta_{t+1} = \theta_t + momentum_t The RMSPropAlex code follows the version in http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. -Graves suggests the momentum term :math:`\gamma_1` to be 0.95, :math:`\gamma_2` +Graves suggests the momentum term :math:`\rho` to be 0.95, :math:`\gamma` to be 0.9 and the learning rate :math:`\eta` to be 0.0001. )code" ADD_FILELINE) .set_num_inputs(5) diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 6920cb06e482..b67fbf890cbe 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -37,7 +37,6 @@ struct SGDMomStdDnsRspDnsKernel { const DType* grad_data, const RType* prefix_sum, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { using nnvm::dim_t; - const DType rate = lr * wd; const dim_t row_id = i / row_length; const dim_t col_id = i % row_length; const dim_t nnr = prefix_sum[row_id]; @@ -46,14 +45,13 @@ struct SGDMomStdDnsRspDnsKernel { const RType grad_i = (nnr - 1) * row_length + col_id; const DType grad = non_zero ? 
grad_data[grad_i] : static_cast(0); + DType grad_rescaled = rescale_grad * grad; if (clip_gradient >= 0.0f) { - mom_data[i] = momentum * mom_data[i] - - rate * weight_data[i] - - lr * mshadow_op::clip::Map(rescale_grad * grad, clip_gradient); - } else { - mom_data[i] = momentum * mom_data[i] - - rate * weight_data[i] - lr * rescale_grad * grad; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[i]; + mom_data[i] *= momentum; + mom_data[i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[i], req, weight_data[i] + mom_data[i]); } }; @@ -139,12 +137,12 @@ struct AdamStdDnsRspDnsKernel { const bool non_zero = (row_id == 0) ? prefix_sum[0] > 0 : prefix_sum[row_id] > prefix_sum[row_id - 1]; const RType grad_offset = (prefix_sum[row_id] - 1) * row_length + col_id; - DType grad_rescaled = non_zero ? static_cast(grad_data[grad_offset] * rescale_grad - + weight_data[i] * wd) - : static_cast(weight_data[i] * wd); + DType grad_rescaled = non_zero ? static_cast(grad_data[grad_offset] * rescale_grad) + : static_cast(0); if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[i] * wd; mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; var_data[i] = beta2 * var_data[i] + (1.f - beta2) * square::Map(grad_rescaled); diff --git a/src/optimizer/sgd-inl.h b/src/optimizer/sgd-inl.h index 12738f8e4053..00afe2ad079a 100644 --- a/src/optimizer/sgd-inl.h +++ b/src/optimizer/sgd-inl.h @@ -82,7 +82,7 @@ void sgd_mom_update(RunContext ctx, TBlob weight, const TBlob grad, TBlob mom, Tensor grad2d = grad.FlatTo2D(s); if (param.clip_gradient > 0.0f) { mom2d = param.momentum*mom2d - - lr*(param.rescale_grad*F(grad2d, param.clip_gradient) + wd*weight2d); + lr*(F(param.rescale_grad * grad2d, param.clip_gradient) + wd*weight2d); } else { mom2d = param.momentum*mom2d - lr*(param.rescale_grad*grad2d + wd*weight2d); } @@ -98,7 +98,7 @@ void sgd_update(RunContext ctx, TBlob weight, const TBlob grad, Tensor weight2d = weight.FlatTo2D(s); Tensor grad2d = grad.FlatTo2D(s); if (param.clip_gradient >= 0.0f) { - weight2d -= lr*(param.rescale_grad*F(grad2d, param.clip_gradient) + + weight2d -= lr*(F(param.rescale_grad * grad2d, param.clip_gradient) + wd*weight2d); } else { weight2d -= lr*(param.rescale_grad*grad2d + wd*weight2d); diff --git a/tests/python/unittest/test_contrib_optimizer.py b/tests/python/unittest/test_contrib_optimizer.py index 7cfd0217aa31..5f7c51f257b3 100644 --- a/tests/python/unittest/test_contrib_optimizer.py +++ b/tests/python/unittest/test_contrib_optimizer.py @@ -25,76 +25,39 @@ from common import with_seed -# * GroupAdaGrad -class PyGroupAdaGrad(mx.optimizer.Optimizer): - """The python reference of Group AdaGrad optimizer. - - Parameters - ---------- - eps: float, optional - Small value to avoid division by 0. 
- - """ - - def __init__(self, eps=1e-5, **kwargs): - super(PyGroupAdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps - - def create_state(self, index, weight): - assert len(weight.shape) == 2 - history = mx.nd.zeros( - (weight.shape[0], 1), weight.context, stype=weight.stype) - return history - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - assert wd == 0 - - history = state - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += mx.nd.mean(mx.nd.square(grad), axis=1, keepdims=True) - div = lr * grad / mx.nd.sqrt(history + self.float_stable_eps) - weight[:] -= div - - def test_group_adagrad(): mx.random.seed(0) - opt1 = PyGroupAdaGrad + opt1 = mx.optimizer.contrib.GroupAdaGrad opt2 = mx.optimizer.contrib.GroupAdaGrad - shape = (3, 4) - eps_options = [{}, {'eps': 1e-8}] + shapes = [(3, 4), [5, 6]] + eps_options = [{}, {'epsilon': 1e-8}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float32]: - for options in itertools.product(eps_options, cg_options, rg_options): + for options in itertools.product(eps_options, cg_options, rg_options, agg_options): kwarg = dict(wd=0.0) for option in options: kwarg.update(option) compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, - dtype, - compare_states=False) + opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, + dtype) compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, + opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, w_stype='row_sparse', - g_stype='row_sparse', - compare_states=False) + g_stype='row_sparse') compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, + opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, - g_stype='row_sparse', - compare_states=False) + g_stype='row_sparse') @with_seed() diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 6d7cf40f29f7..6137fd9d65df 100755 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -72,145 +72,51 @@ def test_lr_wd_mult(): args2 = {k: v.asnumpy() for k, v in args2.items()} assert mod._optimizer.lr_mult == {'fc1_bias': 1.0, 'fc1_weight': 0.0} - assert mod._optimizer.wd_mult == {'fc2_bias': 0.5, 'fc2_weight': 0.5, 'fc1_bias': 0.0} + assert mod._optimizer.wd_mult == {'fc2_bias': 0.5, 'fc2_weight': 0.5} assert mx.test_utils.almost_equal(args1['fc1_weight'], args2['fc1_weight'], 1e-10) assert not mx.test_utils.almost_equal(args1['fc1_bias'], args2['fc1_bias'], 1e-1) assert not mx.test_utils.almost_equal(args1['fc2_weight'], args2['fc2_weight'], 1e-1) -# SGD - -class PySGD(mx.optimizer.Optimizer): - """python reference implemenation of sgd""" - def __init__(self, learning_rate=0.01, momentum=0.0, multi_precision=False, **kwargs): - super(PySGD, self).__init__(learning_rate=learning_rate, **kwargs) - self.momentum = momentum - self.multi_precision = multi_precision - - def create_state(self, index, weight): - """Create additional optimizer state: momentum - - Parameters - ---------- - weight : NDArray - The weight data - - """ - momentum = None - weight_master_copy = 
None - do_multi_precision = self.multi_precision and weight.dtype == np.float16 - if do_multi_precision: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=np.float32) - weight_master_copy = array(weight, ctx=weight.context, dtype=np.float32) - return (momentum, weight_master_copy) - else: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) - return momentum - - def create_state_multi_precision(self, index, weight): - return self.create_state(index, weight) - - def update(self, index, weight, grad, state): - """Update the parameters. - - Parameters - ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray - - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. - """ - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - use_multi_precision = isinstance(state, list) or isinstance(state, tuple) - - if not use_multi_precision: - if self.momentum == 0.0: - if self.clip_gradient is not None: - weight[:] = ((1 - lr*wd)*weight - - lr*mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - else: - weight[:] = (1 - lr*wd)*weight - lr*self.rescale_grad*grad - else: - mom = state - if self.clip_gradient is not None: - mom[:] = (self.momentum*mom - lr*wd*weight - - lr*mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - weight += mom - else: - mom[:] = self.momentum*mom - lr*wd*weight - lr*self.rescale_grad*grad - weight += mom - else: - grad32 = array(grad, ctx=grad.context, dtype=np.float32) - mom = state[0] - weight32 = state[1] - if self.momentum == 0.0: - if self.clip_gradient is not None: - weight32[:] = ((1 - lr*wd)*weight32 - - lr*mx.nd.clip(grad32*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - else: - weight32[:] = (1 - lr*wd)*weight32 - lr*self.rescale_grad*grad32 - else: - if self.clip_gradient is not None: - mom[:] = (self.momentum*mom - lr*wd*weight32 - - lr*mx.nd.clip(grad32*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - weight32 += mom - else: - mom[:] = self.momentum*mom - lr*wd*weight32 - lr*self.rescale_grad*grad32 - weight32 += mom - tmp = weight32.astype(weight.dtype) - tmp.copyto(weight) - - def update_multi_precision(self, index, weight, grad, state): - self.update(index, weight, grad, state) @with_seed() def test_sgd(): - opt1 = PySGD + opt1 = mx.optimizer.SGD opt2 = mx.optimizer.SGD - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - for dtype in [np.float16, np.float32, np.float64]: - for mom_option in mom_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(mom_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - if dtype == np.float16: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3) - else: - compare_optimizer(opt1(**kwarg), 
opt2(**kwarg), shape, dtype) - # test operator fallback on cpu - if dtype != np.float16: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape[:2], - dtype, w_stype='csr', g_stype='csr') + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + + for dtype in [np.float16, np.float32]: + for params in itertools.product(mom_options, cg_options, rg_options, + wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + if dtype == np.float16: + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-4) + else: + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype) + # test operator fallback on cpu + if dtype != np.float16: + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + [shapes[0][:2], shapes[1]], + dtype, w_stype='csr', g_stype='csr') + class PySparseSGD(mx.optimizer.Optimizer): """python reference implemenation of sgd""" - def __init__(self, learning_rate=0.01, momentum=0.0, **kwargs): + def __init__(self, learning_rate=0.1, momentum=0.0, **kwargs): super(PySparseSGD, self).__init__(learning_rate=learning_rate, **kwargs) self.momentum = momentum @@ -228,478 +134,240 @@ def create_state(self, index, weight): else: return mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) - def update(self, index, weight, grad, state): - """Update the parameters. + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. Parameters ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray - - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
""" - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - num_rows = weight.shape[0] - if self.momentum == 0.0: - # Update on a per row basis, skip all-zero rows - for row in range(num_rows): - grad_row = grad[row].asnumpy() - all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) - if all_zeros: - continue - if self.clip_gradient is not None: - weight[row] = ((1 - lr*wd)*weight[row] - - lr*mx.nd.clip(grad[row]*self.rescale_grad, - -self.clip_gradient, self.clip_gradient)) - else: - weight[row] = (1 - lr*wd)*weight[row] - lr*self.rescale_grad*grad[row] - else: - mom = state - for row in range(num_rows): - grad_row = grad[row].asnumpy() - all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) - if all_zeros: - continue - if self.clip_gradient is not None: - mom[row] = (self.momentum*mom[row] - lr*wd*weight[row] - - lr*mx.nd.clip(grad[row]*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - weight[row] += mom[row] - else: - mom[row] = self.momentum*mom[row] - lr*wd*weight[row] - lr*self.rescale_grad*grad[row] - weight[row] += mom[row] + for index, weight, grad, state in zip(indices, weights, grads, states): + lr = self._get_lr(index) + wd = self._get_wd(index) + self._update_count(index) + num_rows = weight.shape[0] + if self.momentum == 0.0: + # Update on a per row basis, skip all-zero rows + for row in range(num_rows): + grad_row = grad[row].asnumpy() + all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + grad[row] = mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient) + grad[row] += wd * weight[row] + weight[row] -= lr * grad[row] + else: + mom = state + for row in range(num_rows): + grad_row = grad[row].asnumpy() + all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + grad[row] = mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient) + grad[row] += wd * weight[row] + mom[row] *= self.momentum + mom[row] -= lr * grad[row] + weight[row] += mom[row] + @with_seed() def test_sparse_sgd(): opt1 = PySparseSGD opt2 = mx.optimizer.SGD - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float32]: - for mom_option in mom_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(mom_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - w_stype='row_sparse', g_stype='row_sparse') - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - w_stype='default', g_stype='row_sparse') + for params in itertools.product(mom_options, cg_options, rg_options, + wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + compare_optimizer(opt1(**kwarg), + opt2(use_fused_step=True, 
lazy_update=True, **kwarg), shapes, dtype, + w_stype='row_sparse', g_stype='row_sparse') + compare_optimizer(opt1(**kwarg), + opt2(use_fused_step=True, lazy_update=True, **kwarg), shapes, dtype, + w_stype='default', g_stype='row_sparse') @with_seed() def test_std_sparse_sgd(): - opt1 = PySGD + opt1 = mx.optimizer.SGD opt2 = mx.optimizer.SGD - shape = (3, 4, 5) - mom_options = [{'momentum': 0.0}, {'momentum': 0.9}] + shapes = [(3, 4, 5), (10, 4), (7,)] + mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - for dtype in [np.float32]: - for mom_option in mom_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - kwarg = {} - kwarg.update(mom_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - compare_optimizer(opt1(**kwarg), opt2(lazy_update=False, **kwarg), shape, dtype, - w_stype='row_sparse', g_stype='row_sparse') - compare_optimizer(opt1(**kwarg), opt2(lazy_update=False, **kwarg), shape, dtype, - w_stype='default', g_stype='row_sparse') - - -class PyNAG(PySGD): - def __init__(self, **kwargs): - super(PyNAG, self).__init__(**kwargs) + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] - def create_state(self, index, weight): - """Create additional optimizer state: momentum - - Parameters - ---------- - weight : NDArray - The weight data - - """ - momentum = None - weight_master_copy = None - do_multi_precision = self.multi_precision and weight.dtype == np.float16 - if do_multi_precision: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=np.float32) - weight_master_copy = array(weight, ctx=weight.context, dtype=np.float32) - return (momentum, weight_master_copy) - else: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) - return momentum - - def create_state_multi_precision(self, index, weight): - return self.create_state(index, weight) - - def update(self, index, weight, grad, state): - """Update the parameters. - - Parameters - ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray + for dtype in [np.float32]: + for params in itertools.product(mom_options, cg_options, rg_options, + wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, dtype, + w_stype='row_sparse', g_stype='row_sparse') + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, dtype, + w_stype='default', g_stype='row_sparse') - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. 
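With the PyNAG reference implementation removed here, test_nag now compares mxnet.optimizer.NAG's pure Python step (use_fused_step=False) against its own fused kernel. For readers who want the reference math, the update both paths implement is consistent with the rewritten NAGMomKernel earlier in this patch; a small NumPy sketch with illustrative defaults:

    import numpy as np

    def nag_step(weight, mom, grad, lr=0.01, momentum=0.9, wd=0.0,
                 rescale_grad=1.0, clip_gradient=None):
        # same preprocessing order as the rewritten kernels
        g = rescale_grad * grad
        if clip_gradient is not None:
            g = np.clip(g, -clip_gradient, clip_gradient)
        g += wd * weight
        mom[:] = momentum * mom - lr * g
        # Nesterov look-ahead: apply momentum*mom plus the current gradient step
        weight[:] += momentum * mom - lr * g

    w = np.ones(4, dtype=np.float32)
    m = np.zeros_like(w)
    nag_step(w, m, np.full(4, 0.3, dtype=np.float32))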
- """ - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - use_multi_precision = isinstance(state, list) or isinstance(state, tuple) - if not use_multi_precision: - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - if self.momentum == 0.0: - weight[:] += -lr * (grad + wd * weight) - else: - mom = state - weight[:] += (self.momentum**2 * mom) - lr*(self.momentum + 1)*(grad + wd*weight) - mom[:] = (self.momentum*mom) - lr*(grad + wd*weight) - else: - grad32 = array(grad, ctx=grad.context, dtype=np.float32) - grad32 = grad32 * self.rescale_grad - if self.clip_gradient is not None: - grad32 = mx.nd.clip(grad32, -self.clip_gradient, self.clip_gradient) - mom = state[0] - weight32 = state[1] - if self.momentum == 0.0: - weight32[:] += -lr * (grad32 + wd * weight32) - else: - weight32[:] += (self.momentum**2 * mom) - lr*(self.momentum+1)*(grad32 + wd*weight32) - mom[:] = (self.momentum*mom) - lr*(grad32 + wd*weight32) - tmp = weight32.astype(weight.dtype) - tmp.copyto(weight) @with_seed() def test_nag(): - opt1 = PyNAG + opt1 = mx.optimizer.NAG opt2 = mx.optimizer.NAG - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] - for dtype in [np.float16, np.float32, np.float64]: + for dtype in [np.float16, np.float32]: for params in itertools.product(mom_options, cg_options, rg_options, - wd_options, mp_options): + wd_options, mp_options, agg_options): kwarg = {k: v for param in params for k, v in param.items()} if (dtype == np.float16 and ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): + not kwarg['multi_precision'])): continue - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3, atol=1e-4) + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-4) -# LAMB optimizer -class PyLAMB(mx.optimizer.Optimizer): - """ - Python reference implementation of LAMB optimizer. - """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, - lower_bound=None, upper_bound=None, bias_correction=True, **kwargs): - super(PyLAMB, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.lower_bound = lower_bound - self.upper_bound = upper_bound - self.bias_correction = bias_correction - - def create_state(self, index, weight): - stype = weight.stype - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype), - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype)) - - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - mean, var = state - grad *= self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - - mean[:] = self.beta1 * mean + (1. - self.beta1) * grad - var[:] = self.beta2 * var + (1. 
- self.beta2) * mx.nd.square(grad) - - mean_hat = mean - var_hat = var - r1 = weight.norm() - if self.lower_bound: - r1 = mx.nd.maximum(r1, self.lower_bound) - if self.upper_bound: - r1 = mx.nd.minimum(r1, self.upper_bound) - if self.bias_correction: - mean_hat = mean / (1. - mx.nd.power(self.beta1, t)) - var_hat = var / (1. - mx.nd.power(self.beta2, t)) - - g = mean_hat / (mx.nd.sqrt(var_hat) + self.epsilon) + wd * weight - - r2 = g.norm() - # calculate lamb_trust_ratio - r = 1. if r1 == 0. or r2 == 0. else r1 / r2 - lr *= r - # update weight - weight[:] -= lr * g +@with_seed() +def test_lars(): + opt1 = mx.optimizer.LARS + opt2 = mx.optimizer.LARS + shapes = [(3, 4, 5), (10, 4), (7,)] + eta_options = [{}, {'eta': 0.002}, {'eta': 0.01}] + mom_options = [{}, {'momentum': 0.0}, {'momentum': 0.9}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(eta_options, mom_options, cg_options, rg_options, + wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-3) @with_seed() def test_lamb(): - opt1 = PyLAMB + opt1 = mx.optimizer.LAMB opt2 = mx.optimizer.LAMB - shape = (3, 4, 5) + + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] bc_options = [{}, {'bias_correction': False}, {'bias_correction': True}] lb_options = [{}, {'lower_bound': None}, {'lower_bound': 1e-3}] ub_options = [{}, {'upper_bound': None}, {'upper_bound': 10}] - for params in itertools.product(cg_options, rg_options, wd_options, bc_options, lb_options, ub_options): - kwarg = {k: v for param in params for k, v in param.items()} - kwarg['multi_precision'] = False - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) - kwarg['multi_precision'] = True - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float16, rtol=1e-3, atol=1e-3) - -@with_seed() -def test_multilamb(): - opt1 = PyLAMB - opt2 = mx.optimizer.LAMB - - # shapes as Bert-large - dims_x = [1024, 4096, 1024, 1024] - dims_y = [1, 1, 1024, 4096] - dims_occurrences = [9, 1, 4, 2] - nlayers = 4 # 24 - # extra_dims_x=[30522, 512, 30522] - # extra_dims_y=[1, 1024, 1024] - shapes=[] - for l in range(nlayers): - for i, (dx,dy) in enumerate(zip(dims_x, dims_y)): - for j in range(dims_occurrences[i]): - shapes.append((dx,dy)) - # for dx,dy in zip(extra_dims_x, extra_dims_y): - # shapes.append((dx,dy)) - - cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] - rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - bias_options = [{'bias_correction': False}, {'bias_correction': True}] - - for dtype in 
[np.float16, np.float32, np.float64]: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for bias_option in bias_options: - kwarg = {} - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(bias_option) - if (dtype == np.float16): - kwarg.update({'multi_precision': True}) - atol = 1e-3 - rtol = 1e-6 - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype, - rtol=rtol, atol=atol, ntensors=len(shapes)) - -#SGLD -class PySGLD(mx.optimizer.Optimizer): - """python reference implementation of SGLD""" - - def __init__(self, **kwargs): - super(PySGLD, self).__init__(**kwargs) - - def create_state(self, index, weight): - return None - - def update(self, index, weight, grad, state): - assert(isinstance(weight, mx.nd.NDArray)) - assert(isinstance(grad, mx.nd.NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - weight[:] += - lr/2 * (grad + wd * weight) + mx.random.normal(0, math.sqrt(lr), shape=weight.shape, - dtype=weight.dtype, ctx=weight.context) - + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, rg_options, + wd_options, bc_options, lb_options, ub_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-3) @with_seed() def test_sgld(): - opt1 = PySGLD + opt1 = mx.optimizer.SGLD opt2 = mx.optimizer.SGLD - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] ns_options = [1234, 42] - cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - - - def compare_optimizer_noise_seeded(opt1, opt2, shape, dtype, noise_seed, - w_stype='default', g_stype='default', - rtol=1e-4, atol=1e-5, compare_states=True): - """Compare opt1 and opt2 with the added functionality that the seed for generating random noise - in the SGLD optimizer update is set so that the same noise is used in opt1 and opt2. 
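Editor's note: the deleted `PySGLD` reference above applies a half-step gradient update plus Gaussian noise with standard deviation `sqrt(lr)`, which is why the seeded comparison helper only works when the RNG is re-seeded before each optimizer's update. A NumPy sketch of the same arithmetic (example scalars; the real code uses `mx.nd` and `mx.random.normal`):

```python
# NumPy sketch of the SGLD rule from the deleted PySGLD reference.
import numpy as np

def sgld_step(weight, grad, lr, wd, seed):
    rng = np.random.default_rng(seed)                 # fixed seed -> reproducible noise
    noise = rng.normal(0.0, np.sqrt(lr), size=weight.shape)
    return weight - lr / 2 * (grad + wd * weight) + noise

w = np.zeros(3)
g = np.full(3, 0.5)
w1 = sgld_step(w, g, lr=0.01, wd=0.0, seed=42)
w2 = sgld_step(w, g, lr=0.01, wd=0.0, seed=42)
assert np.allclose(w1, w2)                            # same seed, same noise, same update
```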
- - """ - if w_stype == 'default': - w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - w1 = w2.copyto(default_context()) - elif w_stype == 'row_sparse' or w_stype == 'csr': - w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) - w1 = w2.copyto(default_context()).tostype('default') - else: - raise Exception("type not supported yet") - if g_stype == 'default': - g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - g1 = g2.copyto(default_context()) - elif g_stype == 'row_sparse' or g_stype == 'csr': - g2 = rand_ndarray(shape, g_stype, dtype=dtype) - g1 = g2.copyto(default_context()).tostype('default') - else: - raise Exception("type not supported yet") - - state1 = opt1.create_state_multi_precision(0, w1) - state2 = opt2.create_state_multi_precision(0, w2) - if compare_states: - compare_ndarray_tuple(state1, state2) - - # set seed for Gaussian noise replication - mx.random.seed(noise_seed) - opt1.update_multi_precision(0, w1, g1, state1) - mx.random.seed(noise_seed) - opt2.update_multi_precision(0, w2, g2, state2) - if compare_states: - compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) - assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=rtol, atol=atol) + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for seed in ns_options: - for dtype in [np.float16, np.float32, np.float64]: - for params in itertools.product(cg_options, wd_options, mp_options): + for dtype in [np.float16, np.float32]: + for params in itertools.product(cg_options, wd_options, mp_options, agg_options): kwarg = {k: v for param in params for k, v in param.items()} - if (dtype == np.float16 and ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): + if (dtype == np.float16 and ('multi_precision' not in kwarg + or not kwarg['multi_precision'])): continue atol = 1e-2 if dtype == np.float16 else 1e-3 rtol = 1e-4 if dtype == np.float16 else 1e-5 - compare_optimizer_noise_seeded(opt1(**kwarg), opt2(**kwarg), shape, dtype, seed, atol=atol, rtol=rtol) - - - -# FTML + compare_optimizer_noise_seeded(opt1(**kwarg), + opt2(**kwarg), + shapes, dtype, seed, atol=atol, rtol=rtol) -class PyFTML(mx.optimizer.Optimizer): - """python reference implemenation of FTML""" - def __init__(self, beta1=0.6, beta2=0.999, epsilon=1e-8, **kwargs): - super(PyFTML, self).__init__(**kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - - def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # d_0 - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # v_0 - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # z_0 - - def update(self, index, weight, grad, state): - assert(isinstance(weight, mx.nd. 
NDArray)) - assert(isinstance(grad, mx.nd.NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - # get previous states - prev_d, prev_v, prev_z = state - # compute states - v_t = self.beta2 * prev_v + (1 - self.beta2) * mx.nd.square(grad) - d_t = (1 - pow(self.beta1, t)) / lr * (mx.nd.sqrt(v_t / (1 - pow(self.beta2, t))) + self.epsilon) - sigma_t = d_t - self.beta1 * prev_d - z_t = self.beta1 * prev_z + (1 - self.beta1) * grad - sigma_t * weight - # update weight - weight[:] = - z_t / d_t - # update states - prev_d[:] = d_t - prev_v[:] = v_t - prev_z[:] = z_t @with_seed() def test_ftml(): - opt1 = PyFTML + opt1 = mx.optimizer.FTML opt2 = mx.optimizer.FTML - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - for dtype in [np.float32]: - for beta1_option in beta1_options: - for beta2_option in beta2_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - kwarg = {} - kwarg.update(beta1_option) - kwarg.update(beta2_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3, atol=1e-4) - - -# ADAM -class PyAdam(mx.optimizer.Optimizer): - """python reference implemenation of adam""" + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-4) + + +# Sparse ADAM +class PySparseAdam(mx.optimizer.Optimizer): + """python reference implemenation of sparse adam""" def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - lazy_update=True, **kwargs): - super(PyAdam, self).__init__(learning_rate=learning_rate, **kwargs) + lazy_update=False, **kwargs): + super(PySparseAdam, self).__init__(learning_rate=learning_rate, **kwargs) self.beta1 = beta1 self.beta2 = beta2 self.epsilon = epsilon @@ -717,391 +385,212 @@ def create_state(self, index, weight): return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # mean mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - def update(self, index, weight, grad, state): - """Update the parameters. + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. Parameters ---------- - index : int - An unique integer key used to index the parameters + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. 
Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] - weight : NDArray - weight ndarray + mean, variance = state + num_rows = weight.shape[0] - grad : NDArray - grad ndarray + coef1 = 1. - self.beta1 ** t + coef2 = 1. - self.beta2 ** t + lr *= math.sqrt(coef2) / coef1 - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. - """ - lr = self._get_lr(index) - self._update_count(index) - - t = self._index_update_count[index] - mean, variance = state - - wd = self._get_wd(index) - num_rows = weight.shape[0] - coef1 = 1. - self.beta1**t - coef2 = 1. - self.beta2**t - lr *= math.sqrt(coef2)/coef1 - for row in range(num_rows): - # check row slices of all zeros - all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) - # skip zeros during lazy update - if all_zeros and self.lazy_update: - continue - grad[row] = grad[row] * self.rescale_grad + wd * weight[row] - # clip gradients - if self.clip_gradient is not None: - mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) - # update mean - mean[row] *= self.beta1 - mean[row] += grad[row] * (1. - self.beta1) - # update variance - variance[row] *= self.beta2 - variance[row] += (1 - self.beta2) * mx.nd.square(grad[row], out=grad[row]) - # update weight - weight[row] -= lr*mean[row]/(mx.nd.sqrt(variance[row]) + self.epsilon) + for row in range(num_rows): + # check row slices of all zeros + all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), + np.zeros_like(grad[row].asnumpy())) + # skip zeros during lazy update + if all_zeros and self.lazy_update: + continue + grad[row] *= self.rescale_grad + # clip gradients + if self.clip_gradient is not None: + mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + grad[row] += wd * weight[row] + # update mean + mean[row] *= self.beta1 + mean[row] += grad[row] * (1. 
- self.beta1) + # update variance + variance[row] *= self.beta2 + variance[row] += (1 - self.beta2) * mx.nd.square(grad[row], out=grad[row]) + # update weight + weight[row] -= lr * mean[row] / (mx.nd.sqrt(variance[row]) + self.epsilon) @with_seed() def test_adam(): - opt1 = PyAdam + opt1 = mx.optimizer.Adam opt2 = mx.optimizer.Adam - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - for dtype in [np.float16, np.float32, np.float64]: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - # atol 2e-5 needed to pass with seed 1248389097 - compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(**kwarg), shape, dtype, - rtol=1e-4, atol=2e-5) - # atol 2e-5 needed to pass with seed 781809840 - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, - dtype, w_stype='row_sparse', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(lazy_update=False, **kwarg), shape, - dtype, w_stype='row_sparse', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, - dtype, w_stype='default', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(lazy_update=False, **kwarg), shape, - dtype, w_stype='default', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - - -# AdaMax -class PyAdamax(mx.optimizer.Optimizer): - """The python reference of AdaMax optimizer. - - This class implements the AdaMax optimizer, one variant of Adam based on the infinity norm, - available at http://arxiv.org/abs/1412.6980 Section 7. - - The optimizer updates the weight by:: - grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - m = beta1 * m_t + (1 - beta1) * grad - u = maximum(beta2 * u, abs(grad)) - weight -= lr / (1 - beta1**t) * m / u - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. - """ - def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, **kwargs): - super(PyAdamax, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - - def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # mean - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - lr /= (1. 
- self.beta1**t) - - # preprocess grad - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + # atol 2e-5 needed to pass with seed 1248389097 + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=1e-4, atol=2e-5) - # update m_t and u_t - m_t, u_t = state - m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad - u_t[:] = mx.nd.maximum(self.beta2 * u_t, mx.nd.abs(grad)) - # update weight - weight[:] -= lr * m_t / u_t +@with_seed() +def test_sparse_adam(): + opt1 = PySparseAdam + opt2 = mx.optimizer.Adam + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + # atol 2e-5 needed to pass with seed 1248389097 + compare_optimizer(opt1(lazy_update=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, dtype, + rtol=1e-4, atol=2e-5) + # atol 2e-5 needed to pass with seed 781809840 + compare_optimizer(opt1(lazy_update=True, **kwarg), + opt2(use_fused_step=True, lazy_update=True, **kwarg), shapes, + dtype, w_stype='row_sparse', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) + compare_optimizer(opt1(lazy_update=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, + dtype, w_stype='row_sparse', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) + compare_optimizer(opt1(lazy_update=True, **kwarg), + opt2(use_fused_step=True, lazy_update=True, **kwarg), shapes, + dtype, w_stype='default', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) + compare_optimizer(opt1(lazy_update=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, + dtype, w_stype='default', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) @with_seed() def test_adamax(): - opt1 = PyAdamax + opt1 = mx.optimizer.Adamax opt2 = mx.optimizer.Adamax - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] mp_options = [{}, {'multi_precision': False}, 
{'multi_precision': True}] - for dtype in [np.float16, np.float32, np.float64]: - for params in itertools.product(cg_options, rg_options, wd_options, mp_options): + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): kwarg = {k: v for param in params for k, v in param.items()} if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): continue - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - - -# Signum -class PySignum(mx.optimizer.Optimizer): - """The python reference of Signum optimizer. - - The optimizer updates the weight by: - - rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight - state = momentum * state + (1-momentum)*rescaled_grad - weight = (1 - lr * wd_lh) * weight - lr * sign(state) + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype) - See the original paper at: https://jeremybernste.in/projects/amazon/signum.pdf - - For details of the update algorithm see - :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - wd_lh : float, optitional - The amount of decoupled weight decay regularization. - """ - def __init__(self, learning_rate=0.01, momentum=0.9, wd_lh = 0.0, **kwargs): - super(PySignum, self).__init__(learning_rate = learning_rate, **kwargs) - self.momentum = momentum - self.wd_lh = wd_lh - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) - return momentum - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - if state is not None: - mom = state - if self.clip_gradient is not None: - mom[:] = (self.momentum*mom - (1-self.momentum)*(wd*weight + - mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient))) - else: - mom[:] = self.momentum*mom - (1-self.momentum)*wd*weight - (1-self.momentum)*self.rescale_grad*grad - weight[:] = (1 - lr*self.wd_lh)*weight + lr*mx.nd.sign(mom) - else: - weight[:] = (1 - lr*(wd+self.wd_lh))*weight - lr*mx.nd.sign(grad) @with_seed() def test_signum(): - opt1 = PySignum + opt1 = mx.optimizer.Signum opt2 = mx.optimizer.Signum - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] wd_lh_options = [{}, {'wd_lh': 0.015}, {'wd_lh': 0.0}] mom_options = [{}, {'momentum': 0.9}] lr_options = [{'learning_rate': 0.05},{'learning_rate': 0.01}] - for dtype in [np.float32, np.float64]: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in wd_lh_options: - for lr_option in lr_options: - for mom_option in mom_options: - kwarg = {} - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - kwarg.update(lr_option) - kwarg.update(mom_option) - 
compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - - -# RMSProp -class PyRMSProp(mx.optimizer.Optimizer): - """RMSProp optimizer of Tieleman & Hinton, 2012, - - For centered=False, the code follows the version in - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by - Tieleman & Hinton, 2012 - - For centered=True, the code follows the version in - http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. - - Parameters - ---------- - learning_rate : float, optional - Step size. - Default value is set to 0.001. - gamma1: float, optional - decay factor of moving average for gradient, gradient^2. - Default value is set to 0.9. - gamma2: float, optional - "momentum" factor. - Default value if set to 0.9. - Only used if centered=True - epsilon : float, optional - Default value is set to 1e-8. - centered : boolean, optional - Use Graves or Tielemans & Hintons version of RMSProp - wd : float, optional - L2 regularization coefficient add to all the weights - rescale_grad : float, optional - rescaling factor of gradient. - clip_gradient : float, optional - clip gradient in range [-clip_gradient, clip_gradient] - clip_weights : float, optional - clip weights in range [-clip_weights, clip_weights] - - """ - def __init__(self, learning_rate=0.001, gamma1=0.9, gamma2=0.9, - epsilon=1e-8, centered=False, clip_weights=None, **kwargs): - super(PyRMSProp, self).__init__(learning_rate=learning_rate, **kwargs) - self.centered = centered - self.gamma1 = gamma1 - self.gamma2 = gamma2 - self.epsilon = epsilon - self.clip_weights = clip_weights - - def create_state(self, index, weight): - """Create additional optimizer state. - - For centered=False: n - For centered=True: n, g, delta - - Parameters - ---------- - weight : NDArray - The weight data - """ - if self.centered: - return (mx.nd.zeros(weight.shape, weight.context), # n - mx.nd.zeros(weight.shape, weight.context), # g - mx.nd.zeros(weight.shape, weight.context)) # delta - else: - return (mx.nd.zeros(weight.shape, weight.context), ) # n - - def update(self, index, weight, grad, state): - """Update the parameters. - - Parameters - ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray - - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. 
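Editor's note: the reference update body that follows implements both the Tieleman & Hinton variant and the Graves (centered) variant. A NumPy sketch of the centered recurrences, mirroring the removed code with example scalars:

```python
# NumPy sketch of the centered RMSProp recurrences from the removed reference update.
import numpy as np

lr, gamma1, gamma2, epsilon, wd = 0.001, 0.9, 0.9, 1e-8, 0.0
weight = np.ones(4)
grad = np.full(4, 0.5) + wd * weight            # wd is folded into the gradient, as in the reference
n = np.zeros(4); g = np.zeros(4); delta = np.zeros(4)

n = (1 - gamma1) * grad * grad + gamma1 * n     # running mean of squared gradient
g = (1 - gamma1) * grad + gamma1 * g            # running mean of gradient (centering term)
delta = gamma2 * delta - lr * grad / np.sqrt(n - g * g + epsilon)
weight += delta
```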
- """ - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - grad = grad * self.rescale_grad + wd * weight - - if not self.centered: - (n, ) = state - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n - weight[:] -= lr * grad/(mx.nd.sqrt(n + self.epsilon)) - - else: - n, g, delta = state - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n - g[:] = (1 - self.gamma1) * grad + self.gamma1 * g - delta[:] = (self.gamma2) * delta - lr * grad/(mx.nd.sqrt(n - g*g + self.epsilon)) - weight[:] += delta + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(cg_options, rg_options, wd_options, + wd_lh_options, mom_options, lr_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype) - if self.clip_weights: - mx.ndarray.clip(weight, -self.clip_weights, self.clip_weights, out=weight) @with_seed() def test_rms(): - opt1 = PyRMSProp + opt1 = mx.optimizer.RMSProp opt2 = mx.optimizer.RMSProp - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] + rho_options = [{}, {'rho': 0.5}, {'rho': 0.7}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] cw_options = [{}, {'clip_weights': 0.01}] center_options = [{}, {'centered': False}, {'centered': True}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mom_options = [{}, {'momentum': 0.0}, {'momentum': 0.9}] mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float16, np.float32]: # Reduce foating point compare tolerance to avoid flaky test failure. rtol, atol = (1e-1, 1e-1) if dtype is np.float16 else (1e-2, 1e-2) - for cw_option in cw_options: - for cg_option in cg_options: - for center_option in center_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(cw_option) - kwarg.update(cg_option) - kwarg.update(center_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=rtol, atol=atol) - if (default_context() == mx.cpu()): - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, g_stype='row_sparse', rtol=rtol, atol=atol) - -class PyFtrl(mx.optimizer.Optimizer): - """The Ftrl optimizer. 
+ for params in itertools.product(rho_options, cg_options, cw_options, + center_options, rg_options, wd_options, + mom_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=rtol, atol=atol) + if default_context() == mx.cpu(): + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, g_stype='row_sparse', rtol=rtol, atol=atol) + + +class PySparseFtrl(mx.optimizer.Optimizer): + """python reference implemenation of sparse Ftrl optimizer. Referenced from *Ad Click Prediction: a View from the Trenches*, available at http://dl.acm.org/citation.cfm?id=2488200. @@ -1119,224 +608,290 @@ class PyFtrl(mx.optimizer.Optimizer): \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^tg_{s,i}^t}} """ - def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, lazy_update=False, **kwargs): - super(PyFtrl, self).__init__(**kwargs) + def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, **kwargs): + super(PySparseFtrl, self).__init__(**kwargs) self.lamda1 = lamda1 self.beta = beta self.lr = learning_rate - self.lazy_update = lazy_update def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # dn + return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # z mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # n - def update(self, index, weight, grad, state): - self._update_count(index) - wd = self._get_wd(index) - lr = self._get_lr(index) - num_rows = weight.shape[0] + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. - dn, n = state - for row in range(num_rows): - all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) - if all_zeros and self.lazy_update: - continue - grad[row] = grad[row] * self.rescale_grad - if self.clip_gradient is not None: - mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + wd = self._get_wd(index) + lr = self._get_lr(index) + num_rows = weight.shape[0] - #update dn, n - dn[row] += grad[row] - (mx.nd.sqrt(n[row] + grad[row] * grad[row]) - mx.nd.sqrt(n[row])) * weight[row] / lr - n[row] += grad[row] * grad[row] + z, n = state + for row in range(num_rows): + all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + + # update z[row], n[row] + sigma = - mx.nd.sqrt(n[row]) + n[row] += mx.nd.square(grad[row]) + denom = mx.nd.sqrt(n[row]) + sigma += denom + sigma /= lr + z[row] += grad[row] - sigma * weight[row] + + # update weight + denom += self.beta + denom /= lr + denom += wd + d = mx.nd.sign(z[row]) * mx.nd.maximum(mx.nd.abs(z[row]) - self.lamda1, 0) + weight[row] = - d / denom - # update weight - weight[row] = (mx.nd.sign(dn[row]) * self.lamda1 - dn[row]) / \ - ((self.beta + mx.nd.sqrt(n[row])) / lr + wd) * (mx.nd.abs(dn[row]) > self.lamda1) @with_seed() def test_ftrl(): - opt1 = PyFtrl + opt1 = mx.optimizer.Ftrl opt2 = mx.optimizer.Ftrl - shape = (3, 4, 5) - kwargs = [{}, - {'clip_gradient': 0.5}, - {'clip_gradient': 0.4, 'rescale_grad': 0.14}, - {'rescale_grad': 0.8}, - {'clip_gradient': 0.5, 'wd': 0.07}, - {'clip_gradient': 0.4, 'rescale_grad': 0.14, 'wd': 0.03}, - {'rescale_grad': 0.8, 'wd': 0.05}, - {'rescale_grad': 0.8, 'wd': 0.05, 'lamda1': 0.01}, - {'clip_gradient': 0.5, 'wd': 0.07, 'lamda1': 1.0}] - for kwarg in kwargs: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) - compare_optimizer(opt1(lazy_update=True, **kwarg), opt2(**kwarg), shape, - np.float32, w_stype='row_sparse', g_stype='row_sparse') + shapes = [(3, 4, 5), (10, 4), (7,)] + lamda1_options = [{}, {'lamda1': 0.}, {'lamda1': 0.1}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(lamda1_options, cg_options, + rg_options, wd_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=1e-4, atol=1e-4) + + +@with_seed() +def test_sparse_ftrl(): + opt1 = PySparseFtrl + opt2 = mx.optimizer.Ftrl + shapes = [(3, 4, 5), (10, 4), (7,)] + lamda1_options = [{}, {'lamda1': 0.}, {'lamda1': 0.1}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(lamda1_options, cg_options, + rg_options, 
wd_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, + dtype, w_stype='row_sparse', g_stype='row_sparse', + rtol=1e-4, atol=1e-4) + @with_seed() def test_nadam(): + opt1 = mx.optimizer.Nadam + opt2 = mx.optimizer.Nadam + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] + schedule_decay_options = [{}, {'schedule_decay': 0.008}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + schedule_decay_options, rg_options, wd_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype) - def get_net(num_hidden, flatten=True): - data = mx.symbol.Variable('data') - fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128, flatten=flatten) - act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64, flatten=flatten) - act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=num_hidden, flatten=flatten) - return fc3 - - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 10)) - label = mx.random.uniform(-1, 1, shape=(N, 1)) - data_iter = mx.io.NDArrayIter(data, label, batch_size=5, label_name='label', shuffle=True) - output = get_net(1) - l = mx.symbol.Variable('label') - Loss = gluon.loss.L1Loss() - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=60, optimizer_params={'learning_rate': 0.001, 'wd': 0.0005}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), - optimizer='nadam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.11 - -# AdaGrad -class PyAdaGrad(mx.optimizer.Optimizer): - """The python reference of AdaGrad optimizer. + +class PySparseAdaGrad(mx.optimizer.Optimizer): + """python reference implemenation of sparse Adagrad optimizer. This class implements the AdaGrad optimizer described in *Adaptive Subgradient Methods for Online Learning and Stochastic Optimization*, and available at http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. - Updates are applied by:: - - rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - history = history + square(rescaled_grad) - w = w - learning_rate * rescaled_grad / sqrt(history + epsilon) - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - Parameters ---------- - eps: float, optional + learning_rate : float, default 0.01 + The initial learning rate. 
If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + epsilon : float, default 1e-6 Small value to avoid division by 0. - """ - def __init__(self, eps=1e-7, **kwargs): - super(PyAdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps + + def __init__(self, learning_rate=0.01, epsilon=1e-6, **kwargs): + super(PySparseAdaGrad, self).__init__(learning_rate=learning_rate, + **kwargs) + self.epsilon = epsilon def create_state(self, index, weight): - return mx.nd.zeros(weight.shape, weight.context, stype=weight.stype) + return mx.nd.zeros(weight.shape, weight.context, stype=weight.stype) # history + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + wd = self._get_wd(index) + lr = self._get_lr(index) + num_rows = weight.shape[0] + + history = state + for row in range(num_rows): + all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + grad[row] += wd * weight[row] + + # update history[row] + history[row] += mx.nd.square(grad[row]) + denom = mx.nd.sqrt(history[row]) + denom += self.epsilon - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) + # update weight + weight[row] -= lr * grad[row] / denom - history = state - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += mx.nd.square(grad) - div = grad / mx.nd.sqrt(history + self.float_stable_eps) - weight[:] += (div + weight * wd) * -lr @with_seed() def test_adagrad(): - opt1 = PyAdaGrad + opt1 = mx.optimizer.AdaGrad opt2 = mx.optimizer.AdaGrad - shape = (3, 4, 5) - eps_options = [{}, {'eps': 1e-8}] + shapes = [(3, 4, 5), (10, 4), (7,)] + eps_options = [{}, {'epsilon': 1e-8}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.0}] - for dtype in [np.float32]: - for eps_option in eps_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - kwarg = {} - kwarg.update(eps_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - if wd_option.get('wd', 0.0) == 0.0: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - w_stype='row_sparse', g_stype='row_sparse') - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - g_stype='row_sparse') - -# AdaDelta -class 
PyAdaDelta(mx.optimizer.Optimizer): - """The python reference of AdaDelta optimizer. - - This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive - learning rate method*, available at https://arxiv.org/abs/1212.5701. - - This optimizer updates each weight by:: - - grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - acc_grad = rho * acc_grad + (1. - rho) * grad ** 2 - cur_delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad - acc_delta = rho * acc_delta + (1. - rho) * cur_delta ** 2 - weight -= (cur_delta + wd * weight) - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - rho: float - Decay rate for both squared gradients and delta. - epsilon : float - Small value to avoid division by 0. - """ - def __init__(self, rho=0.90, epsilon=1e-5, **kwargs): - super(PyAdaDelta, self).__init__(**kwargs) - self.rho = rho - self.epsilon = epsilon - - def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context), - mx.nd.zeros(weight.shape, weight.context)) - - def update(self, index, weight, grad, state): - self._update_count(index) - wd = self._get_wd(index) - - grad *= self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(eps_options, cg_options, + rg_options, wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if dtype is np.float16: + kwarg.update({'multi_precision': True}) + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype) - acc_grad, acc_delta = state - acc_grad[:] = self.rho * acc_grad + (1. - self.rho) * grad ** 2 - current_delta = (mx.nd.sqrt(acc_delta + self.epsilon) / - mx.nd.sqrt(acc_grad + self.epsilon)) * grad - acc_delta[:] = self.rho * acc_delta + (1. 
- self.rho) * current_delta ** 2 +@with_seed() +def test_sparse_adagrad(): + opt1 = PySparseAdaGrad + opt2 = mx.optimizer.AdaGrad + shapes = [(3, 4, 5), (10, 4), (7,)] + eps_options = [{}, {'epsilon': 1e-8}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.0}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(eps_options, cg_options, + rg_options, wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if dtype is np.float16: + kwarg.update({'multi_precision': True}) + if kwarg.get('wd', 0.0) == 0.0: + compare_optimizer(opt1(**kwarg), opt2(use_fused_step=True, **kwarg), shapes, dtype, + w_stype='row_sparse', g_stype='row_sparse') + compare_optimizer(opt1(**kwarg), opt2(use_fused_step=True, **kwarg), shapes, dtype, + g_stype='row_sparse') - # update weight - weight[:] -= current_delta + wd * weight @with_seed() def test_adadelta(): - opt1 = PyAdaDelta + opt1 = mx.optimizer.AdaDelta opt2 = mx.optimizer.AdaDelta - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] rho_options = [{'rho': 0.9}] eps_options = [{}, {'epsilon': 1e-8}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.0}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float16, np.float32]: for params in itertools.product(rho_options, eps_options, cg_options, - rg_options, wd_options): + rg_options, wd_options, agg_options): kwarg = {k: v for param in params for k, v in param.items()} if dtype is np.float16: kwarg.update({'multi_precision': True}) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype) + + +@with_seed() +def test_dcasgd(): + opt1 = mx.optimizer.DCASGD + opt2 = mx.optimizer.DCASGD + shapes = [(3, 4, 5), (10, 4), (7,)] + lamda_options = [{}, {'lamda': 0.01}, {'lamda': 0.1}] + mom_options = [{}, {'momentum': 0.0}, {'momentum': 0.9}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(lamda_options, mom_options, cg_options, + rg_options, wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if dtype is np.float16: + kwarg.update({'multi_precision': True}) + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype) def test_factor_scheduler(): @@ -1353,6 +908,7 @@ def test_factor_scheduler(): np.testing.assert_almost_equal(sched(201), base_lr * factor * factor) np.testing.assert_almost_equal(sched(1000), 1e-4) + def test_multifactor_scheduler(): base_lr = 0.1 steps = [15, 25] @@ -1368,6 +924,7 @@ def test_multifactor_scheduler(): np.testing.assert_almost_equal(sched(26), base_lr * factor * factor) np.testing.assert_almost_equal(sched(100), base_lr * factor * factor) + def test_poly_scheduler(): base_lr = 3 final_lr = 0 @@ -1382,6 
+939,7 @@ def test_poly_scheduler(): assert (poly_sched(500) < 1.6) np.testing.assert_almost_equal(poly_sched(steps), final_lr) + def test_cosine_scheduler(): # also tests case without warmup base_lr = 3 @@ -1392,6 +950,8 @@ def test_cosine_scheduler(): np.testing.assert_almost_equal(cosine_sched(steps), final_lr) assert (cosine_sched(500) > 1.5) + if __name__ == '__main__': import nose nose.runmodule() + From 73dcf48a33f7647658829d8b31351ee5bf873b4c Mon Sep 17 00:00:00 2001 From: Zheng Date: Wed, 8 Jan 2020 16:07:14 -0800 Subject: [PATCH 02/10] refactor optimizer --- cpp-package/example/charRNN.cpp | 2 +- cpp-package/example/lenet.cpp | 2 +- cpp-package/include/mxnet-cpp/optimizer.hpp | 1 - .../tutorials/packages/optimizer/index.md | 26 - example/image-classification/common/fit.py | 4 +- example/profiler/profiler_executor.py | 2 +- python/mxnet/gluon/trainer.py | 18 +- python/mxnet/optimizer/__init__.py | 45 +- python/mxnet/optimizer/adadelta.py | 112 + python/mxnet/optimizer/adagrad.py | 145 ++ python/mxnet/optimizer/adam.py | 188 ++ python/mxnet/optimizer/adamax.py | 112 + python/mxnet/optimizer/contrib.py | 136 +- python/mxnet/optimizer/dcasgd.py | 117 ++ python/mxnet/optimizer/ftml.py | 160 ++ python/mxnet/optimizer/ftrl.py | 171 ++ python/mxnet/optimizer/lamb.py | 263 +++ python/mxnet/optimizer/lars.py | 282 +++ python/mxnet/optimizer/nadam.py | 125 ++ python/mxnet/optimizer/nag.py | 166 ++ python/mxnet/optimizer/optimizer.py | 1829 ++--------------- python/mxnet/optimizer/rmsprop.py | 185 ++ python/mxnet/optimizer/sgd.py | 247 +++ python/mxnet/optimizer/sgld.py | 89 + python/mxnet/optimizer/signum.py | 162 ++ python/mxnet/optimizer/updater.py | 142 ++ python/mxnet/optimizer/utils.py | 43 + python/mxnet/test_utils.py | 106 +- src/operator/contrib/optimizer_op-inl.h | 2 +- src/operator/contrib/optimizer_op.cc | 2 +- src/operator/optimizer_op-inl.h | 277 +-- src/operator/optimizer_op.cc | 49 +- src/operator/optimizer_op.cu | 18 +- src/optimizer/sgd-inl.h | 4 +- .../python/unittest/test_contrib_optimizer.py | 73 +- tests/python/unittest/test_optimizer.py | 1584 ++++++-------- 36 files changed, 3793 insertions(+), 3096 deletions(-) create mode 100644 python/mxnet/optimizer/adadelta.py create mode 100644 python/mxnet/optimizer/adagrad.py create mode 100644 python/mxnet/optimizer/adam.py create mode 100644 python/mxnet/optimizer/adamax.py create mode 100644 python/mxnet/optimizer/dcasgd.py create mode 100644 python/mxnet/optimizer/ftml.py create mode 100644 python/mxnet/optimizer/ftrl.py create mode 100644 python/mxnet/optimizer/lamb.py create mode 100644 python/mxnet/optimizer/lars.py create mode 100644 python/mxnet/optimizer/nadam.py create mode 100644 python/mxnet/optimizer/nag.py create mode 100644 python/mxnet/optimizer/rmsprop.py create mode 100644 python/mxnet/optimizer/sgd.py create mode 100644 python/mxnet/optimizer/sgld.py create mode 100644 python/mxnet/optimizer/signum.py create mode 100644 python/mxnet/optimizer/updater.py create mode 100644 python/mxnet/optimizer/utils.py diff --git a/cpp-package/example/charRNN.cpp b/cpp-package/example/charRNN.cpp index 94e9455c5941..3d1b91d729e2 100644 --- a/cpp-package/example/charRNN.cpp +++ b/cpp-package/example/charRNN.cpp @@ -553,7 +553,7 @@ void trainWithBuiltInRNNOp(const std::string file, int batch_size, int max_epoch } start_epoch++; - Optimizer* opt = OptimizerRegistry::Find("ccsgd"); + Optimizer* opt = OptimizerRegistry::Find("sgd"); // opt->SetParam("momentum", 0.9)->SetParam("rescale_grad", 1.0 / batch_size) // 
->SetParam("clip_gradient", 10); diff --git a/cpp-package/example/lenet.cpp b/cpp-package/example/lenet.cpp index a52efd8fed40..624f4404ecaf 100644 --- a/cpp-package/example/lenet.cpp +++ b/cpp-package/example/lenet.cpp @@ -136,7 +136,7 @@ class Lenet { // args_map["fc1_b"] = 0; lenet.InferArgsMap(ctx_dev, &args_map, args_map); - Optimizer* opt = OptimizerRegistry::Find("ccsgd"); + Optimizer* opt = OptimizerRegistry::Find("sgd"); opt->SetParam("momentum", 0.9) ->SetParam("rescale_grad", 1.0) ->SetParam("clip_gradient", 10) diff --git a/cpp-package/include/mxnet-cpp/optimizer.hpp b/cpp-package/include/mxnet-cpp/optimizer.hpp index 26fd00f3a162..e9a8bca5f028 100644 --- a/cpp-package/include/mxnet-cpp/optimizer.hpp +++ b/cpp-package/include/mxnet-cpp/optimizer.hpp @@ -128,7 +128,6 @@ inline Optimizer* OptimizerRegistry::Find(const std::string& name) { if (cmap().empty()) { // Optimizers should only be registered once MXNETCPP_REGISTER_OPTIMIZER(sgd, SGDOptimizer); - MXNETCPP_REGISTER_OPTIMIZER(ccsgd, SGDOptimizer); // For backward compatibility MXNETCPP_REGISTER_OPTIMIZER(rmsprop, RMSPropOptimizer); MXNETCPP_REGISTER_OPTIMIZER(adam, AdamOptimizer); MXNETCPP_REGISTER_OPTIMIZER(adagrad, AdaGradOptimizer); diff --git a/docs/python_docs/python/tutorials/packages/optimizer/index.md b/docs/python_docs/python/tutorials/packages/optimizer/index.md index b7b6c7453a89..3350cc6f5a9a 100644 --- a/docs/python_docs/python/tutorials/packages/optimizer/index.md +++ b/docs/python_docs/python/tutorials/packages/optimizer/index.md @@ -281,32 +281,6 @@ Here is how to create the signum optimizer in MXNet. signum_optimizer = optimizer.Signum(learning_rate=0.01, momentum=0.9, wd_lh=0.0) ``` -### [LBSGD](/api/python/docs/api/optimizer/index.html#mxnet.optimizer.LBSGD) -LBSGD stands for Large Batch Stochastic Gradient Descent and implements a technique where Layer-wise Adaptive Rate Scaling (LARS) is used to maintain a separate learning rate for each layer of the neural network. LBSGD has no additional modifications to SGD and performs the same parameter update steps as the SGD optimizer described above. - -LBSGD was introduced by [You et al](https://arxiv.org/pdf/1708.03888.pdf) for distributed training with data-parallel synchronous SGD across multiple worker nodes to overcome the issue of reduced model accuracy when the number of workers, and by extension effective batch size, is increased. - -Here is how to initialize the LBSGD optimizer in MXNet. - - -```python -lbsgd_optimizer = optimizer.LBSGD(momentum=0.0, - multi_precision=False, - warmup_strategy='linear', - warmup_epochs=5, - batch_scale=1, - updates_per_epoch=32, - begin_epoch=0, - num_epochs=60) -``` - -LBSGD has a number of extra keyword arguments described below -* `multi_precision` - When True performs updates with float32 precision weights regardless of whether weights are initialized with lower precision. When False perform updates with same precision as the weights when initialized. Set to True to improve performance when training with low precision weight represenations. -* `warmup_strategy` - The warmup is period where the learning rate is increased through the first few epochs. The following strategies are supported: ['linear', 'power2', 'sqrt','lars'] -* `warmup_epochs` - How many epochs to perform warmup for -* `batch_scale` - use batch size*numworkers -* `updates_per_epoch` - How many updates to the learning rate to perform every epoch. 
For example during warmup the warmup strategy is applied to increase the learning rate a total of `warmup_epochs*updates_per_epoch` number of times. -* `begin_epoch` - The epoch at which to start warmup. ### [DCASGD](/api/python/docs/api/optimizer/index.html#mxnet.optimizer.DCASGD) diff --git a/example/image-classification/common/fit.py b/example/image-classification/common/fit.py index 8e8b0197960a..38ca296cf986 100755 --- a/example/image-classification/common/fit.py +++ b/example/image-classification/common/fit.py @@ -235,7 +235,7 @@ def fit(args, network, data_loader, **kwargs): 'multi_precision': True} # Only a limited number of optimizers have 'momentum' property - has_momentum = {'sgd', 'dcasgd', 'nag', 'signum', 'lbsgd'} + has_momentum = {'sgd', 'dcasgd', 'nag', 'signum'} if args.optimizer in has_momentum: optimizer_params['momentum'] = args.mom @@ -243,7 +243,7 @@ def fit(args, network, data_loader, **kwargs): args.monitor, pattern=".*") if args.monitor > 0 else None # A limited number of optimizers have a warmup period - has_warmup = {'lbsgd', 'lbnag'} + has_warmup = {'lbnag'} if args.optimizer in has_warmup: nworkers = kv.num_workers if epoch_size < 1: diff --git a/example/profiler/profiler_executor.py b/example/profiler/profiler_executor.py index 91532535bd05..cba1515fa1a1 100644 --- a/example/profiler/profiler_executor.py +++ b/example/profiler/profiler_executor.py @@ -102,7 +102,7 @@ def get_module(ctx, sym, provide_data, provide_label, batch_size=None, is_train= mod.bind(data_shapes=provide_data, label_shapes=provide_label, for_training=False, inputs_need_grad=False) mod.init_params(initializer=mx.init.Xavier(magnitude=2.)) - mod.init_optimizer(optimizer='ccsgd', + mod.init_optimizer(optimizer='sgd', optimizer_params={ 'learning_rate': 0.0001, 'momentum': 0.0, diff --git a/python/mxnet/gluon/trainer.py b/python/mxnet/gluon/trainer.py index a27c951c01b9..85e7409cde92 100644 --- a/python/mxnet/gluon/trainer.py +++ b/python/mxnet/gluon/trainer.py @@ -60,9 +60,11 @@ class Trainer(object): Arguments would then be {'type':'2bit', 'threshold':0.5} See mxnet.KVStore.set_gradient_compression method for more details on gradient compression. update_on_kvstore : bool, default None - Whether to perform parameter updates on kvstore. If None, then trainer will choose the more - suitable option depending on the type of kvstore. If the `update_on_kvstore` argument is - provided, environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. + Whether to perform parameter updates on kvstore. If None and optimizer.aggregate_num <= 1, + then trainer will choose the more suitable option depending on the type of kvstore. + If None and optimizer.aggregate_num > 1, `update_on_kvstore` is set to False. + If the `update_on_kvstore` argument is provided, + environment variable `MXNET_UPDATE_ON_KVSTORE` will be ignored. 
Properties ---------- @@ -103,6 +105,12 @@ def __init__(self, params, optimizer, optimizer_params=None, kvstore='device', optimizer_params = optimizer_params if optimizer_params else {} self._init_optimizer(optimizer, optimizer_params) self._scale = self._optimizer.rescale_grad + if self._optimizer.aggregate_num > 1 and update_on_kvstore is not None: + if update_on_kvstore: + raise ValueError("Cannot set update_on_kvstore=True " + "when optimizer.aggregate_num > 1.") + if update_on_kvstore is None and self._optimizer.aggregate_num > 1: + update_on_kvstore = False self._kvstore_params = {'kvstore': kvstore, 'update_on_kvstore': update_on_kvstore} self._kv_initialized = False self._kvstore = None @@ -457,8 +465,8 @@ def _update(self, ignore_stale_grad=False): if not (self._kvstore and self._update_on_kvstore): for updater, upd in zip(self._updaters, updates): if upd: - i, w, g = zip(*upd) - updater(i, w, g) + i, g, w = zip(*upd) + updater(i, g, w) def save_states(self, fname): """Saves trainer states (e.g. optimizer, momentum) to a file. diff --git a/python/mxnet/optimizer/__init__.py b/python/mxnet/optimizer/__init__.py index 72eb5a741520..89b37de1c873 100644 --- a/python/mxnet/optimizer/__init__.py +++ b/python/mxnet/optimizer/__init__.py @@ -16,9 +16,48 @@ # under the License. """Optimizer API of MXNet.""" -from . import optimizer, contrib +from . import (optimizer, contrib, updater, utils, sgd, + sgld, signum, dcasgd, nag, adagrad, + adadelta, adam, adamax, nadam, ftrl, + ftml, lars, lamb, rmsprop) # pylint: disable=wildcard-import from .optimizer import * -# pylint: enable=wildcard-import -__all__ = optimizer.__all__ + ['contrib'] +from .updater import * + +from .utils import * + +from .sgd import * + +from .sgld import * + +from .signum import * + +from .dcasgd import * + +from .nag import * + +from .adagrad import * + +from .adadelta import * + +from .adam import * + +from .adamax import * + +from .nadam import * + +from .ftrl import * + +from .ftml import * + +from .lars import * + +from .lamb import * + +from .rmsprop import * + +__all__ = optimizer.__all__ + updater.__all__ + ['contrib'] + sgd.__all__ + sgld.__all__ \ + + signum.__all__ + dcasgd.__all__ + nag.__all__ + adagrad.__all__ + adadelta.__all__ \ + + adam.__all__ + adamax.__all__ + nadam.__all__ + ftrl.__all__ + ftml.__all__ \ + + lars.__all__ + lamb.__all__ + rmsprop.__all__ diff --git a/python/mxnet/optimizer/adadelta.py b/python/mxnet/optimizer/adadelta.py new file mode 100644 index 000000000000..0c6fdd02aaca --- /dev/null +++ b/python/mxnet/optimizer/adadelta.py @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
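Illustrative sketch (not part of the patch): the Trainer hunk above couples `update_on_kvstore` to `optimizer.aggregate_num`. The snippet below shows the resulting behaviour under the assumption that the refactored `LAMB` optimizer introduced later in this patch (default `aggregate_num=4`) is registered as `'lamb'`.

```python
# Editor's illustration only, not patch content.
import mxnet as mx
from mxnet import gluon

net = gluon.nn.Dense(1)
net.initialize()

# aggregate_num > 1 and update_on_kvstore left as None:
# the Trainer silently falls back to local (non-kvstore) updates.
trainer = gluon.Trainer(net.collect_params(), 'lamb', {'learning_rate': 1e-3})

# Explicitly requesting kvstore updates is now rejected at construction time:
# gluon.Trainer(net.collect_params(), 'lamb', {'learning_rate': 1e-3},
#               update_on_kvstore=True)
# raises ValueError("Cannot set update_on_kvstore=True when optimizer.aggregate_num > 1.")
```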
+ +# pylint: disable=too-many-lines +"""AdaDelta optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from .optimizer import Optimizer, register + +__all__ = ['AdaDelta'] + + +@register +class AdaDelta(Optimizer): + """The AdaDelta optimizer. + + This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive + learning rate method*, available at https://arxiv.org/abs/1212.5701. + + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + acc_grad = rho * acc_grad + (1. - rho) * grad * grad + delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad + acc_delta = rho * acc_delta + (1. - rho) * delta * delta + weight -= learning_rate * delta + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 1.0 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + rho: float, default 0.9 + Decay rate for both squared gradients and delta. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=1.0, rho=0.9, epsilon=1e-6, use_fused_step=False, **kwargs): + super(AdaDelta, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.rho = rho + self.epsilon = epsilon + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context), # accumulated g + zeros(weight.shape, weight.context)) # accumulated delta + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + acc_g, acc_delta = state + + # update g, delta + acc_g[:] *= self.rho + acc_g[:] += (1. - self.rho) * square(grad) + current_delta = sqrt(acc_delta + self.epsilon) + current_delta /= sqrt(acc_g + self.epsilon) + current_delta *= grad + acc_delta[:] *= self.rho + acc_delta[:] += (1. 
- self.rho) * square(current_delta) + + # update weight + weight[:] -= lr * current_delta diff --git a/python/mxnet/optimizer/adagrad.py b/python/mxnet/optimizer/adagrad.py new file mode 100644 index 000000000000..8e181fda90a6 --- /dev/null +++ b/python/mxnet/optimizer/adagrad.py @@ -0,0 +1,145 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""AdaGrad optimizer""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import sparse +from .optimizer import Optimizer, register + +__all__ = ['AdaGrad'] + + +@register +class AdaGrad(Optimizer): + """AdaGrad optimizer. + + This class implements the AdaGrad optimizer described in *Adaptive Subgradient + Methods for Online Learning and Stochastic Optimization*, and available at + http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. + + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + history += square(grad) + weight -= learning_rate * grad / (sqrt(history) + epsilon) + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + See Also + ---------- + :meth:`mxnet.ndarray.sparse.adagrad_update`. + + Parameters + ---------- + learning_rate : float, default 0.01 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False or grad is not sparse, step is called, + otherwise, fused_step is called. + + """ + def __init__(self, learning_rate=0.01, epsilon=1e-6, use_fused_step=True, **kwargs): + super(AdaGrad, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.epsilon = epsilon + + def create_state(self, index, weight): + return zeros(weight.shape, weight.context, stype=weight.stype) # history + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. 
+ states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update history + history = state + history[:] += square(grad) + d = grad / (sqrt(history) + self.epsilon) + + # update weight + weight[:] -= lr * d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + is_sparse = grad.stype == 'row_sparse' + + if is_sparse: + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + kwargs = {'epsilon': self.epsilon, 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + history = state + + # When grad is sparse, update weight with fused kernel + sparse.adagrad_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs) + else: + # When the grad is not sparse, the func step is called to update weight and state + self.step([index], [weight], [grad], [state]) diff --git a/python/mxnet/optimizer/adam.py b/python/mxnet/optimizer/adam.py new file mode 100644 index 000000000000..a08c5c73e6fe --- /dev/null +++ b/python/mxnet/optimizer/adam.py @@ -0,0 +1,188 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Adam optimizer.""" +from __future__ import absolute_import +import math +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import adam_update +from .optimizer import Optimizer, register + +__all__ = ['Adam'] + + +@register +class Adam(Optimizer): + """The Adam optimizer. + + This class implements the optimizer described in *Adam: A Method for + Stochastic Optimization*, available at http://arxiv.org/abs/1412.6980. 
+ + If the storage types of grad is ``row_sparse``, and ``lazy_update`` is True, \ + **lazy updates** at step t are applied by:: + + for row in grad.indices: + rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient) + wd * weight[row] + m[row] = beta1 * m[row] + (1 - beta1) * rescaled_grad[row] + v[row] = beta2 * v[row] + (1 - beta2) * (rescaled_grad[row]**2) + lr = learning_rate * sqrt(1 - beta2**t) / (1 - beta1**t) + w[row] = w[row] - lr * m[row] / (sqrt(v[row]) + epsilon) + + The lazy update only updates the mean and var for the weights whose row_sparse + gradient indices appear in the current batch, rather than updating it for all indices. + Compared with the original update, it can provide large improvements in model training + throughput for some applications. However, it provides slightly different semantics than + the original update, and may lead to different empirical results. + + Otherwise, **standard updates** at step t are applied by:: + + rescaled_grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + m = beta1 * m + (1 - beta1) * rescaled_grad + v = beta2 * v + (1 - beta2) * (rescaled_grad**2) + lr = learning_rate * sqrt(1 - beta2**t) / (1 - beta1**t) + w = w - lr * m / (sqrt(v) + epsilon) + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + For details of the update algorithm, see :class:`~mxnet.ndarray.adam_update`. + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + lazy_update : bool, default False + Default is False. If True, lazy updates are applied \ + if the storage types of weight and grad are both ``row_sparse``. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, + lazy_update=False, use_fused_step=True, **kwargs): + super(Adam, self).__init__(use_fused_step=use_fused_step, + learning_rate=learning_rate, + **kwargs) + if not self.use_fused_step: + assert not lazy_update,\ + 'When use_fused_step is set to False, lazy_update has to be turned off.' + self.lazy_update = lazy_update + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.lazy_update = lazy_update + + def create_state(self, index, weight): + stype = weight.stype if self.lazy_update else 'default' + return (zeros(weight.shape, weight.context, dtype=weight.dtype, + stype=stype), # mean + zeros(weight.shape, weight.context, dtype=weight.dtype, + stype=stype)) # variance + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. 
+ grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + coef1 = 1. - self.beta1**t + coef2 = 1. - self.beta2**t + lr *= math.sqrt(coef2) / coef1 + + # update mean and var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. - self.beta1) * grad + var[:] *= self.beta2 + var[:] += (1. - self.beta2) * square(grad) + + # update weight + d = mean / (sqrt(var) + self.epsilon) + weight[:] -= lr * d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + coef1 = 1. - self.beta1**t + coef2 = 1. - self.beta2**t + + lr *= math.sqrt(coef2)/coef1 + + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + mean, var = state + + # update weight with fused kernel + adam_update(weight, grad, mean, var, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/adamax.py b/python/mxnet/optimizer/adamax.py new file mode 100644 index 000000000000..50af82138f43 --- /dev/null +++ b/python/mxnet/optimizer/adamax.py @@ -0,0 +1,112 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Adamax optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, maximum, abs as NDabs) +from .optimizer import Optimizer, register + +__all__ = ['Adamax'] + + +# pylint: enable=line-too-long +@register +class Adamax(Optimizer): + """The AdaMax optimizer. 
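Illustrative sketch (not part of the patch): the dense `step` of the Adam class above, rewritten in plain NumPy following the documented standard update. The function name `adam_step` and the toy arrays are the editor's own.

```python
# Editor's sketch of the documented Adam update:
#   m = beta1*m + (1-beta1)*g;  v = beta2*v + (1-beta2)*g**2
#   lr_t = lr * sqrt(1 - beta2**t) / (1 - beta1**t)
#   w  -= lr_t * m / (sqrt(v) + eps)
import numpy as np

def adam_step(w, g, m, v, t, lr=1e-3, beta1=0.9, beta2=0.999,
              eps=1e-8, wd=0.0, rescale_grad=1.0, clip_gradient=None):
    g = g * rescale_grad
    if clip_gradient is not None:
        g = np.clip(g, -clip_gradient, clip_gradient)
    g = g + wd * w                          # weight decay folded into the gradient
    m[:] = beta1 * m + (1. - beta1) * g     # first moment
    v[:] = beta2 * v + (1. - beta2) * g * g # second moment
    lr_t = lr * np.sqrt(1. - beta2 ** t) / (1. - beta1 ** t)
    w -= lr_t * m / (np.sqrt(v) + eps)

w = np.ones(4); g = np.full(4, 0.1)
m = np.zeros(4); v = np.zeros(4)
adam_step(w, g, m, v, t=1)
```

The fused path delegates the same arithmetic to `mxnet.ndarray.adam_update`, optionally with lazy row-sparse semantics.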
+ + It is a variant of Adam based on the infinity norm + available at http://arxiv.org/abs/1412.6980 Section 7. + + The optimizer updates the weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + m = beta1 * m_t + (1 - beta1) * grad + u = maximum(beta2 * u, abs(grad)) + weight -= lr / (1 - beta1**t) * m / u + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.002 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, + use_fused_step=False, **kwargs): + super(Adamax, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean + zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + lr /= (1. - self.beta1**t) + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mean and var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. 
- self.beta1) * grad + var[:] = maximum(self.beta2 * var, NDabs(grad)) + + # update weight + d = mean / var + weight[:] -= lr * d + diff --git a/python/mxnet/optimizer/contrib.py b/python/mxnet/optimizer/contrib.py index d269aa1bd069..1092db3979ea 100644 --- a/python/mxnet/optimizer/contrib.py +++ b/python/mxnet/optimizer/contrib.py @@ -18,11 +18,8 @@ # pylint: disable=too-many-lines """Contrib optimizers.""" -from ..ndarray import (NDArray, clip, contrib, mean, sqrt, square, zeros) -from .optimizer import Optimizer - -# convenience wrapper for Optimizer.Register -register = Optimizer.register # pylint: disable=invalid-name +from ..ndarray import (clip, contrib, mean, sqrt, square, zeros) +from .optimizer import Optimizer, register __all__ = ['GroupAdaGrad'] @@ -40,8 +37,7 @@ class GroupAdaGrad(Optimizer): grad = clip(grad * rescale_grad, clip_gradient) history += mean(square(grad), axis=1, keepdims=True) - div = grad / sqrt(history + float_stable_eps) - weight -= div * lr + weight -= lr * grad / (sqrt(history) + epsilon) Weights are updated lazily if the gradient is sparse. @@ -53,14 +49,24 @@ class GroupAdaGrad(Optimizer): Parameters ---------- - eps: float, optional - Initial value of the history accumulator. Avoids division by 0. - + learning_rate : float, default 0.01 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False or grad is not sparse, step is called, + otherwise, fused_step is called. """ - def __init__(self, eps=1e-5, **kwargs): - super(GroupAdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps + def __init__(self, learning_rate=0.01, epsilon=1e-6, use_fused_step=True, **kwargs): + super(GroupAdaGrad, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.epsilon = epsilon def create_state(self, index, weight): assert len(weight.shape) == 2 @@ -68,33 +74,83 @@ def create_state(self, index, weight): (weight.shape[0], 1), weight.context, stype=weight.stype) return history - def update(self, index, weight, grad, state): - assert (isinstance(weight, NDArray)) - assert (isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' - - is_sparse = grad.stype == 'row_sparse' - if is_sparse: - kwargs = { - 'epsilon': self.float_stable_eps, - 'rescale_grad': self.rescale_grad - } - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - contrib.group_adagrad_update( - weight, - grad, - state, - out=weight, - lr=lr, - **kwargs) - else: + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. 
+ states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' + + # preprocess grad grad = grad * self.rescale_grad if self.clip_gradient is not None: grad = clip(grad, -self.clip_gradient, self.clip_gradient) - state[:] += mean(square(grad), axis=1, keepdims=True) - div = lr * grad / sqrt(state + self.float_stable_eps) - weight[:] -= div + + # update history + history = state + history[:] += mean(square(grad), axis=1, keepdims=True) + + # update weight + d = grad / (sqrt(history) + self.epsilon) + weight[:] -= lr * d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + is_sparse = grad.stype == 'row_sparse' + + if is_sparse: + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' + + kwargs = {'epsilon': self.epsilon, 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + history = state + + # When grad is sparse, update weight with fused kernel + contrib.group_adagrad_update( + weight, + grad, + history, + out=weight, + lr=lr, + **kwargs) + else: + # When the grad is not sparse, the func step is called to update weight and state + self.step([index], [weight], [grad], [state]) \ No newline at end of file diff --git a/python/mxnet/optimizer/dcasgd.py b/python/mxnet/optimizer/dcasgd.py new file mode 100644 index 000000000000..f9ef2624c982 --- /dev/null +++ b/python/mxnet/optimizer/dcasgd.py @@ -0,0 +1,117 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
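Illustrative sketch (not part of the patch): the refactored dense `step` of `GroupAdaGrad` shown above, in NumPy. Unlike plain AdaGrad it keeps a single accumulator per row (e.g. per embedding vector), and weight decay is unsupported (the patch asserts `wd == 0`). The helper name `group_adagrad_step` is the editor's own.

```python
# Editor's sketch of the documented GroupAdaGrad update:
#   history += mean(square(grad), axis=1, keepdims=True)
#   weight  -= lr * grad / (sqrt(history) + epsilon)
import numpy as np

def group_adagrad_step(w, g, history, lr=0.01, eps=1e-6,
                       rescale_grad=1.0, clip_gradient=None):
    g = g * rescale_grad
    if clip_gradient is not None:
        g = np.clip(g, -clip_gradient, clip_gradient)
    history += np.mean(g * g, axis=1, keepdims=True)   # shape (num_rows, 1)
    w -= lr * g / (np.sqrt(history) + eps)             # broadcast over columns

w = np.ones((3, 4), dtype=np.float32)
g = np.random.randn(3, 4).astype(np.float32)
history = np.zeros((3, 1), dtype=np.float32)
group_adagrad_step(w, g, history)
```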
+ +# pylint: disable=too-many-lines +"""DCASGD optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from .optimizer import Optimizer, register + +__all__ = ['DCASGD'] + + +@register +class DCASGD(Optimizer): + """The DCASGD optimizer. + + This class implements the optimizer described in *Asynchronous Stochastic Gradient Descent + with Delay Compensation for Distributed Deep Learning*, + available at https://arxiv.org/abs/1609.08326. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, optional + The momentum value. + lamda : float, optional + Scale DC value. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.1, momentum=0.0, lamda=0.04, + use_fused_step=False, **kwargs): + super(DCASGD, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.momentum = momentum + self.weight_previous = {} + self.lamda = lamda + + def create_state(self, index, weight): + if self.momentum == 0.0: + return None, weight.copy() # previous weight + else: + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # momentum + weight.copy()) # previous weight + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom, previous_weight + mom, previous_weight = state + + d = square(grad) + d *= weight - previous_weight + d *= self.lamda + d += grad + + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * d + else: + assert (self.momentum == 0.0) + mom = d + mom *= -lr + previous_weight[:] = weight + + # update weight + weight[:] += mom diff --git a/python/mxnet/optimizer/ftml.py b/python/mxnet/optimizer/ftml.py new file mode 100644 index 000000000000..9b5aec3054d4 --- /dev/null +++ b/python/mxnet/optimizer/ftml.py @@ -0,0 +1,160 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""FTML optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import ftml_update +from .optimizer import Optimizer, register + +__all__ = ['FTML'] + + +@register +class FTML(Optimizer): + """The FTML optimizer. + + This class implements the optimizer described in + *FTML - Follow the Moving Leader in Deep Learning*, + available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf. + + Denote time step by t. The optimizer updates the weight by:: + + rescaled_grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + v = beta2 * v + (1 - beta2) * square(rescaled_grad) + d_t = (1 - power(beta1, t)) / lr * (square_root(v / (1 - power(beta2, t))) + epsilon) + z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight + weight = - z / d_t + + For details of the update algorithm, see :class:`~mxnet.ndarray.ftml_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.0025 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.6 + 0 < beta1 < 1. Generally close to 0.5. + beta2 : float, default 0.999 + 0 < beta2 < 1. Generally close to 1. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.0025, beta1=0.6, beta2=0.999, epsilon=1e-8, + use_fused_step=True, **kwargs): + super(FTML, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # d_0 + zeros(weight.shape, weight.context, dtype=weight.dtype), # v_0 + zeros(weight.shape, weight.context, dtype=weight.dtype)) # z_0 + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + coef1 = 1. - self.beta1**t + coef2 = 1. - self.beta2**t + + # update d, v, z + d, v, z = state + + v[:] *= self.beta2 + v[:] += (1. - self.beta2) * square(grad) + sigma = - self.beta1 * d + d[:] = sqrt(v / coef2) + self.epsilon + d[:] *= coef1 / lr + sigma += d + z[:] *= self.beta1 + z[:] += (1. - self.beta1) * grad + z[:] -= sigma * weight + + # update weight + weight[:] = - z / d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'rescale_grad': self.rescale_grad, 't': t} + if self.clip_gradient: + kwargs['clip_grad'] = self.clip_gradient + + d, v, z = state + + # update weight with fused kernel + ftml_update(weight, grad, d, v, z, out=weight, lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/ftrl.py b/python/mxnet/optimizer/ftrl.py new file mode 100644 index 000000000000..b0e484b8f971 --- /dev/null +++ b/python/mxnet/optimizer/ftrl.py @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""FTRL optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square, sign, maximum, abs as NDabs) +from ..ndarray import ftrl_update +from .optimizer import Optimizer, register + +__all__ = ['Ftrl'] + + +#pylint: disable=invalid-name +#pylint: disable=line-too-long +@register +class Ftrl(Optimizer): + """The Ftrl optimizer. + + Referenced from *Ad Click Prediction: a View from the Trenches*, available at + http://dl.acm.org/citation.cfm?id=2488200. + + eta : + .. 
math:: + \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^tg_{s,i}^2}} + + The optimizer updates the weight by:: + + rescaled_grad = clip(grad * rescale_grad, clip_gradient) + z += rescaled_grad - (sqrt(n + rescaled_grad**2) - sqrt(n)) * weight / learning_rate + n += rescaled_grad**2 + w = (sign(z) * lamda1 - z) / ((beta + sqrt(n)) / learning_rate + wd) * (abs(z) > lamda1) + + If the storage types of weight, state and grad are all ``row_sparse``, \ + **sparse updates** are applied by:: + + for row in grad.indices: + rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient) + z[row] += rescaled_grad[row] - (sqrt(n[row] + rescaled_grad[row]**2) - sqrt(n[row])) * weight[row] / learning_rate + n[row] += rescaled_grad[row]**2 + w[row] = (sign(z[row]) * lamda1 - z[row]) / ((beta + sqrt(n[row])) / learning_rate + wd) * (abs(z[row]) > lamda1) + + The sparse update only updates the z and n for the weights whose row_sparse + gradient indices appear in the current batch, rather than updating it for all + indices. Compared with the original update, it can provide large + improvements in model training throughput for some applications. However, it + provides slightly different semantics than the original update, and + may lead to different empirical results. + + For details of the update algorithm, see :class:`~mxnet.ndarray.ftrl_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + lamda1 : float, default 0.01 + L1 regularization coefficient. + beta : float, default 1.0 + Per-coordinate learning rate correlation parameter. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + + def __init__(self, learning_rate=0.1, lamda1=0.01, beta=1., + use_fused_step=True, **kwargs): + super(Ftrl, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.lamda1 = lamda1 + self.beta = beta + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, stype=weight.stype), # z + zeros(weight.shape, weight.context, stype=weight.stype)) # n + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
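Illustrative sketch (not part of the patch): the dense Ftrl update documented in the class docstring above, in NumPy. The function name `ftrl_step` and the toy arrays are the editor's own.

```python
# Editor's sketch of the documented Ftrl update for z, n and the
# closed-form L1-thresholded weight.
import numpy as np

def ftrl_step(w, g, z, n, lr=0.1, lamda1=0.01, beta=1.0, wd=0.0,
              rescale_grad=1.0, clip_gradient=None):
    g = g * rescale_grad
    if clip_gradient is not None:
        g = np.clip(g, -clip_gradient, clip_gradient)
    z += g - (np.sqrt(n + g * g) - np.sqrt(n)) * w / lr
    n += g * g
    # weights with |z| <= lamda1 are set exactly to zero
    w[:] = (np.sign(z) * lamda1 - z) / ((beta + np.sqrt(n)) / lr + wd) \
           * (np.abs(z) > lamda1)

w = np.zeros(5); g = np.random.randn(5)
z = np.zeros(5); n = np.zeros(5)
ftrl_step(w, g, z, n)
```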
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + + # update z, n + z, n = state + + sigma = - sqrt(n) + n[:] += square(grad) + denom = sqrt(n) + sigma += denom + sigma /= lr + z[:] += grad - sigma * weight + + # update weight + denom += self.beta + denom /= lr + denom += wd + d = sign(z) * maximum(NDabs(z) - self.lamda1, 0) + weight[:] = - d / denom + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + kwargs = {'lamda1': self.lamda1, 'beta': self.beta, 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + # update weight with fused kernel + z, n = state + ftrl_update(weight, grad, z, n, out=weight, lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/lamb.py b/python/mxnet/optimizer/lamb.py new file mode 100644 index 000000000000..11b7e18c0bf3 --- /dev/null +++ b/python/mxnet/optimizer/lamb.py @@ -0,0 +1,263 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Lamb optimizer.""" +from __future__ import absolute_import +import numpy +from ..ndarray import (zeros, clip, sqrt, where, square, ones_like, + maximum, minimum) +from ..ndarray import (lamb_update_phase1, lamb_update_phase2, + mp_lamb_update_phase1, mp_lamb_update_phase2) +from ..ndarray.contrib import (multi_lamb_update, multi_mp_lamb_update) +from .optimizer import Optimizer, register + +__all__ = ['LAMB'] + + +@register +class LAMB(Optimizer): + """LAMB Optimizer. + + Referenced from 'Large Batch Optimization for Deep Learning: Training BERT in 76 minutes' + (https://arxiv.org/pdf/1904.00962.pdf) + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. 
If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + epsilon : float, default 1e-6 + Small value to avoid division by 0. + lower_bound : float, default None + Lower limit of norm of weight + upper_bound : float, default None + Upper limit of norm of weight + bias_correction : bool, default True + Whether or not to apply bias correction + aggregate_num : int, default 4 + Number of weights to be aggregated in a list. + They are passed to the optimizer for a single optimization step. + In default, all the weights are aggregated. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, + lower_bound=None, upper_bound=None, bias_correction=True, + aggregate_num=4, use_fused_step=True, **kwargs): + assert aggregate_num <= 45,\ + 'When use_fused_step is True, LAMB only supports aggregate_num <= 45,' \ + ' and receives {}'.format(aggregate_num) + super(LAMB, self).__init__(learning_rate=learning_rate, + aggregate_num=aggregate_num, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.lower_bound = lower_bound + self.upper_bound = upper_bound + self.bias_correction = bias_correction + + def create_state(self, index, weight): + stype = weight.stype + return (zeros(weight.shape, weight.context, dtype=numpy.float32, stype=stype), # mean + zeros(weight.shape, weight.context, dtype=numpy.float32, stype=stype)) # var + + def step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + + # update mean, var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. - self.beta1) * grad + var[:] *= self.beta2 + var[:] += (1. - self.beta2) * square(grad) + + r1 = weight.norm() + if self.lower_bound is not None: + r1 = maximum(r1, self.lower_bound) + if self.upper_bound is not None: + r1 = minimum(r1, self.upper_bound) + + if self.bias_correction: + # apply bias correction + coef1 = 1. - self.beta1**t + coef2 = 1. 
- self.beta2**t + mean_hat = mean / coef1 + var_hat = var / coef2 + sqrt(var_hat, out=var_hat) + var_hat += self.epsilon + mean_hat /= var_hat + mean_hat += wd * weight + else: + mean_hat = sqrt(var) + mean_hat += self.epsilon + mean_hat[:] = mean / mean_hat + mean_hat += wd * weight + + g = mean_hat + r2 = g.norm() + + # calculate lamb_trust_ratio + ratio = r1 / r2 + # becomes NaN if ratio == NaN or 0, otherwise 0 + nan_or_zero = 1 - ratio / ratio + r = where(nan_or_zero, ones_like(ratio), ratio) + lr *= r + + # update weight + g *= lr + weight[:] -= g + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + aggregate = self.aggregate_num > 1 + for weight, grad in zip(weights, grads): + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) + + if aggregate: + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'bias_correction': self.bias_correction, + 'rescale_grad': self.rescale_grad} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if self.lower_bound: + kwargs['lower_bound'] = self.lower_bound + if self.upper_bound: + kwargs['upper_bound'] = self.upper_bound + + step_counts = [] + for index in indices: + step_counts.append(self._index_update_count[index]) + + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + + if not multi_precision: + mean, var = list(zip(*states)) + multi_lamb_update(weights, grads, mean, var, + out=weights, step_count=step_counts, + lrs=lrs, wds=wds, **kwargs) + else: + weights32, mean_var = list(zip(*states)) + mean, var = list(zip(*mean_var)) + multi_mp_lamb_update(weights, grads, + mean, var, weights32, + out=weights, step_count=step_counts, + lrs=lrs, wds=wds, **kwargs) + else: + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, + 'bias_correction': self.bias_correction, + 'rescale_grad': self.rescale_grad, 't': t} + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + multi_precision = self.multi_precision and weight.dtype == numpy.float16 + + if multi_precision: + weight32 = state[0] + mean, var = state[1] + g = mp_lamb_update_phase1(weight, grad, mean, var, weight32, wd=wd, **kwargs) + + kwargs = {} + if self.lower_bound: + kwargs['lower_bound'] = self.lower_bound + if self.upper_bound: + kwargs['upper_bound'] = self.upper_bound + r_1 = weight32.norm() + r_2 = g.norm() + mp_lamb_update_phase2(weight, g, r_1, r_2, weight32, lr=lr, out=weight, **kwargs) + else: + mean, var = state + g = lamb_update_phase1(weight, grad, mean, var, wd=wd, **kwargs) + + kwargs = {} + if self.lower_bound: + kwargs['lower_bound'] = self.lower_bound + if 
self.upper_bound: + kwargs['upper_bound'] = self.upper_bound + r_1 = weight.norm() + r_2 = g.norm() + lamb_update_phase2(weight, g, r_1, r_2, lr=lr, out=weight, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override update_multi_precision. + """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(LAMB, self).update_multi_precision(indices, weights, grads, states) + diff --git a/python/mxnet/optimizer/lars.py b/python/mxnet/optimizer/lars.py new file mode 100644 index 000000000000..1cd746c6dd32 --- /dev/null +++ b/python/mxnet/optimizer/lars.py @@ -0,0 +1,282 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""LARS optimizer.""" +from __future__ import absolute_import +import os +import numpy +from ..ndarray import (zeros, clip, sqrt, array, + multi_sum_sq, multi_lars, norm as NDnorm, + where, ones_like) +from ..ndarray import (sgd_update, sgd_mom_update, + mp_sgd_update, mp_sgd_mom_update, + preloaded_multi_sgd_update, preloaded_multi_sgd_mom_update, + preloaded_multi_mp_sgd_update, preloaded_multi_mp_sgd_mom_update) +from .optimizer import Optimizer, register +from .utils import _flatten_list + +__all__ = ['LARS'] + + +@register +class LARS(Optimizer): + """the LARS optimizer from 'Large Batch Training of Convolution Networks' \ + (https://arxiv.org/abs/1708.03888) + + Behave mostly like SGD with momentum and weight decay but is scaling \ + adaptively the learning for each layer: + w_norm = L2norm(weights) + g_norm = L2norm(gradients) + if w_norm > 0 and g_norm > 0: + lr_layer = lr * w_norm / (g_norm + weight_decay * w_norm + epsilon) + else: + lr_layer = lr + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, default 0. + The momentum value. + eta : float, default 0.001 + LARS coefficient used to scale the learning rate. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + lazy_update : bool, default False + Default is False. If True, lazy updates are applied \ + if the storage types of weight and grad are both ``row_sparse``. + aggregate_num : int, default 1 + Number of weights to be aggregated in a list. + They are passed to the optimizer for a single optimization step. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. 
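For reference, the LAMB.step implementation above reduces, per tensor, to an Adam-style direction rescaled by a layer-wise trust ratio. A minimal NumPy sketch of that per-tensor rule (a hypothetical helper, not part of this patch; dense float32 arrays only, ignoring the fused multi_lamb_update path, aggregation and multi-precision, and assuming the gradient has already been rescaled and clipped):

import numpy as np

def lamb_step_sketch(weight, grad, mean, var, t, lr, wd,
                     beta1=0.9, beta2=0.999, epsilon=1e-6,
                     lower_bound=None, upper_bound=None, bias_correction=True):
    # update biased first and second moment estimates in place
    mean[:] = beta1 * mean + (1. - beta1) * grad
    var[:] = beta2 * var + (1. - beta2) * grad * grad
    if bias_correction:
        m_hat = mean / (1. - beta1 ** t)
        v_hat = var / (1. - beta2 ** t)
    else:
        m_hat, v_hat = mean, var
    # Adam-style direction plus decoupled weight decay
    g = m_hat / (np.sqrt(v_hat) + epsilon) + wd * weight
    # layer-wise trust ratio r1 / r2, with optional bounds on the weight norm
    r1 = np.linalg.norm(weight)
    if lower_bound is not None:
        r1 = max(r1, lower_bound)
    if upper_bound is not None:
        r1 = min(r1, upper_bound)
    r2 = np.linalg.norm(g)
    ratio = r1 / r2 if r1 > 0 and r2 > 0 else 1.0  # fall back to 1 when degenerate
    weight[:] -= lr * ratio * g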
+ """ + def __init__(self, learning_rate=0.1, momentum=0.0, eta=0.001, + epsilon=1e-8, lazy_update=False, use_fused_step=True, + aggregate_num=1, **kwargs): + super(LARS, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + aggregate_num=aggregate_num, + **kwargs) + if not self.use_fused_step: + assert not lazy_update,\ + 'When use_fused_step is set to False, lazy_update has to be turned off.' + if lazy_update: + assert not self.multi_precision, \ + 'When lazy_update is set to True, multi_precision has be turned off.' + self.lazy_update = lazy_update + self.momentum = momentum + self.eta = eta + self.epsilon = epsilon + self.lazy_update = lazy_update + + def create_state(self, index, weight): + momentum = None + if self.momentum != 0.0: + stype = weight.stype if self.lazy_update else 'default' + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) + return momentum + + def _l2norm(self, v, rescale=False): + """L2 Norm implementation""" + v = v.astype('float32') + if rescale: + v *= self.rescale_grad + norm = NDnorm(v) + return norm + + def _get_lars(self, index, weight, grad, wd): + """Returns a scaling factor for the learning rate for this layer""" + lars = 1.0 + name = self.idx2name[index] if index in self.idx2name else str(index) + if name.endswith('gamma') or name.endswith('beta') or name.endswith('bias'): + return lars + + w_norm = self._l2norm(weight) + g_norm = self._l2norm(grad, rescale=True) + + # calculate lars_trust_ratio + ratio = w_norm / g_norm + # becomes NaN if ratio == NaN or 0, otherwise 0 + nan_or_zero = 1 - ratio / ratio + lars = self.eta * w_norm / (g_norm + wd * w_norm + self.epsilon) + lars = where(nan_or_zero, ones_like(lars), lars) + + return lars.asscalar() + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # compute lars + # clip grad + wd * weight is performed after computing lars + lars = self._get_lars(index, weight, grad, wd) + lr *= lars + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * grad + else: + mom = -lr * grad + + # update weight + weight[:] += mom + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. 
+ grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + aggregate = self.aggregate_num > 1 + for weight, grad in zip(weights, grads): + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient is not None: + kwargs['clip_gradient'] = self.clip_gradient + + if aggregate: + nb_params = len(indices) + names = [self.idx2name[i] if i in self.idx2name else str(i) for i in indices] + lars_idx = [i for i in range(nb_params) if + not(names[i].endswith('gamma') or names[i].endswith('beta') or + names[i].endswith('bias'))] + nb_lars = len(lars_idx) + no_lars_idx = [i for i in range(nb_params) if + (names[i].endswith('gamma') or names[i].endswith('beta') or + names[i].endswith('bias'))] + cur_ctx = weights[0].context + full_idx = lars_idx + no_lars_idx + new_lrs = array([lrs[i] for i in full_idx], ctx=cur_ctx, dtype='float32') + new_wds = array([wds[i] for i in full_idx], ctx=cur_ctx, dtype='float32') + new_weights = [weights[i] for i in full_idx] + new_grads = [grads[i] for i in full_idx] + new_states = [states[i] for i in full_idx] + if nb_lars > 0: + w_sum_sq = multi_sum_sq(*new_weights[:nb_lars], num_arrays=nb_lars) + g_sum_sq = multi_sum_sq(*new_grads[:nb_lars], num_arrays=nb_lars) + multi_lars(new_lrs[:nb_lars], w_sum_sq, g_sum_sq, new_wds[:nb_lars], + eta=self.eta, eps=self.epsilon, rescale_grad=self.rescale_grad, + out=new_lrs[:nb_lars]) + # Same than usual using preloaded sgd functions + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + if not multi_precision: + if self.momentum > 0: + preloaded_multi_sgd_mom_update( + *(_flatten_list(zip(new_weights, new_grads, new_states)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + preloaded_multi_sgd_update( + *(_flatten_list(zip(new_weights, new_grads)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + states = list(zip(*states)) + weights32, moms = states + if self.momentum > 0: + preloaded_multi_mp_sgd_mom_update( + *(_flatten_list(zip(new_weights, new_grads, moms, weights32)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + preloaded_multi_mp_sgd_update( + *(_flatten_list(zip(new_weights, new_grads, weights32)) + + [new_lrs, new_wds]), out=new_weights, num_weights=len(new_weights), + **kwargs) + else: + for i, (index, weight, grad, state) in enumerate(zip(indices, weights, grads, states)): + wd = wds[i] + lr = lrs[i] + lr *= self._get_lars(index, weight, grad, wd) + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + if not multi_precision: + mom = state + if state is not None: + sgd_mom_update(weight, grad, mom, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) + else: + weight32, mom = state + if mom is not None: + mp_sgd_mom_update(weight, grad, mom, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override 
update_multi_precision. + """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(LARS, self).update_multi_precision(indices, weights, grads, states) diff --git a/python/mxnet/optimizer/nadam.py b/python/mxnet/optimizer/nadam.py new file mode 100644 index 000000000000..483a44a8cc46 --- /dev/null +++ b/python/mxnet/optimizer/nadam.py @@ -0,0 +1,125 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Nadam optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from .optimizer import Optimizer, register + +__all__ = ['Nadam'] + + +@register +class Nadam(Optimizer): + """The Nesterov Adam optimizer. + + Much like Adam is essentially RMSprop with momentum, + Nadam is Adam RMSprop with Nesterov momentum available + at http://cs229.stanford.edu/proj2015/054_report.pdf. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + beta1 : float, default 0.9 + Exponential decay rate for the first moment estimates. + beta2 : float, default 0.999 + Exponential decay rate for the second moment estimates. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + schedule_decay : float, default 0.004 + Exponential decay rate for the momentum schedule + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, + schedule_decay=0.004, use_fused_step=False, **kwargs): + super(Nadam, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.beta1 = beta1 + self.beta2 = beta2 + self.epsilon = epsilon + self.schedule_decay = schedule_decay + self.m_schedule = 1. + + def create_state(self, index, weight): + return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean + zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. 
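As an aside on the LARS class defined above, the layer-wise scaling computed by LARS._get_lars boils down to the following. A minimal NumPy sketch (hypothetical helper, not part of this patch; dense float32 tensors only, and it leaves out the bias/gamma/beta exclusion and the fused multi_lars path):

import numpy as np

def lars_learning_rate(weight, grad, lr, wd, eta=0.001, epsilon=1e-8,
                       rescale_grad=1.0):
    # trust ratio: eta * ||w|| / (||g|| + wd * ||w|| + eps), falling back to 1
    w_norm = np.linalg.norm(weight.astype(np.float32))
    g_norm = np.linalg.norm(grad.astype(np.float32) * rescale_grad)
    if w_norm > 0 and g_norm > 0:
        trust = eta * w_norm / (g_norm + wd * w_norm + epsilon)
    else:
        trust = 1.0
    return lr * trust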
+ weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + coef2 = 1. - self.beta2**t + + # warming momentum schedule + momentum_t = self.beta1 * (1. - 0.5 * (pow(0.96, t * self.schedule_decay))) + momentum_t_1 = self.beta1 * (1. - 0.5 * (pow(0.96, (t + 1) * self.schedule_decay))) + self.m_schedule = self.m_schedule * momentum_t + m_schedule_next = self.m_schedule * momentum_t_1 + + # update mean and var + mean, var = state + mean[:] *= self.beta1 + mean[:] += (1. - self.beta1) * grad + var[:] *= self.beta2 + var[:] += (1. - self.beta2) * square(grad) + + grad_prime = grad / (1. - self.m_schedule) + mean_prime = mean / (1. - m_schedule_next) + var_prime = var / coef2 + mean_bar = momentum_t_1 * mean_prime + (1. - momentum_t) * grad_prime + + # update weight + d = mean_bar / (sqrt(var_prime) + self.epsilon) + weight[:] -= lr * d + diff --git a/python/mxnet/optimizer/nag.py b/python/mxnet/optimizer/nag.py new file mode 100644 index 000000000000..463f7949a9e9 --- /dev/null +++ b/python/mxnet/optimizer/nag.py @@ -0,0 +1,166 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""NAG optimizer.""" +from __future__ import absolute_import +import numpy +from ..ndarray import (zeros, clip) +from ..ndarray import (sgd_update, mp_sgd_update, nag_mom_update, mp_nag_mom_update) +from .optimizer import Optimizer, register + +__all__ = ['NAG'] + + +@register +class NAG(Optimizer): + """Nesterov accelerated gradient. + + This optimizer updates each weight by:: + + grad = clip(grad * rescale_grad, clip_gradient) + wd * weight + state = momentum * state + lr * grad + weight = weight - (momentum * state + lr * grad) + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, default 0.9 + The momentum value. + multi_precision: bool, default False + Flag to control the internal precision of the optimizer. 
+ False: results in using the same precision as the weights (default), + True: makes internal 32-bit copy of the weights and applies gradients + in 32-bit precision even if actual weights used in the model have lower precision. + Turning this on can improve convergence and accuracy when training with float16. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.1, momentum=0.9, multi_precision=False, + use_fused_step=True, **kwargs): + super(NAG, self).__init__(learning_rate=learning_rate, + multi_precision=multi_precision, + use_fused_step=use_fused_step, + **kwargs) + self.momentum = momentum + + def create_state(self, index, weight): + momentum = None + if self.momentum != 0.0: + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype) + return momentum + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * grad + d = self.momentum * mom - lr * grad + else: + d = -lr * grad + + # update weight + weight[:] += d + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
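For reference, the NAG.step loop above applies the Nesterov look-ahead update written in the class docstring. A minimal NumPy sketch of the per-tensor rule (hypothetical helper, not part of this patch; it assumes a non-None momentum buffer and a gradient that already includes rescaling, clipping and wd * weight):

import numpy as np

def nag_step_sketch(weight, grad, mom, lr, momentum=0.9):
    mom[:] = momentum * mom - lr * grad      # update momentum buffer
    weight[:] += momentum * mom - lr * grad  # Nesterov look-ahead step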
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + multi_precision = self.multi_precision and weight.dtype == numpy.float16 + + if not multi_precision: + mom = state + if mom is not None: + nag_mom_update(weight, grad, mom, out=weight, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) + else: + weight32, mom = state + if mom is not None: + mp_nag_mom_update(weight, grad, mom, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override update_multi_precision. + """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(NAG, self).update_multi_precision(indices, weights, grads, states) diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index 09e881ebfca6..b5e8c2468304 100755 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -17,38 +17,17 @@ # under the License. # pylint: disable=too-many-lines -"""Weight updating functions.""" +"""Base Optimizer class.""" from __future__ import absolute_import -import logging -import math -import pickle import warnings -import os import numpy -from ..base import py_str -from ..ndarray import (NDArray, zeros, clip, sqrt, cast, maximum, abs as NDabs, array, multiply, - multi_sum_sq, multi_lars, norm as NDnorm) -from ..ndarray import (sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update, - mp_sgd_update, mp_sgd_mom_update, square, ftrl_update, ftml_update, - signsgd_update, signum_update, nag_mom_update, mp_nag_mom_update, - multi_sgd_update, multi_sgd_mom_update, multi_mp_sgd_update, - multi_mp_sgd_mom_update, preloaded_multi_sgd_update, - preloaded_multi_sgd_mom_update, preloaded_multi_mp_sgd_update, - preloaded_multi_mp_sgd_mom_update, lamb_update_phase1, lamb_update_phase2, - mp_lamb_update_phase1, mp_lamb_update_phase2) -from ..ndarray.contrib import (multi_lamb_update, multi_mp_lamb_update) -from ..ndarray import sparse -from ..random import normal +from ..ndarray import (NDArray, zeros, cast) from ..util import is_np_array __all__ = [ - 'AdaDelta', 'AdaGrad', 'Adam', 'Adamax', 'DCASGD', 'FTML', 'Ftrl', 'LARS', 'LBSGD', - 'NAG', 'NDabs', 'Nadam', 'Optimizer', 'RMSProp', 'SGD', 'SGLD', 'Signum', 'LAMB', - 'Test', 'Updater', 'ccSGD', 'create', 'get_updater', 'register' + 'Optimizer', 'Test', 'create', 'register' ] -def _flatten_list(nested_list): - return [item for sublist in nested_list for item in sublist] class Optimizer(object): """The base class inherited by all optimizers. @@ -95,6 +74,17 @@ class Optimizer(object): Dictionary of parameter index to gluon.Parameter, used to lookup parameter attributes such as lr_mult, wd_mult, etc. param_dict shall not be deep copied. + aggregate_num : int, optional, default None + Number of weights to be aggregated in a list. + They are passed to the optimizer for a single optimization step. + In default, only one weight is aggregated. + When `aggregate_num` is set to numpy.inf, all the weights are aggregated. 
+ + use_fused_step : bool, optional, default None + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + Properties ---------- learning_rate : float @@ -104,7 +94,9 @@ class Optimizer(object): def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., clip_gradient=None, learning_rate=None, lr_scheduler=None, sym=None, begin_num_update=0, - multi_precision=False, param_dict=None): + multi_precision=False, param_dict=None, aggregate_num=None, + use_fused_step=None, **kwargs): + super(Optimizer, self).__init__(**kwargs) self.rescale_grad = rescale_grad self.lr_scheduler = lr_scheduler if self.lr_scheduler is None and learning_rate is None: @@ -125,7 +117,11 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self._index_update_count = self._all_index_update_counts[0] self.clip_gradient = clip_gradient self.multi_precision = multi_precision - self.aggregate_num = 0 + + if aggregate_num is None: + self.aggregate_num = 1 + else: + self.aggregate_num = aggregate_num if param_idx2name is None: param_idx2name = {} @@ -135,6 +131,8 @@ def __init__(self, rescale_grad=1., param_idx2name=None, wd=0., self.sym_info = (sym.attr_dict(), sym.list_arguments()) if sym is not None else () self.param_dict = param_dict if param_dict else {} self.allow_np_array = is_np_array() + self.use_fused_step = use_fused_step \ + if use_fused_step is not None else False self.set_lr_mult({}) self.set_wd_mult({}) @@ -250,7 +248,6 @@ def create_state_multi_precision(self, index, weight): state : any obj The state associated with the weight. """ - weight_master_copy = None if self.multi_precision and weight.dtype == numpy.float16: weight_master_copy = weight.astype(numpy.float32) return (weight_master_copy,) + (self.create_state(index, weight_master_copy),) @@ -261,50 +258,101 @@ def create_state_multi_precision(self, index, weight): "optimizer") return self.create_state(index, weight) - def update(self, index, weight, grad, state): - """Updates the given parameter using the corresponding gradient and state. + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. Parameters ---------- - index : int - The unique index of the parameter into the individual learning - rates and weight decays. Learning rates and weight decay - may be set via `set_lr_mult()` and `set_wd_mult()`, respectively. - weight : NDArray - The parameter to be updated. - grad : NDArray - The gradient of the objective with respect to this parameter. - state : any obj - The state returned by `create_state()`. + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. """ - raise NotImplementedError() + raise NotImplementedError - def update_multi_precision(self, index, weight, grad, state): - """Updates the given parameter using the corresponding gradient and state. - Mixed precision version. + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + New operators that fuses optimizer's update should be put in this function. 
Parameters ---------- - index : int - The unique index of the parameter into the individual learning - rates and weight decays. Learning rates and weight decay - may be set via `set_lr_mult()` and `set_wd_mult()`, respectively. - weight : NDArray - The parameter to be updated. - grad : NDArray - The gradient of the objective with respect to this parameter. - state : any obj - The state returned by `create_state()`. + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. """ - if self.multi_precision and weight.dtype == numpy.float16: - # Wrapper for mixed precision - weight_master_copy = state[0] - original_state = state[1] - grad32 = grad.astype(numpy.float32) - self.update(index, weight_master_copy, grad32, original_state) - cast(weight_master_copy, dtype=weight.dtype, out=weight) + raise NotImplementedError + + def update(self, indices, weights, grads, states): + """Call step to perform a single optimization update if use_fused_step is False, + otherwise fused_step is called. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for weight, grad in zip(weights, grads): + assert(isinstance(weight, NDArray)) + assert(isinstance(grad, NDArray)) + if not self.use_fused_step: + self.step(indices, weights, grads, states) else: - self.update(index, weight, grad, state) + self.fused_step(indices, weights, grads, states) + + def update_multi_precision(self, indices, weights, grads, states): + """Call step to perform a single optimization update if use_fused_step is False, + otherwise fused_step is called. Mixed precision version. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
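To illustrate the refactored interface: update() now receives lists of indices, weights, grads and states and dispatches to step() or fused_step() depending on use_fused_step, while update_multi_precision() swaps in the float32 master weights before delegating. A hypothetical minimal subclass (names invented for illustration; plain SGD without weight decay, non-fused path only):

from mxnet import nd
from mxnet.optimizer import Optimizer

class PlainSGD(Optimizer):  # hypothetical example, not part of this patch
    def __init__(self, learning_rate=0.01, **kwargs):
        super(PlainSGD, self).__init__(learning_rate=learning_rate,
                                       use_fused_step=False, **kwargs)

    def create_state(self, index, weight):
        return None  # stateless update

    def step(self, indices, weights, grads, states):
        for index, weight, grad in zip(indices, weights, grads):
            self._update_count(index)
            lr = self._get_lr(index)
            weight[:] -= lr * grad * self.rescale_grad

# usage: everything is passed as lists; use_fused_step=False routes update() to step()
# opt = PlainSGD(learning_rate=0.1)
# w, g = nd.ones((2,)), nd.ones((2,))
# opt.update([0], [w], [g], [opt.create_state(0, w)])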
+ """ + weights_master_copy = [] + original_states = [] + grads32 = [] + for index, weight, grad, state in zip(indices, weights, grads, states): + if self.multi_precision and weight.dtype == numpy.float16: + weights_master_copy.append(state[0]) + original_states.append(state[1]) + grads32.append(grad.astype(numpy.float32)) + else: + weights_master_copy.append(weight) + original_states.append(state) + grads32.append(grad) + self.update(indices, weights_master_copy, grads32, original_states) + for weight_master_copy, weight in zip(weights_master_copy, weights): + if self.multi_precision and weight.dtype == numpy.float16: + cast(weight_master_copy, dtype=weight.dtype, out=weight) def set_learning_rate(self, lr): """Sets a new learning rate of the optimizer. @@ -323,10 +371,6 @@ def set_learning_rate(self, lr): else: self.lr = lr - def set_lr_scale(self, args_lrscale): # pylint: disable=unused-argument - """[DEPRECATED] Sets lr scale. Use set_lr_mult instead.""" - raise DeprecationWarning - def set_lr_mult(self, args_lr_mult): """Sets an individual learning rate multiplier for each parameter. @@ -363,11 +407,6 @@ def set_lr_mult(self, args_lr_mult): def set_wd_mult(self, args_wd_mult): """Sets an individual weight decay multiplier for each parameter. - By default, if `param_idx2name` was provided in the - constructor, the weight decay multipler is set as 0 for all - parameters whose name don't end with ``_weight`` or - ``_gamma``. - .. note:: The default weight decay multiplier for a `Variable` can be set with its `wd_mult` argument in the constructor. @@ -387,9 +426,6 @@ def set_wd_mult(self, args_wd_mult): compatibility, and we recommend to use the name instead. """ self.wd_mult = {} - for n in self.idx2name.values(): - if not (n.endswith('_weight') or n.endswith('_gamma')): - self.wd_mult[n] = 0.0 if self.sym_info: attr, arg_names = self.sym_info for name in arg_names: @@ -519,1514 +555,10 @@ def __setstate__(self, state): # param_dict needs to be explicitly set by the trainer self.param_dict = {} + # convenience wrapper for Optimizer.Register register = Optimizer.register # pylint: disable=invalid-name -# pylint: disable=line-too-long -@register -class SGD(Optimizer): - """The SGD optimizer with momentum and weight decay. - - If the storage types of grad is ``row_sparse`` and ``lazy_update`` is True, \ - **lazy updates** are applied by:: - - for row in grad.indices: - rescaled_grad[row] = lr * (rescale_grad * clip(grad[row], clip_gradient) + wd * weight[row]) - state[row] = momentum[row] * state[row] + rescaled_grad[row] - weight[row] = weight[row] - state[row] - - The sparse update only updates the momentum for the weights whose row_sparse - gradient indices appear in the current batch, rather than updating it for all - indices. Compared with the original update, it can provide large - improvements in model training throughput for some applications. However, it - provides slightly different semantics than the original update, and - may lead to different empirical results. - - In the case when ``update_on_kvstore`` is set to False (either globally via - MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in - :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update - of parameters, which may lead to improved performance. The aggregation size - is controlled by MXNET_OPTIMIZER_AGGREGATION_SIZE environment variable and - defaults to 4. 
- - Otherwise, **standard updates** are applied by:: - - rescaled_grad = lr * (rescale_grad * clip(grad, clip_gradient) + wd * weight) - state = momentum * state + rescaled_grad - weight = weight - state - - For details of the update algorithm see - :class:`~mxnet.ndarray.sgd_update` and :class:`~mxnet.ndarray.sgd_mom_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - lazy_update : bool, optional - Default is True. If True, lazy updates are applied \ - if the storage types of weight and grad are both ``row_sparse``. - multi_precision: bool, optional - Flag to control the internal precision of the optimizer. - False: results in using the same precision as the weights (default), - True: makes internal 32-bit copy of the weights and applies gradients - in 32-bit precision even if actual weights used in the model have lower precision. - Turning this on can improve convergence and accuracy when training with float16. - """ - def __init__(self, momentum=0.0, lazy_update=True, **kwargs): - super(SGD, self).__init__(**kwargs) - self.momentum = momentum - self.lazy_update = lazy_update - self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) - - def create_state_multi_precision(self, index, weight): - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = weight.astype(numpy.float32) - return (self.create_state(index, weight_master_copy), weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. " - "Consider using multi_precision=True option of the " - "SGD optimizer") - return self.create_state(index, weight) - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - stype = weight.stype if self.lazy_update else 'default' - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) - return momentum - - def _update_impl(self, indices, weights, grads, states, multi_precision=False): - aggregate = True - if not isinstance(indices, (tuple, list)): - indices = [indices] - weights = [weights] - grads = [grads] - states = [states] - for weight, grad in zip(weights, grads): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - aggregate = (aggregate and - weight.stype == 'default' and - grad.stype == 'default') - self._update_count(indices) - lrs = self._get_lrs(indices) - wds = self._get_wds(indices) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if aggregate: - if not multi_precision: - if self.momentum > 0: - multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) - else: - multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, - num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) - else: - if self.momentum > 0: - multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, *zip(*states))), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) - else: - multi_mp_sgd_update(*_flatten_list(zip(weights, grads, - list(zip(*states))[1])), - out=weights, num_weights=len(weights), - lrs=lrs, wds=wds, **kwargs) - else: - for weight, grad, 
state, lr, wd in zip(weights, grads, states, lrs, wds): - if not multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - -@register -class Signum(Optimizer): - r"""The Signum optimizer that takes the sign of gradient or momentum. - - The optimizer updates the weight by:: - - rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight - state = momentum * state + (1-momentum)*rescaled_grad - weight = (1 - lr * wd_lh) * weight - lr * sign(state) - - References - ---------- - Jeremy Bernstein, Yu-Xiang Wang, Kamyar Azizzadenesheli & Anima Anandkumar. (2018). - signSGD: Compressed Optimisation for Non-Convex Problems. In ICML'18. - - See: https://arxiv.org/abs/1802.04434 - - For details of the update algorithm see - :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - wd_lh : float, optional - The amount of decoupled weight decay regularization, see details in the original paper at:\ - https://arxiv.org/abs/1711.05101 - """ - def __init__(self, learning_rate=0.01, momentum=0.9, wd_lh=0.0, **kwargs): - super(Signum, self).__init__(learning_rate=learning_rate, **kwargs) - self.momentum = momentum - self.wd_lh = wd_lh - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) - return momentum - - def _update_impl(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - if self.wd_lh: - kwargs['wd_lh'] = self.wd_lh - - if state is not None: - signum_update(weight, grad, state, out=weight, - lr=lr, wd=wd, **kwargs) - else: - signsgd_update(weight, grad, out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state) - -@register -class FTML(Optimizer): - """The FTML optimizer. - - This class implements the optimizer described in - *FTML - Follow the Moving Leader in Deep Learning*, - available at http://proceedings.mlr.press/v70/zheng17a/zheng17a.pdf. - - Denote time step by t. 
The optimizer updates the weight by:: - - rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - v = beta2 * v + (1 - beta2) * square(rescaled_grad) - d_t = (1 - power(beta1, t)) / lr * square_root(v / (1 - power(beta2, t))) + epsilon) - z = beta1 * z + (1 - beta1) * rescaled_grad - (d_t - beta1 * d_(t-1)) * weight - weight = - z / d_t - - For details of the update algorithm, see :class:`~mxnet.ndarray.ftml_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - 0 < beta1 < 1. Generally close to 0.5. - beta2 : float, optional - 0 < beta2 < 1. Generally close to 1. - epsilon : float, optional - Small value to avoid division by 0. - """ - def __init__(self, beta1=0.6, beta2=0.999, epsilon=1e-8, **kwargs): - super(FTML, self).__init__(**kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # d_0 - zeros(weight.shape, weight.context, dtype=weight.dtype), # v_0 - zeros(weight.shape, weight.context, dtype=weight.dtype)) # z_0 - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - - kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, - 'rescale_grad': self.rescale_grad, 't': t} - if self.clip_gradient: - kwargs['clip_grad'] = self.clip_gradient - - prev_d, prev_v, prev_z = state - ftml_update(weight, grad, prev_d, prev_v, prev_z, out=weight, - lr=lr, wd=wd, **kwargs) - -@register -class LARS(Optimizer): - """the LARS optimizer from 'Large Batch Training of Convolution Networks' \ - (https://arxiv.org/abs/1708.03888) - - Behave mostly like SGD with momentum and weight decay but is scaling \ - adaptively the learning for each layer (except bias and batch norm parameters): - w_norm = L2norm(weights) - g_norm = L2norm(gradients) - if w_norm > 0 and g_norm > 0: - lr_layer = lr * lr_mult * eta * w_norm / (g_norm + weight_decay * w_norm + eps) - else: - lr_layer = lr * lr_mult - - Parameters - ---------- - momentum : float, optional - The momentum value. - lazy_update : bool, optional - Default is True. If True, lazy updates are applied \ - if the storage types of weight and grad are both ``row_sparse``. - lars_eta : float, optional - LARS coefficient used to scale the learning rate. Default set to 0.001. - lars_epsilon : float, optional - Optional epsilon in case of very small gradients. Default set to 0. - momentum_correction : bool, optional - If True scale momentum w.r.t global learning rate change (with an lr_scheduler) \ - as indicated in 'Accurate, Large Minibatch SGD: Training ImageNet in 1 Hour` \ - (https://arxiv.org/pdf/1706.02677.pdf) - Default set to True. - """ - def __init__(self, momentum=0.0, lazy_update=True, eta=0.001, eps=0, - momentum_correction=True, **kwargs): - super(LARS, self).__init__(**kwargs) - self.momentum = momentum - self.momentum_correction = momentum_correction - self.lazy_update = lazy_update - self.aggregate_num = int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "4")) - self.eta = eta - self.eps = eps - self.skip = 0 - self.last_lr = None - self.cur_lr = None - - - def _get_lrs(self, indices): - """Gets the learning rates given the indices of the weights. 
- - Parameters - ---------- - indices : list of int - Indices corresponding to weights. - - Returns - ------- - lrs : list of float - Learning rates for those indices. - """ - if self.cur_lr is not None: - self.last_lr = self.cur_lr - - if self.lr_scheduler is not None: - lr = self.lr_scheduler(self.num_update) - else: - lr = self.lr - - if self.cur_lr is None: - self.last_lr = lr - self.cur_lr = lr - - lrs = [lr for _ in indices] - for i, index in enumerate(indices): - if index in self.param_dict: - lrs[i] *= self.param_dict[index].lr_mult - elif index in self.lr_mult: - lrs[i] *= self.lr_mult[index] - elif index in self.idx2name: - lrs[i] *= self.lr_mult.get(self.idx2name[index], 1.0) - return lrs - - def set_wd_mult(self, args_wd_mult): - self.wd_mult = {} - for n in self.idx2name.values(): - is_weight = n.endswith('_weight') - - if not is_weight: - self.wd_mult[n] = 0.0 - - if self.sym_info: - attr, arg_names = self.sym_info - for name in arg_names: - if name in attr and '__wd_mult__' in attr[name]: - self.wd_mult[name] = float(attr[name]['__wd_mult__']) - self.wd_mult.update(args_wd_mult) - - def create_state_multi_precision(self, index, weight): - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = weight.astype(numpy.float32) - return (self.create_state(index, weight_master_copy), weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. " - "Consider using multi_precision=True option of the " - "SGD optimizer") - return self.create_state(index, weight) - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - stype = weight.stype if self.lazy_update else 'default' - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) - return momentum - - def _l2norm(self, v, rescale=False): - """L2 Norm implementation""" - v = v.astype('float32') - if rescale: - v *= self.rescale_grad - norm = NDnorm(v).asnumpy()[0] - return norm - - def _get_lars(self, i, weight, g, lr, wd): - """Returns a scaling factor for the learning rate for this layer""" - name = self.idx2name[i] if i in self.idx2name else str(i) - if name.endswith('gamma') or name.endswith('beta') or name.endswith('bias'): - return lr - - w_norm = self._l2norm(weight) - g_norm = self._l2norm(g, rescale=True) - - if w_norm > 0.0 and g_norm > 0.0: - lars = self.eta * w_norm/(g_norm + wd * w_norm + self.eps) - else: - lars = 1.0 - return lars * lr - - def _update_impl(self, indices, weights, grads, states, multi_precision=False): - aggregate = True - if not isinstance(indices, (tuple, list)): - indices = [indices] - weights = [weights] - grads = [grads] - states = [states] - for weight, grad in zip(weights, grads): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - aggregate = (aggregate and - weight.stype == 'default' and - grad.stype == 'default') - self._update_count(indices) - lrs = self._get_lrs(indices) - wds = self._get_wds(indices) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = (self.momentum * (self.cur_lr / self.last_lr)) \ - if (self.momentum_correction and self.last_lr != 0) else \ - self.momentum - - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if aggregate: - nb_params = len(indices) - names = [self.idx2name[i] if i in self.idx2name else str(i) for i in indices] - lars_idx = [i 
for i in range(nb_params) if - not(names[i].endswith('gamma') or names[i].endswith('beta') or - names[i].endswith('bias'))] - nb_lars = len(lars_idx) - no_lars_idx = [i for i in range(nb_params) if - (names[i].endswith('gamma') or names[i].endswith('beta') or - names[i].endswith('bias'))] - cur_ctx = weights[0].context - full_idx = lars_idx + no_lars_idx - new_lrs = array([lrs[i] for i in full_idx], ctx=cur_ctx, dtype='float32') - new_wds = array([wds[i] for i in full_idx], ctx=cur_ctx, dtype='float32') - new_weights = [weights[i] for i in full_idx] - new_grads = [grads[i] for i in full_idx] - new_states = [states[i] for i in full_idx] - if nb_lars > 0: - w_sum_sq = multi_sum_sq(*new_weights[:nb_lars], num_arrays=nb_lars) - g_sum_sq = multi_sum_sq(*new_grads[:nb_lars], num_arrays=nb_lars) - multi_lars(new_lrs[:nb_lars], w_sum_sq, g_sum_sq, new_wds[:nb_lars], - eta=self.eta, eps=self.eps, rescale_grad=self.rescale_grad, - out=new_lrs[:nb_lars]) - # Same than usual using preloaded sgd functions - sidx = 0 - while sidx < len(indices): - eidx = sidx + len(new_weights[sidx:sidx+self.aggregate_num]) - if not multi_precision: - if self.momentum > 0: - preloaded_multi_sgd_mom_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx], - new_states[sidx:eidx])) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - else: - preloaded_multi_sgd_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx])) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - else: - if self.momentum > 0: - preloaded_multi_mp_sgd_mom_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx], - *zip(*new_states[sidx:eidx]))) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - else: - preloaded_multi_mp_sgd_update( - *(_flatten_list(zip(new_weights[sidx:eidx], - new_grads[sidx:eidx], - list(zip(*new_states[sidx:eidx]))[1])) + - [new_lrs[sidx:eidx], new_wds[sidx:eidx]]), - out=new_weights[sidx:eidx], - num_weights=len(new_weights[sidx:eidx]), - **kwargs) - sidx += self.aggregate_num - else: - lrs = [self._get_lars(i, w, g, lr, wd) for (i, w, g, lr, wd) in - zip(indices, weights, grads, lrs, wds)] - - for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): - if not multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, - lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - -# -@register -class LBSGD(Optimizer): - """The Large Batch SGD optimizer with momentum and weight decay. 
- - The optimizer updates the weight by:: - - state = momentum * state + lr * rescale_grad * clip(grad, clip_gradient) + wd * weight - weight = weight - state - - For details of the update algorithm see :class:`~mxnet.ndarray.sgd_update` - and :class:`~mxnet.ndarray.sgd_mom_update`. - In addition to the SGD updates the LBSGD optimizer uses the LARS, Layer-wise - Adaptive Rate Scaling, algorithm to have a separate learning rate for each - layer of the network, which leads to better stability over large batch sizes. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - multi_precision: bool, optional - Flag to control the internal precision of the optimizer. - False: results in using the same precision as the weights (default), - True: makes internal 32-bit copy of the weights and applies gradients - in 32-bit precision even if actual weights used in the model have lower precision. - Turning this on can improve convergence and accuracy when training with float16. - - warmup_strategy: string ('linear', 'power2', 'sqrt'. , 'lars' default : 'linear') - warmup_epochs: unsigned, default: 5 - batch_scale: unsigned, default: 1 (same as batch size * numworkers) - updates_per_epoch: updates_per_epoch (default: 32, Default might not reflect true number batches per epoch. Used for warmup.) - begin_epoch: unsigned, default 0, starting epoch. - """ - def __init__(self, momentum=0.0, multi_precision=False, warmup_strategy='linear', - warmup_epochs=5, batch_scale=1, updates_per_epoch=32, begin_epoch=0, num_epochs=60, - **kwargs): - super(LBSGD, self).__init__(**kwargs) - logging.info('Running Large-Batch SGD Algorithm') - logging.info('(Batch_scale=%f, warmup_epochs=%d, warmup_strategy=%s, updates_per_epoch=%d)', - batch_scale, warmup_epochs, warmup_strategy, updates_per_epoch) - self.momentum = momentum - self.multi_precision = multi_precision - # new user parameters for large batch - self.warmup_strategy = warmup_strategy - self.warmup_epochs = warmup_epochs - self.batch_scale = batch_scale - self.updates_per_epoch = updates_per_epoch - self.init_updates = begin_epoch * updates_per_epoch - self.num_epochs = num_epochs - # addl internal usage parameters and storage - self.lbmult = 1 - self.cumgrads = {} - # for adaptive lr - self.adaptive = False - self.admult = 1 # adaptation constant - - def create_state(self, index, weight): - momentum = None - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = array(weight, ctx=weight.context, dtype=numpy.float32) - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=numpy.float32, - stype=weight.stype) - return (momentum, weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. 
" - "Consider using multi_precision=True option of the " - "SGD optimizer") - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) - return momentum - - def _get_lbmult(self, nup): - """Returns lr scaling factor for large batch according to warmup schedule - (to be implemented) - """ - nwup = self.warmup_epochs * self.updates_per_epoch - strategy = self.warmup_strategy - maxmult = float(self.batch_scale) - if nup >= nwup: - mult = maxmult - elif nwup <= 1: - mult = 1.0 - else: - if (strategy == 'linear'): - mult = 1.0 + (maxmult - 1) * nup / nwup - elif (strategy == 'power2'): - mult = 1.0 + (maxmult-1) * (nup*nup)/(nwup*nwup) - elif (strategy == 'sqrt'): - mult = 1.0 + (maxmult - 1) * math.sqrt(float(nup) / nwup) - else: - mult = 1.0 - return mult - - def _get_lars(self, weight, g, wd): - """Returns a scaling factor for the learning rate for this layer - default is 1 - """ - weight2 = self._l2norm(weight) - grad2 = self._l2norm(g) - lars = math.sqrt(weight2 / (grad2 + wd * weight2 + 1e-18)) - if lars < 0.01: - lars = 0.01 - elif lars > 100: - lars = 100 - return lars - - def _l2norm(self, v): - "inner product implementation" - norm = multiply(v, v).asnumpy().sum() - return norm - - def _reset_cum_gradient(self, index): - "called every macro-batch to reset cumulated gradients to 0 for a given index" - self.cumgrads[index]['cum_grad'] = 0 - - def _get_cum_gradient(self, index): - "get the cumulated gradient for index" - if index in self.cumgrads: - return self.cumgrads[index] - else: - return {} - - def _put_cum_gradient(self, index, cgrad): - "store cumulated gradient for index" - self.cumgrads[index] = cgrad - - def _cumulate_gradient(self, grad, index): - "Cumulate gradients for large-batch emulation. 
Cumulated by index (layer)" - cgrad = self._get_cum_gradient(index) - if cgrad: - num_cums = cgrad['num_cums'] - if num_cums > 0: - cum_grad = cgrad['cum_grad'] + grad - num_cums += 1 - else: - cum_grad = grad - num_cums = self.init_updates + 1 - else: - cum_grad = grad - num_cums = self.init_updates + 1 - cgrad = {'cum_grad': cum_grad, 'num_cums': num_cums} - self._put_cum_gradient(index, cgrad) - return cgrad - - def update(self, index, weight, grad, state): - assert (isinstance(weight, NDArray)) - assert (isinstance(grad, NDArray)) - - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - - # new stuff for large batch - cgrad = self._cumulate_gradient(grad, index) - if (cgrad['num_cums'] % self.batch_scale) == 0: - grad = cgrad['cum_grad'] / self.batch_scale - if self.warmup_strategy == 'lars': - lbmult = self._get_lars(weight, grad, wd) - else: - lbmult = self._get_lbmult(cgrad['num_cums']) - lr = lr * lbmult - # do the regular sgd update flow - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - use_multi_precision = isinstance(state, (list, tuple)) - - if not use_multi_precision: - if state is not None: - sgd_mom_update(weight, grad, state, out=weight, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_sgd_mom_update(weight, grad, state[0], state[1], out=weight, lr=lr, wd=wd, - **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, lr=lr, wd=wd, **kwargs) - # reset update count and cumulated gradient per large batch - self._reset_cum_gradient(index) - else: - lr = 0.0 - kwargs = {} - sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) - - -@register -class LAMB(Optimizer): - """LAMB Optimizer. 
- """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, - lower_bound=None, upper_bound=None, bias_correction=True, **kwargs): - super(LAMB, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.lower_bound = lower_bound - self.upper_bound = upper_bound - self.bias_correction = bias_correction - self.aggregate_num = max(1, min(45, int(os.getenv('MXNET_OPTIMIZER_AGGREGATION_SIZE', "45")))) - - def create_state(self, index, weight): - stype = weight.stype - dtype = weight.dtype - return (zeros(weight.shape, weight.context, dtype=dtype, stype=stype), - zeros(weight.shape, weight.context, dtype=dtype, stype=stype)) - - def _update_impl(self, index, weight, grad, state, multi_precision=False): - kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, - 'bias_correction': self.bias_correction, - 'rescale_grad': self.rescale_grad} - - if self.aggregate_num <= 1 or not isinstance(index, (tuple, list)): - if isinstance(index, (tuple, list)): - assert(len(index) == self.aggregate_num) - index, weight, grad, state = index[0], weight[0], grad[0], state[0] - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - weight_ptr = weight - grad_ptr = grad - if multi_precision: - mean, var = state[1] - weight32 = state[0] - else: - mean, var = state - kwargs['t'] = t - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if multi_precision: - g = mp_lamb_update_phase1(weight_ptr, grad_ptr, mean, var, weight32, wd=wd, **kwargs) - kwargs = {} - if self.lower_bound: - kwargs['lower_bound'] = self.lower_bound - if self.upper_bound: - kwargs['upper_bound'] = self.upper_bound - r_1 = weight32.norm() - r_2 = g.norm() - mp_lamb_update_phase2(weight_ptr, g, r_1, r_2, weight32, lr=lr, out=weight_ptr, **kwargs) - else: - g = lamb_update_phase1(weight_ptr, grad_ptr, mean, var, wd=wd, **kwargs) - kwargs = {} - if self.lower_bound: - kwargs['lower_bound'] = self.lower_bound - if self.upper_bound: - kwargs['upper_bound'] = self.upper_bound - r_1 = weight_ptr.norm() - r_2 = g.norm() - lamb_update_phase2(weight_ptr, g, r_1, r_2, lr=lr, out=weight_ptr, **kwargs) - else: - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - if self.lower_bound: - kwargs['lower_bound'] = self.lower_bound - if self.upper_bound: - kwargs['upper_bound'] = self.upper_bound - - step_count, lrs, wds = [], [], [] - for i, w_i, g_i in zip(index, weight, grad): - assert(isinstance(w_i, NDArray)) - assert(isinstance(g_i, NDArray)) - self._update_count(i) - step_count.append(self._index_update_count[i]) - lrs.append(self._get_lr(i)) - wds.append(self._get_wd(i)) - - updated_tensors = 0 - while updated_tensors < len(weight): - sidx = updated_tensors - eidx = min(updated_tensors + self.aggregate_num, len(weight)) - if not multi_precision: - mean, var = list(zip(*state[sidx:eidx])) - multi_lamb_update(weight[sidx:eidx], - grad[sidx:eidx], - mean, var, - out=weight[sidx:eidx], - step_count=step_count[sidx:eidx], - lrs=lrs[sidx:eidx], - wds=wds[sidx:eidx], - **kwargs) - else: - mean_var = list(zip(*state[sidx:eidx]))[1] - temp = list(zip(*mean_var)) - mean = temp[0] - var = temp[1] - multi_mp_lamb_update(weight[sidx:eidx], - grad[sidx:eidx], - mean, var, - list(zip(*state[sidx:eidx]))[0], - out=weight[sidx:eidx], - step_count=step_count[sidx:eidx], - 
lrs=lrs[sidx:eidx], - wds=wds[sidx:eidx], - **kwargs) - updated_tensors += self.aggregate_num - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - if not isinstance(index, (tuple, list)): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 - else: - use_multi_precision = self.multi_precision and weight[0].dtype == numpy.float16 - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - -# pylint: enable=line-too-long -@register -class DCASGD(Optimizer): - """The DCASGD optimizer. - - This class implements the optimizer described in *Asynchronous Stochastic Gradient Descent - with Delay Compensation for Distributed Deep Learning*, - available at https://arxiv.org/abs/1609.08326. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - - lamda : float, optional - Scale DC value. - """ - def __init__(self, momentum=0.0, lamda=0.04, **kwargs): - super(DCASGD, self).__init__(**kwargs) - self.momentum = momentum - self.weight_previous = {} - self.lamda = lamda - - def create_state(self, index, weight): - if self.momentum == 0.0: - return (None, - weight.copy()) # previous weight - else: - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # momentum - weight.copy()) # previous weight - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - - mom, previous_weight = state - if mom: - mom[:] *= self.momentum - mom[:] += -lr * (grad + wd * weight + self.lamda \ - * grad * grad * (weight - previous_weight)) - else: - assert(self.momentum == 0.0) - mom = -lr * (grad + wd * weight + self.lamda \ - * grad * grad * (weight - previous_weight)) - previous_weight[:] = weight - weight[:] += mom - -@register -class NAG(Optimizer): - """Nesterov accelerated gradient. - - This optimizer updates each weight by:: - - state = momentum * state + grad + wd * weight - weight = weight - (lr * (grad + momentum * state)) - - Parameters - ---------- - momentum : float, optional - The momentum value. - multi_precision: bool, optional - Flag to control the internal precision of the optimizer. - False: results in using the same precision as the weights (default), - True: makes internal 32-bit copy of the weights and applies gradients - in 32-bit precision even if actual weights used in the model have lower precision. - Turning this on can improve convergence and accuracy when training with float16. - """ - def __init__(self, momentum=0.0, **kwargs): - super(NAG, self).__init__(**kwargs) - self.momentum = momentum - - def create_state_multi_precision(self, index, weight): - weight_master_copy = None - if self.multi_precision and weight.dtype == numpy.float16: - weight_master_copy = weight.astype(numpy.float32) - return (self.create_state(index, weight_master_copy), weight_master_copy) - if weight.dtype == numpy.float16 and not self.multi_precision: - warnings.warn("Accumulating with float16 in optimizer can lead to " - "poor accuracy or slow convergence. 
" - "Consider using multi_precision=True option of the " - "NAG optimizer") - return self.create_state(index, weight) - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - momentum = zeros(weight.shape, weight.context, dtype=weight.dtype) - return momentum - - def _update_impl(self, index, weight, grad, state, multi_precision=False): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - kwargs = {'rescale_grad': self.rescale_grad} - if self.momentum > 0: - kwargs['momentum'] = self.momentum - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - if not multi_precision: - if state is not None: - nag_mom_update(weight, grad, state, out=weight, lr=lr, wd=wd, **kwargs) - else: - sgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) - else: - if state[0] is not None: - mp_nag_mom_update(weight, grad, state[0], state[1], out=weight, - lr=lr, wd=wd, **kwargs) - else: - mp_sgd_update(weight, grad, state[1], out=weight, - lr=lr, wd=wd, **kwargs) - - def update(self, index, weight, grad, state): - self._update_impl(index, weight, grad, state, multi_precision=False) - - def update_multi_precision(self, index, weight, grad, state): - use_multi_precision = self.multi_precision and weight.dtype == numpy.float16 \ - and isinstance(state, (tuple, list)) - self._update_impl(index, weight, grad, state, - multi_precision=use_multi_precision) - - -@register -class SGLD(Optimizer): - """Stochastic Gradient Riemannian Langevin Dynamics. - - This class implements the optimizer described in the paper *Stochastic Gradient - Riemannian Langevin Dynamics on the Probability Simplex*, available at - https://papers.nips.cc/paper/4883-stochastic-gradient-riemannian-langevin-dynamics-on-the-probability-simplex.pdf. - - """ - def __init__(self, **kwargs): - super(SGLD, self).__init__(**kwargs) - - def create_state(self, index, weight): - return None - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - weight[:] += - lr/2 * (grad + wd * weight) - weight[:] += normal(0, math.sqrt(lr), shape=weight.shape, - dtype=weight.dtype, ctx=weight.context) - - - -@register # pylint: disable=invalid-name -class ccSGD(SGD): - """[DEPRECATED] Same as `SGD`. Left here for backward compatibility.""" - def __init__(self, *args, **kwargs): - super(ccSGD, self).__init__(*args, **kwargs) - -@register -class Adam(Optimizer): - """The Adam optimizer. - - This class implements the optimizer described in *Adam: A Method for - Stochastic Optimization*, available at http://arxiv.org/abs/1412.6980. 
- - If the storage types of grad is ``row_sparse``, and ``lazy_update`` is True, \ - **lazy updates** at step t are applied by:: - - for row in grad.indices: - rescaled_grad[row] = clip(grad[row] * rescale_grad + wd * weight[row], clip_gradient) - m[row] = beta1 * m[row] + (1 - beta1) * rescaled_grad[row] - v[row] = beta2 * v[row] + (1 - beta2) * (rescaled_grad[row]**2) - lr = learning_rate * sqrt(1 - beta1**t) / (1 - beta2**t) - w[row] = w[row] - lr * m[row] / (sqrt(v[row]) + epsilon) - - The lazy update only updates the mean and var for the weights whose row_sparse - gradient indices appear in the current batch, rather than updating it for all indices. - Compared with the original update, it can provide large improvements in model training - throughput for some applications. However, it provides slightly different semantics than - the original update, and may lead to different empirical results. - - Otherwise, **standard updates** at step t are applied by:: - - rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - m = beta1 * m + (1 - beta1) * rescaled_grad - v = beta2 * v + (1 - beta2) * (rescaled_grad**2) - lr = learning_rate * sqrt(1 - beta1**t) / (1 - beta2**t) - w = w - lr * m / (sqrt(v) + epsilon) - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - For details of the update algorithm, see :class:`~mxnet.ndarray.adam_update`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. - epsilon : float, optional - Small value to avoid division by 0. - lazy_update : bool, optional - Default is True. If True, lazy updates are applied \ - if the storage types of weight and grad are both ``row_sparse``. - """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - lazy_update=True, **kwargs): - super(Adam, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.lazy_update = lazy_update - - def create_state(self, index, weight): - stype = weight.stype if self.lazy_update else 'default' - return (zeros(weight.shape, weight.context, dtype=weight.dtype, - stype=stype), # mean - zeros(weight.shape, weight.context, dtype=weight.dtype, - stype=stype)) # variance - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - coef1 = 1. - self.beta1**t - coef2 = 1. - self.beta2**t - lr *= math.sqrt(coef2)/coef1 - - kwargs = {'beta1': self.beta1, 'beta2': self.beta2, 'epsilon': self.epsilon, - 'rescale_grad': self.rescale_grad} - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - mean, var = state - adam_update(weight, grad, mean, var, out=weight, - lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) - -@register -class AdaGrad(Optimizer): - """AdaGrad optimizer. - - This class implements the AdaGrad optimizer described in *Adaptive Subgradient - Methods for Online Learning and Stochastic Optimization*, and available at - http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. 
- - This optimizer updates each weight by:: - - grad = clip(grad * rescale_grad, clip_gradient) - history += square(grad) - div = grad / sqrt(history + float_stable_eps) - weight += (div + weight * wd) * -lr - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - See Also - ---------- - :meth:`mxnet.ndarray.sparse.adagrad_update`. - - Parameters - ---------- - eps: float, optional - Initial value of the history accumulator. Avoids division by 0. - - """ - def __init__(self, eps=1e-7, **kwargs): - super(AdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps - - def create_state(self, index, weight): - return zeros(weight.shape, weight.context, stype=weight.stype) # history - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - is_sparse = grad.stype == 'row_sparse' - history = state - - if is_sparse: - kwargs = {'epsilon': self.float_stable_eps, - 'rescale_grad': self.rescale_grad} - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - sparse.adagrad_update(weight, grad, history, out=weight, lr=lr, wd=wd, **kwargs) - else: - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += square(grad) - div = grad / sqrt(history + self.float_stable_eps) - weight[:] += (div + weight * wd) * -lr - -@register -class RMSProp(Optimizer): - """The RMSProp optimizer. - - Two versions of RMSProp are implemented: - - If ``centered=False``, we follow - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by - Tieleman & Hinton, 2012. - For details of the update algorithm see :class:`~mxnet.ndarray.rmsprop_update`. - - If ``centered=True``, we follow http://arxiv.org/pdf/1308.0850v5.pdf (38)-(45) - by Alex Graves, 2013. - For details of the update algorithm see :class:`~mxnet.ndarray.rmspropalex_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - gamma1: float, optional - A decay factor of moving average over past squared gradient. - gamma2: float, optional - A "momentum" factor. Only used if `centered`=``True``. - epsilon : float, optional - Small value to avoid division by 0. - centered : bool, optional - Flag to control which version of RMSProp to use.:: - - True: will use Graves's version of `RMSProp`, - False: will use Tieleman & Hinton's version of `RMSProp`. - - clip_weights : float, optional - Clips weights into range ``[-clip_weights, clip_weights]``. 
- """ - def __init__(self, learning_rate=0.001, gamma1=0.9, gamma2=0.9, - epsilon=1e-8, centered=False, clip_weights=None, **kwargs): - super(RMSProp, self).__init__(learning_rate=learning_rate, **kwargs) - self.gamma1 = gamma1 - self.gamma2 = gamma2 - self.centered = centered - self.epsilon = epsilon - self.clip_weights = clip_weights - - def create_state(self, index, weight): - if self.centered: - return ( - zeros(weight.shape, weight.context, stype=weight.stype), # n - zeros(weight.shape, weight.context, stype=weight.stype), # g - zeros(weight.shape, weight.context, stype=weight.stype)) # delta - else: - return (zeros(weight.shape, weight.context, stype=weight.stype),) # n - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - kwargs = {'gamma1': self.gamma1, 'epsilon': self.epsilon, - 'rescale_grad': self.rescale_grad} - if self.centered: - kwargs['gamma2'] = self.gamma2 - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - if self.clip_weights: - kwargs['clip_weights'] = self.clip_weights - - if not self.centered: - (n, ) = state - rmsprop_update( - weight, grad, n, out=weight, lr=lr, wd=wd, **kwargs) - else: - n, g, delta = state - rmspropalex_update(weight, grad, n, g, delta, out=weight, - lr=lr, wd=wd, **kwargs) - -@register -class AdaDelta(Optimizer): - """The AdaDelta optimizer. - - This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive - learning rate method*, available at https://arxiv.org/abs/1212.5701. - - This optimizer updates each weight by:: - - grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - acc_grad = rho * acc_grad + (1. - rho) * grad * grad - delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad - acc_delta = rho * acc_delta + (1. - rho) * delta * delta - weight -= (delta + wd * weight) - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - rho: float - Decay rate for both squared gradients and delta. - epsilon : float - Small value to avoid division by 0. - """ - def __init__(self, rho=0.90, epsilon=1e-5, **kwargs): - super(AdaDelta, self).__init__(**kwargs) - self.rho = rho - self.epsilon = epsilon - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context), # accumulated g - zeros(weight.shape, weight.context)) # accumulated delta - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - wd = self._get_wd(index) - self._update_count(index) - - # preprocess grad - grad *= self.rescale_grad - if self.clip_gradient is not None: - grad = clip(grad, - self.clip_gradient, self.clip_gradient) - - # accumulated g and delta initlization - acc_g, acc_delta = state - - # update g, delta - acc_g[:] *= self.rho - acc_g[:] += (1. - self.rho) * grad * grad - current_delta = sqrt(acc_delta + self.epsilon) / sqrt(acc_g + self.epsilon) * grad - acc_delta[:] *= self.rho - acc_delta[:] += (1. - self.rho) * current_delta * current_delta - - # update weight - weight[:] -= current_delta + wd * weight - -#pylint: disable=invalid-name -#pylint: disable=line-too-long -@register -class Ftrl(Optimizer): - """The Ftrl optimizer. - - Referenced from *Ad Click Prediction: a View from the Trenches*, available at - http://dl.acm.org/citation.cfm?id=2488200. - - eta : - .. 
math:: - \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^tg_{s,i}^2}} - - The optimizer updates the weight by:: - - rescaled_grad = clip(grad * rescale_grad, clip_gradient) - z += rescaled_grad - (sqrt(n + rescaled_grad**2) - sqrt(n)) * weight / learning_rate - n += rescaled_grad**2 - w = (sign(z) * lamda1 - z) / ((beta + sqrt(n)) / learning_rate + wd) * (abs(z) > lamda1) - - If the storage types of weight, state and grad are all ``row_sparse``, \ - **sparse updates** are applied by:: - - for row in grad.indices: - rescaled_grad[row] = clip(grad[row] * rescale_grad, clip_gradient) - z[row] += rescaled_grad[row] - (sqrt(n[row] + rescaled_grad[row]**2) - sqrt(n[row])) * weight[row] / learning_rate - n[row] += rescaled_grad[row]**2 - w[row] = (sign(z[row]) * lamda1 - z[row]) / ((beta + sqrt(n[row])) / learning_rate + wd) * (abs(z[row]) > lamda1) - - The sparse update only updates the z and n for the weights whose row_sparse - gradient indices appear in the current batch, rather than updating it for all - indices. Compared with the original update, it can provide large - improvements in model training throughput for some applications. However, it - provides slightly different semantics than the original update, and - may lead to different empirical results. - - For details of the update algorithm, see :class:`~mxnet.ndarray.ftrl_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - lamda1 : float, optional - L1 regularization coefficient. - learning_rate : float, optional - The initial learning rate. - beta : float, optional - Per-coordinate learning rate correlation parameter. - """ - - def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, **kwargs): - super(Ftrl, self).__init__(**kwargs) - self.lamda1 = lamda1 - self.beta = beta - self.lr = learning_rate - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, stype=weight.stype), # z - zeros(weight.shape, weight.context, stype=weight.stype)) # n - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - wd = self._get_wd(index) - lr = self._get_lr(index) - - kwargs = {'lamda1': self.lamda1, 'beta': self.beta, 'rescale_grad': self.rescale_grad} - if self.clip_gradient: - kwargs['clip_gradient'] = self.clip_gradient - - # accumulated g and delta initialization - z, n = state - ftrl_update(weight, grad, z, n, out=weight, - lr=lr, wd=wd, **kwargs) - -# pylint: enable=line-too-long -@register -class Adamax(Optimizer): - """The AdaMax optimizer. - - It is a variant of Adam based on the infinity norm - available at http://arxiv.org/abs/1412.6980 Section 7. - - The optimizer updates the weight by:: - - grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - m = beta1 * m_t + (1 - beta1) * grad - u = maximum(beta2 * u, abs(grad)) - weight -= lr / (1 - beta1**t) * m / u - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. 
- """ - def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, **kwargs): - super(Adamax, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean - zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - lr /= (1. - self.beta1**t) - - # preprocess grad - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - - # update m_t and u_t - m_t, u_t = state - m_t[:] *= self.beta1 - m_t[:] += (1. - self.beta1) * grad - u_t[:] = maximum(self.beta2 * u_t, NDabs(grad)) - - # update weight - weight[:] -= lr * m_t / u_t - -@register -class Nadam(Optimizer): - """The Nesterov Adam optimizer. - - Much like Adam is essentially RMSprop with momentum, - Nadam is Adam RMSprop with Nesterov momentum available - at http://cs229.stanford.edu/proj2015/054_report.pdf. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. - epsilon : float, optional - Small value to avoid division by 0. - schedule_decay : float, optional - Exponential decay rate for the momentum schedule - """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - schedule_decay=0.004, **kwargs): - super(Nadam, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.schedule_decay = schedule_decay - self.m_schedule = 1. - - def create_state(self, index, weight): - return (zeros(weight.shape, weight.context, dtype=weight.dtype), # mean - zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - - def update(self, index, weight, grad, state): - assert(isinstance(weight, NDArray)) - assert(isinstance(grad, NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - - # preprocess grad - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = clip(grad, -self.clip_gradient, self.clip_gradient) - - # warming momentum schedule - momentum_t = self.beta1 * (1. - 0.5 * (pow(0.96, t * self.schedule_decay))) - momentum_t_1 = self.beta1 * (1. - 0.5 * (pow(0.96, (t + 1) * self.schedule_decay))) - self.m_schedule = self.m_schedule * momentum_t - m_schedule_next = self.m_schedule * momentum_t_1 - - # update m_t and v_t - m_t, v_t = state - m_t[:] *= self.beta1 - m_t[:] += (1. - self.beta1) * grad - v_t[:] *= self.beta2 - v_t[:] += (1. - self.beta2) * grad * grad - - grad_prime = grad / (1. - self.m_schedule) - m_t_prime = m_t / (1. - m_schedule_next) - v_t_prime = v_t / (1. - pow(self.beta2, t)) - m_t_bar = (1. 
- momentum_t) * grad_prime + momentum_t_1 * m_t_prime - - # update weight - weight[:] -= lr * m_t_bar / (sqrt(v_t_prime) + self.epsilon) @register class Test(Optimizer): @@ -2038,139 +570,16 @@ def create_state(self, index, weight): """Creates a state to duplicate weight.""" return zeros(weight.shape, weight.context) - def update(self, index, weight, grad, state): + def step(self, indices, weights, grads, states): """Performs w += rescale_grad * grad.""" - weight[:] += grad * self.rescale_grad - state[:] = weight - -# backward compatibility wrapper for Optimizer.CreateOptimizer -create = Optimizer.create_optimizer # pylint: disable=invalid-name - - -def _as_classic(a, allow_np): - # TODO(junwu): This is a temp solution for allowing converting - # np.ndarray to mx.nd.NDArray to be fed into the optimizer since - # users may have custom optimizers implemented using mx.nd.NDArray ops. - from ..numpy import ndarray as np_ndarray - if isinstance(a, (tuple, list)): - if any(isinstance(x, np_ndarray) for x in a): - if allow_np: - return [x.as_nd_ndarray() for x in a] - else: - raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') - else: - if isinstance(a, np_ndarray): - if allow_np: - return a.as_nd_ndarray() - else: - raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') - return a - - - -class Updater(object): - """Updater for kvstore.""" - def __init__(self, optimizer): - self.optimizer = optimizer - self.states = {} - self.states_synced = {} - self.aggregate_updates = optimizer.aggregate_num > 0 - - def __call__(self, index, grad, weight): - """Updates weight given gradient and index.""" - allow_np = self.optimizer.allow_np_array if hasattr(self.optimizer, "allow_np_array") else is_np_array() - if not isinstance(index, (list, tuple)): - indices = [index] - grads = [_as_classic(grad, allow_np)] - weights = [_as_classic(weight, allow_np)] - else: - indices = index - grads = _as_classic(grad, allow_np) - weights = _as_classic(weight, allow_np) - if weights: - self.optimizer._set_current_context(weights[0].context.device_id) - for i, idx in enumerate(indices): - # convert ctypes.char_p.value back to python str if needed - if isinstance(idx, bytes): - indices[i] = py_str(idx) - idx = indices[i] - if idx not in self.states: - self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) - self.states_synced[idx] = True - elif not self.states_synced[idx]: - self.states[idx] = \ - self.sync_state_context(self.states[idx], weights[i].context) - self.states_synced[idx] = True - if self.aggregate_updates: - # segregate values based on type - type_map = {} - for i, w, g in zip(indices, weights, grads): - if w.dtype in type_map: - type_map[w.dtype].append((i, w, g)) - else: - type_map[w.dtype] = [(i, w, g)] - for idx in type_map: - current_index = 0 - indices, weights, grads = zip(*type_map[idx]) - while current_index < len(indices): - states = [] - step = min(self.optimizer.aggregate_num, len(indices) - current_index) - for j in range(step): - states.append(self.states[indices[current_index + j]]) - self.optimizer.update_multi_precision( - indices[current_index:current_index + self.optimizer.aggregate_num], - weights[current_index:current_index + self.optimizer.aggregate_num], - grads[current_index:current_index + self.optimizer.aggregate_num], - states) - current_index += self.optimizer.aggregate_num - else: - for i, w, g in zip(indices, weights, grads): - self.optimizer.update_multi_precision(i, w, g, self.states[i]) - - def 
sync_state_context(self, state, context): - """sync state context.""" - if isinstance(state, NDArray): - return state.as_in_context(context) - elif isinstance(state, (tuple, list)): - synced_state = (self.sync_state_context(i, context) for i in state) - if isinstance(state, tuple): - return tuple(synced_state) - else: - return list(synced_state) - else: - return state - - def set_states(self, states): - """Sets updater states.""" - states = pickle.loads(states) - if isinstance(states, tuple) and len(states) == 2: - self.states, self.optimizer = states - else: - self.states = states - self.states_synced = dict.fromkeys(self.states.keys(), False) - - def get_states(self, dump_optimizer=False): - """Gets updater states. - - Parameters - ---------- - dump_optimizer : bool, default False - Whether to also save the optimizer itself. This would also save optimizer - information such as learning rate and weight decay schedules. - """ - return pickle.dumps((self.states, self.optimizer) if dump_optimizer else self.states) + for index, weight, grad in zip(indices, weights, grads): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + grad = self.rescale_grad * grad + weight[:] -= lr * (grad + wd * weight) -def get_updater(optimizer): - """Returns a closure of the updater needed for kvstore. - Parameters - ---------- - optimizer: Optimizer - The optimizer. +create = Optimizer.create_optimizer # pylint: disable=invalid-name - Returns - ------- - updater: function - The closure of the updater. - """ - return Updater(optimizer) diff --git a/python/mxnet/optimizer/rmsprop.py b/python/mxnet/optimizer/rmsprop.py new file mode 100644 index 000000000000..b57c82130b4e --- /dev/null +++ b/python/mxnet/optimizer/rmsprop.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""RMSProp optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import (rmsprop_update, rmspropalex_update) +from .optimizer import Optimizer, register + +__all__ = ['RMSProp'] + + +@register +class RMSProp(Optimizer): + """The RMSProp optimizer. + + Two versions of RMSProp are implemented: + + If ``centered=False``, we follow + http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by + Tieleman & Hinton, 2012. + For details of the update algorithm see :class:`~mxnet.ndarray.rmsprop_update`. + + If ``centered=True``, we follow http://arxiv.org/pdf/1308.0850v5.pdf (38)-(45) + by Alex Graves, 2013. + For details of the update algorithm see :class:`~mxnet.ndarray.rmspropalex_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. 
+ + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + rho: float, default 0.9 + A decay factor of moving average over past squared gradient. + momentum: float, default 0.9 + Heavy ball momentum factor. Only used if `centered`=``True``. + epsilon : float, default 1e-8 + Small value to avoid division by 0. + centered : bool, default False + Flag to control which version of RMSProp to use.:: + + True: will use Graves's version of `RMSProp`, + False: will use Tieleman & Hinton's version of `RMSProp`. + + clip_weights : float, optional + Clips weights into range ``[-clip_weights, clip_weights]``. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.001, rho=0.9, momentum=0.9, + epsilon=1e-8, centered=False, clip_weights=None, + use_fused_step=True, **kwargs): + super(RMSProp, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.rho = rho + self.momentum = momentum + self.centered = centered + self.epsilon = epsilon + self.clip_weights = clip_weights + + def create_state(self, index, weight): + if self.centered: + return ( + zeros(weight.shape, weight.context, stype=weight.stype), # mean + zeros(weight.shape, weight.context, stype=weight.stype), # var + zeros(weight.shape, weight.context, stype=weight.stype)) # mom + else: + return zeros(weight.shape, weight.context, stype=weight.stype) # var + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + if not self.centered: + # update var + var = state + var[:] *= self.rho + var[:] += (1 - self.rho) * square(grad) + + # update weight + d = grad / (sqrt(var) + self.epsilon) + weight[:] -= lr * d + else: + # update mean, var, mom + mean, var, mom = state + mean[:] *= self.rho + mean[:] += (1 - self.rho) * grad + var[:] *= self.rho + var[:] += (1 - self.rho) * square(grad) + mom[:] *= self.momentum + mom[:] -= lr * grad / sqrt(var - square(mean) + self.epsilon) + + # update weight + weight[:] += mom + + if self.clip_weights: + clip(weight, -self.clip_weights, self.clip_weights, out=weight) + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. 
+ + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + kwargs = {'rho': self.rho, 'epsilon': self.epsilon, + 'rescale_grad': self.rescale_grad} + if self.centered: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if self.clip_weights: + kwargs['clip_weights'] = self.clip_weights + + # update weight with fused kernel + if not self.centered: + var = state + rmsprop_update(weight, grad, var, out=weight, lr=lr, wd=wd, **kwargs) + else: + mean, var, mom = state + rmspropalex_update(weight, grad, mean, var, mom, out=weight, + lr=lr, wd=wd, **kwargs) diff --git a/python/mxnet/optimizer/sgd.py b/python/mxnet/optimizer/sgd.py new file mode 100644 index 000000000000..3e0f74928182 --- /dev/null +++ b/python/mxnet/optimizer/sgd.py @@ -0,0 +1,247 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""SGD optimizer""" +from __future__ import absolute_import +import os +import numpy +from ..ndarray import (zeros, clip) +from ..ndarray import (sgd_update, sgd_mom_update, + mp_sgd_update, mp_sgd_mom_update, + multi_sgd_update, multi_sgd_mom_update, + multi_mp_sgd_update, multi_mp_sgd_mom_update) +from .optimizer import Optimizer, register +from .utils import _flatten_list + +__all__ = ['SGD'] + + +@register +class SGD(Optimizer): + """The SGD optimizer with momentum and weight decay. + + If the storage types of grad is ``row_sparse`` and ``lazy_update`` is True, \ + **lazy updates** are applied by:: + + for row in grad.indices: + rescaled_grad[row] = clip(rescale_grad * grad[row] + wd * weight[row], clip_gradient) + state[row] = momentum[row] * state[row] + lr * rescaled_grad[row] + weight[row] = weight[row] - state[row] + + The sparse update only updates the momentum for the weights whose row_sparse + gradient indices appear in the current batch, rather than updating it for all + indices. Compared with the original update, it can provide large + improvements in model training throughput for some applications. However, it + provides slightly different semantics than the original update, and + may lead to different empirical results. 
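+
+    A minimal, illustrative sketch of triggering the lazy path directly (the
+    array values and the parameter index ``0`` are arbitrary, and the gradient
+    is assumed to be stored as ``row_sparse``)::
+
+        import mxnet as mx
+
+        opt = mx.optimizer.SGD(learning_rate=0.1, momentum=0.9, lazy_update=True)
+        weight = mx.nd.ones((4, 2)).tostype('row_sparse')
+        grad = mx.nd.ones((4, 2)).tostype('row_sparse')
+        state = opt.create_state(0, weight)   # row_sparse momentum buffer
+        opt.update([0], [weight], [grad], [state])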
+ + In the case when ``update_on_kvstore`` is set to False (either globally via + MXNET_UPDATE_ON_KVSTORE=0 environment variable or as a parameter in + :class:`~mxnet.gluon.Trainer`) SGD optimizer can perform aggregated update + of parameters, which may lead to improved performance. The aggregation size + is controlled by ``aggregate_num`` and defaults to 4. + + Otherwise, **standard updates** are applied by:: + + rescaled_grad = clip(rescale_grad * grad, clip_gradient)) + wd * weight + state = momentum * state + lr * rescaled_grad + weight = weight - state + + For details of the update algorithm see + :class:`~mxnet.ndarray.sgd_update` and :class:`~mxnet.ndarray.sgd_mom_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.1 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, default 0. + The momentum value. + lazy_update : bool, default False + Default is False. If True, lazy updates are applied \ + if the storage types of weight and grad are both ``row_sparse``. + multi_precision: bool, default False + Flag to control the internal precision of the optimizer. + False: results in using the same precision as the weights (default), + True: makes internal 32-bit copy of the weights and applies gradients + in 32-bit precision even if actual weights used in the model have lower precision. + Turning this on can improve convergence and accuracy when training with float16. + aggregate_num : int, default 1 + Number of weights to be aggregated in a list. + They are passed to the optimizer for a single optimization step. + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.1, momentum=0.0, lazy_update=False, + multi_precision=False, use_fused_step=True, aggregate_num=1, **kwargs): + super(SGD, self).__init__(learning_rate=learning_rate, + multi_precision=multi_precision, + aggregate_num=aggregate_num, + use_fused_step=use_fused_step, + **kwargs) + if not self.use_fused_step: + assert not lazy_update, \ + 'When use_fused_step is set to False, lazy_update has to be turned off.' + if lazy_update: + assert not multi_precision, \ + 'When lazy_update is set to True, multi_precision has be turned off.' + self.momentum = momentum + self.lazy_update = lazy_update + + def create_state(self, index, weight): + momentum = None + if self.momentum != 0.0: + stype = weight.stype if self.lazy_update else 'default' + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype) + return momentum + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
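+
+        A minimal sketch of driving this non-fused path directly (illustrative
+        values; it assumes the optimizer was constructed with
+        ``use_fused_step=False``)::
+
+            import mxnet as mx
+
+            opt = mx.optimizer.SGD(learning_rate=0.1, momentum=0.9,
+                                   use_fused_step=False)
+            weight, grad = mx.nd.ones((4, 2)), mx.nd.ones((4, 2))
+            state = opt.create_state(0, weight)
+            opt.step([0], [weight], [grad], [state])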
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, -self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + if mom is not None: + mom[:] *= self.momentum + mom[:] -= lr * grad + else: + mom = -lr * grad + + # update weight + weight[:] += mom + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + # When either weight or gradient is sparse, aggregate is False. + aggregate = self.aggregate_num > 1 + for weight, grad in zip(weights, grads): + aggregate = (aggregate and + weight.stype == 'default' and + grad.stype == 'default') + self._update_count(indices) + lrs = self._get_lrs(indices) + wds = self._get_wds(indices) + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + + if aggregate: + # update `aggregate_num` number of weights in a single kernel. + # this does not support sparse weight or gradient. + multi_precision = self.multi_precision and weights[0].dtype == numpy.float16 + if not multi_precision: + if self.momentum > 0: + multi_sgd_mom_update(*_flatten_list(zip(weights, grads, states)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + else: + multi_sgd_update(*_flatten_list(zip(weights, grads)), out=weights, + num_weights=len(weights), lrs=lrs, wds=wds, **kwargs) + else: + states = list(zip(*states)) + weights32, moms = states + if self.momentum > 0: + multi_mp_sgd_mom_update(*_flatten_list(zip(weights, grads, + moms, weights32)), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) + else: + multi_mp_sgd_update(*_flatten_list(zip(weights, grads, + weights32)), + out=weights, num_weights=len(weights), + lrs=lrs, wds=wds, **kwargs) + else: + for weight, grad, state, lr, wd in zip(weights, grads, states, lrs, wds): + multi_precision = self.multi_precision and weight.dtype == numpy.float16 + if not multi_precision: + mom = state + if mom is not None: + sgd_mom_update(weight, grad, mom, out=weight, + lazy_update=self.lazy_update, lr=lr, wd=wd, **kwargs) + else: + sgd_update(weight, grad, out=weight, lazy_update=self.lazy_update, + lr=lr, wd=wd, **kwargs) + else: + # weight32 is a float32 copy of weight. + # in the kernel, we firstly update weight32, + # and then cast the result to float16 and save it to weight. + weight32, mom = state + if mom is not None: + mp_sgd_mom_update(weight, grad, mom, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + else: + mp_sgd_update(weight, grad, weight32, out=weight, + lr=lr, wd=wd, **kwargs) + + def update_multi_precision(self, indices, weights, grads, states): + """Override update_multi_precision. 
+ """ + if self.use_fused_step: + self.update(indices, weights, grads, states) + else: + super(SGD, self).update_multi_precision(indices, weights, grads, states) + diff --git a/python/mxnet/optimizer/sgld.py b/python/mxnet/optimizer/sgld.py new file mode 100644 index 000000000000..8a99d8f977d7 --- /dev/null +++ b/python/mxnet/optimizer/sgld.py @@ -0,0 +1,89 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""SGLD optimizer.""" +from __future__ import absolute_import +import math +from ..ndarray import clip +from ..random import normal +from .optimizer import Optimizer, register + +__all__ = ['SGLD'] + + +@register +class SGLD(Optimizer): + """Stochastic Gradient Riemannian Langevin Dynamics. + + This class implements the optimizer described in the paper *Stochastic Gradient + Riemannian Langevin Dynamics on the Probability Simplex*, available at + https://papers.nips.cc/paper/4883-stochastic-gradient-riemannian-langevin-dynamics-on-the-probability-simplex.pdf. + + Parameters + ---------- + learning_rate : float, default 0.001 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + use_fused_step : bool, default False + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. + """ + def __init__(self, learning_rate=0.1, use_fused_step=False, **kwargs): + super(SGLD, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + + def create_state(self, index, weight): + return None + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update weight + weight[:] -= lr / 2 * grad + weight[:] += normal(0, math.sqrt(lr), shape=weight.shape, + dtype=weight.dtype, ctx=weight.context) diff --git a/python/mxnet/optimizer/signum.py b/python/mxnet/optimizer/signum.py new file mode 100644 index 000000000000..16188ccd2fb8 --- /dev/null +++ b/python/mxnet/optimizer/signum.py @@ -0,0 +1,162 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Signum optimizer.""" +from __future__ import absolute_import +from ..ndarray import (zeros, clip, sign) +from ..ndarray import (signsgd_update, signum_update) +from .optimizer import Optimizer, register + +__all__ = ['Signum'] + + +@register +class Signum(Optimizer): + r"""The Signum optimizer that takes the sign of gradient or momentum. + + The optimizer updates the weight by:: + + rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight + state = momentum * state + (1-momentum)*rescaled_grad + weight = (1 - lr * wd_lh) * weight - lr * sign(state) + + References + ---------- + Jeremy Bernstein, Yu-Xiang Wang, Kamyar Azizzadenesheli & Anima Anandkumar. (2018). + signSGD: Compressed Optimisation for Non-Convex Problems. In ICML'18. + + See: https://arxiv.org/abs/1802.04434 + + For details of the update algorithm see + :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`. + + This optimizer accepts the following parameters in addition to those accepted + by :class:`.Optimizer`. + + Parameters + ---------- + learning_rate : float, default 0.01 + The initial learning rate. If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + momentum : float, optional + The momentum value. + wd_lh : float, optional + The amount of decoupled weight decay regularization, see details in the original paper at:\ + https://arxiv.org/abs/1711.05101 + use_fused_step : bool, default True + Whether or not to use fused kernels for optimizer. + When use_fused_step=False, step is called, + otherwise, fused_step is called. 
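+
+    A minimal, illustrative sketch (with ``momentum=0.0`` no momentum state is
+    created and the plain signSGD update is applied; values are arbitrary)::
+
+        import mxnet as mx
+
+        opt = mx.optimizer.Signum(learning_rate=0.01, momentum=0.0)
+        weight, grad = mx.nd.ones((4, 2)), mx.nd.ones((4, 2))
+        state = opt.create_state(0, weight)   # None when momentum == 0
+        opt.update([0], [weight], [grad], [state])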
+ """ + def __init__(self, learning_rate=0.01, momentum=0.9, wd_lh=0.0, use_fused_step=True, **kwargs): + super(Signum, self).__init__(learning_rate=learning_rate, + use_fused_step=use_fused_step, + **kwargs) + self.momentum = momentum + self.wd_lh = wd_lh + + def create_state(self, index, weight): + momentum = None + if self.momentum != 0.0: + momentum = zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) + return momentum + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + if state is not None: + # preprocess grad + grad *= self.rescale_grad + if self.clip_gradient is not None: + grad = clip(grad, - self.clip_gradient, self.clip_gradient) + grad += wd * weight + + # update mom + mom = state + mom[:] *= self.momentum + mom[:] -= (1 - self.momentum) * grad + weight[:] *= 1 - lr * self.wd_lh + + # update weight + weight[:] += lr * sign(mom) + else: + # update weight + weight[:] *= 1 - lr * (wd + self.wd_lh) + weight[:] -= lr * sign(grad) + + def fused_step(self, indices, weights, grads, states): + """Perform a fused optimization step using gradients and states. + Fused kernel is used for update. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] + + kwargs = {'rescale_grad': self.rescale_grad} + if self.momentum > 0: + kwargs['momentum'] = self.momentum + if self.clip_gradient: + kwargs['clip_gradient'] = self.clip_gradient + if self.wd_lh: + kwargs['wd_lh'] = self.wd_lh + + # update weight with fused kernel + if state is not None: + signum_update(weight, grad, state, out=weight, + lr=lr, wd=wd, **kwargs) + else: + signsgd_update(weight, grad, out=weight, + lr=lr, wd=wd, **kwargs) + diff --git a/python/mxnet/optimizer/updater.py b/python/mxnet/optimizer/updater.py new file mode 100644 index 000000000000..03398396c449 --- /dev/null +++ b/python/mxnet/optimizer/updater.py @@ -0,0 +1,142 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=too-many-lines +"""Updater class.""" +from __future__ import absolute_import +import pickle +import numpy +from ..base import py_str +from ..ndarray import NDArray +from ..util import is_np_array +from .utils import _as_classic + +__all__ = ['Updater', 'get_updater'] + + +class Updater(object): + """Updater for kvstore.""" + def __init__(self, optimizer): + self.optimizer = optimizer + self.states = {} + self.states_synced = {} + self.aggregate_updates = optimizer.aggregate_num > 1 + + def __call__(self, index, grad, weight): + """Updates weight given gradient and index.""" + allow_np = self.optimizer.allow_np_array if hasattr(self.optimizer, "allow_np_array") else is_np_array() + if not isinstance(index, (list, tuple)): + indices = [index] + grads = [_as_classic(grad, allow_np)] + weights = [_as_classic(weight, allow_np)] + else: + indices = index + grads = _as_classic(grad, allow_np) + weights = _as_classic(weight, allow_np) + if weights: + self.optimizer._set_current_context(weights[0].context.device_id) + for i, idx in enumerate(indices): + # convert ctypes.char_p.value back to python str if needed + if isinstance(idx, bytes): + indices[i] = py_str(idx) + idx = indices[i] + if idx not in self.states: + self.states[idx] = self.optimizer.create_state_multi_precision(idx, weights[i]) + self.states_synced[idx] = True + elif not self.states_synced[idx]: + self.states[idx] = \ + self.sync_state_context(self.states[idx], weights[i].context) + self.states_synced[idx] = True + if self.aggregate_updates: + # segregate values based on type + if self.optimizer.aggregate_num is not numpy.inf: + type_map = {} + for i, w, g in zip(indices, weights, grads): + if w.dtype in type_map: + type_map[w.dtype].append((i, w, g)) + else: + type_map[w.dtype] = [(i, w, g)] + for idx in type_map: + current_index = 0 + indices, weights, grads = zip(*type_map[idx]) + while current_index < len(indices): + states = [] + step = min(self.optimizer.aggregate_num, len(indices) - current_index) + for j in range(step): + states.append(self.states[indices[current_index + j]]) + self.optimizer.update_multi_precision( + indices[current_index:current_index + self.optimizer.aggregate_num], + weights[current_index:current_index + self.optimizer.aggregate_num], + grads[current_index:current_index + self.optimizer.aggregate_num], + states) + current_index += self.optimizer.aggregate_num + else: + states = [self.states[index] for index in indices] + self.optimizer.update_multi_precision(indices, weights, grads, states) + else: + for index, weight, grad in zip(indices, weights, grads): + self.optimizer.update_multi_precision([index], [weight], [grad], [self.states[index]]) + + def sync_state_context(self, state, context): + """sync state context.""" + if isinstance(state, NDArray): + return state.as_in_context(context) + elif isinstance(state, (tuple, list)): + synced_state = (self.sync_state_context(i, context) for i in state) + if 
isinstance(state, tuple): + return tuple(synced_state) + else: + return list(synced_state) + else: + return state + + def set_states(self, states): + """Sets updater states.""" + states = pickle.loads(states) + if isinstance(states, tuple) and len(states) == 2: + self.states, self.optimizer = states + else: + self.states = states + self.states_synced = dict.fromkeys(self.states.keys(), False) + + def get_states(self, dump_optimizer=False): + """Gets updater states. + + Parameters + ---------- + dump_optimizer : bool, default False + Whether to also save the optimizer itself. This would also save optimizer + information such as learning rate and weight decay schedules. + """ + return pickle.dumps((self.states, self.optimizer) if dump_optimizer else self.states) + + +def get_updater(optimizer): + """Returns a closure of the updater needed for kvstore. + + Parameters + ---------- + optimizer: Optimizer + The optimizer. + + Returns + ------- + updater: function + The closure of the updater. + """ + return Updater(optimizer) diff --git a/python/mxnet/optimizer/utils.py b/python/mxnet/optimizer/utils.py new file mode 100644 index 000000000000..af95a53ccae5 --- /dev/null +++ b/python/mxnet/optimizer/utils.py @@ -0,0 +1,43 @@ +# coding: utf-8 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Optimizer utility functions.""" +from __future__ import absolute_import + + +def _flatten_list(nested_list): + return [item for sublist in nested_list for item in sublist] + + +def _as_classic(a, allow_np): + # TODO(junwu): This is a temp solution for allowing converting + # np.ndarray to mx.nd.NDArray to be fed into the optimizer since + # users may have custom optimizers implemented using mx.nd.NDArray ops. 
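+    # When allow_np is False, an mx.np.ndarray (or a list containing one) is rejected
+    # with a ValueError below; otherwise each such array is converted via as_nd_ndarray().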
+ from ..numpy import ndarray as np_ndarray + if isinstance(a, (tuple, list)): + if any(isinstance(x, np_ndarray) for x in a): + if allow_np: + return [x.as_nd_ndarray() for x in a] + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + else: + if isinstance(a, np_ndarray): + if allow_np: + return a.as_nd_ndarray() + else: + raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') + return a \ No newline at end of file diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index c60e5bc22201..d3b056c6efdc 100755 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -2244,6 +2244,7 @@ def verify_generator(generator, buckets, probs, nsamples=1000000, nrepeat=5, suc str(buckets), str(probs))) return cs_ret_l + def compare_ndarray_tuple(t1, t2, rtol=None, atol=None): """Compare ndarray tuple.""" if t1 is None or t2 is None: @@ -2256,11 +2257,14 @@ def compare_ndarray_tuple(t1, t2, rtol=None, atol=None): assert_almost_equal(t1, t2, rtol=rtol, atol=atol) -def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='default', - rtol=1e-4, atol=1e-5, compare_states=True, ntensors=1): +def compare_optimizer(opt1, opt2, shapes, dtype, w_stype='default', g_stype='default', + rtol=1e-4, atol=1e-5, compare_states=True): """Compare opt1 and opt2.""" - if not isinstance(shape, list): - assert(ntensors == 1) + + w1_list, w2_list = [], [] + g1_list, g2_list = [], [] + s1_list, s2_list = [], [] + for i, shape in enumerate(shapes): if w_stype == 'default': w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) w1 = w2.copyto(default_context()) @@ -2277,37 +2281,77 @@ def compare_optimizer(opt1, opt2, shape, dtype, w_stype='default', g_stype='defa g1 = g2.copyto(default_context()).tostype('default') else: raise Exception("type not supported yet") + s1 = opt1.create_state_multi_precision(i, w1) + s2 = opt2.create_state_multi_precision(i, w2) - state1 = opt1.create_state_multi_precision(0, w1) - state2 = opt2.create_state_multi_precision(0, w2) if compare_states: - compare_ndarray_tuple(state1, state2) + compare_ndarray_tuple(s1, s2) + + w1_list.append(w1) + w2_list.append(w2) + g1_list.append(g1) + g2_list.append(g2) + s1_list.append(s1) + s2_list.append(s2) + + indices = list(range(len(shapes))) + opt1.update_multi_precision(indices, w1_list, g1_list, s1_list) + opt2.update_multi_precision(indices, w2_list, g2_list, s2_list) + if compare_states: + compare_ndarray_tuple(tuple(s1_list), tuple(s2_list), rtol=rtol, atol=atol) + compare_ndarray_tuple(tuple(w1_list), tuple(w2_list), rtol=rtol, atol=atol) + + +def compare_optimizer_noise_seeded(opt1, opt2, shapes, dtype, noise_seed, + w_stype='default', g_stype='default', + rtol=1e-4, atol=1e-5, compare_states=True): + """Compare opt1 and opt2 with the added functionality that the seed for generating random noise + in the SGLD optimizer update is set so that the same noise is used in opt1 and opt2. 
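+    Apart from re-seeding the random number generator with `noise_seed` immediately
+    before each optimizer's update call, this helper behaves the same as `compare_optimizer`.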
+ + """ + w1_list, w2_list = [], [] + g1_list, g2_list = [], [] + s1_list, s2_list = [], [] + for i, shape in enumerate(shapes): + if w_stype == 'default': + w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + w1 = w2.copyto(default_context()) + elif w_stype == 'row_sparse' or w_stype == 'csr': + w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) + w1 = w2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + if g_stype == 'default': + g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) + g1 = g2.copyto(default_context()) + elif g_stype == 'row_sparse' or g_stype == 'csr': + g2 = rand_ndarray(shape, g_stype, dtype=dtype) + g1 = g2.copyto(default_context()).tostype('default') + else: + raise Exception("type not supported yet") + s1 = opt1.create_state_multi_precision(i, w1) + s2 = opt2.create_state_multi_precision(i, w2) - opt1.update_multi_precision(0, w1, g1, state1) - opt2.update_multi_precision(0, w2, g2, state2) if compare_states: - compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) - assert_almost_equal(w1, w2, rtol=rtol, atol=atol) - else: - # test multi-tensor: Opt1 single-tensor reference, Opt2 multi-tensor - from copy import deepcopy - w1, g1 = [], [] - for s in shape: - w1.append(mx.random.uniform(shape=s, ctx=default_context(), dtype=dtype)) - g1.append(mx.random.uniform(shape=s, ctx=default_context(), dtype=dtype)) - w1 = tuple(w1) - w2 = deepcopy(w1) - g1 = tuple(g1) - g2 = deepcopy(g1) - state2 = [opt2.create_state_multi_precision(0, w2[i]) for i in range(ntensors)] - - opt2.update_multi_precision(list(range(ntensors)), w2, g2, state2) - for i in range(ntensors): - state1 = opt1.create_state_multi_precision(i, w1[i]) - opt1.update_multi_precision(i, w1[i], g1[i], state1) - if compare_states: - compare_ndarray_tuple(state1, state2[i], rtol, atol) - compare_ndarray_tuple(w1[i], w2[i], rtol, atol) + compare_ndarray_tuple(s1, s2) + + w1_list.append(w1) + w2_list.append(w2) + g1_list.append(g1) + g2_list.append(g2) + s1_list.append(s1) + s2_list.append(s2) + + indices = list(range(len(shapes))) + # set seed for Gaussian noise replication + mx.random.seed(noise_seed) + opt1.update_multi_precision(indices, w1_list, g1_list, s1_list) + mx.random.seed(noise_seed) + opt2.update_multi_precision(indices, w2_list, g2_list, s2_list) + if compare_states: + compare_ndarray_tuple(tuple(s1_list), tuple(s2_list), rtol=rtol, atol=atol) + compare_ndarray_tuple(tuple(w1_list), tuple(w2_list), rtol=rtol, atol=atol) + def same_symbol_structure(sym1, sym2): """Compare two symbols to check if they have the same computation graph structure. 
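A hypothetical invocation of the refactored helper, mirroring how the updated unit tests drive it: a list of shapes creates several parameters at once, one optimizer instance exercises the pure-Python `step` path and the other the fused-kernel `fused_step` path (the hyper-parameters below are arbitrary):

    import numpy as np
    import mxnet as mx
    from mxnet.test_utils import compare_optimizer

    # three parameters of different shapes, each updated once by each optimizer,
    # after which weights and optimizer states are compared element-wise
    shapes = [(3, 4, 5), (10, 4), (7,)]
    opt1 = mx.optimizer.SGD(momentum=0.9, use_fused_step=False)  # pure-Python step()
    opt2 = mx.optimizer.SGD(momentum=0.9, use_fused_step=True)   # fused-kernel fused_step()
    compare_optimizer(opt1, opt2, shapes, np.float32)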
diff --git a/src/operator/contrib/optimizer_op-inl.h b/src/operator/contrib/optimizer_op-inl.h index fd556a4231cb..2276b9375012 100644 --- a/src/operator/contrib/optimizer_op-inl.h +++ b/src/operator/contrib/optimizer_op-inl.h @@ -130,7 +130,7 @@ template struct GroupAdagradDnsRspKernel { // clang-format off const DType grad_rescaled = get_grad_rescaled(j); index_t data_j = get_data_j(j); - const DType div = lr * grad_rescaled / square_root::Map(state_data[grad_idx[i]] + eps); + const DType div = lr * grad_rescaled / (square_root::Map(state_data[grad_idx[i]]) + eps); out_data[data_j] = weight_data[data_j] - div; // clang-format on } diff --git a/src/operator/contrib/optimizer_op.cc b/src/operator/contrib/optimizer_op.cc index 83bbcdab833d..be6d30587368 100644 --- a/src/operator/contrib/optimizer_op.cc +++ b/src/operator/contrib/optimizer_op.cc @@ -61,7 +61,7 @@ Updates are applied by:: grad = clip(grad * rescale_grad, clip_gradient) history += mean(square(grad), axis=1, keepdims=True) - div = grad / sqrt(history + float_stable_eps) + div = grad / (sqrt(history) + epsilon) weight -= div * lr Weights are updated lazily if the gradient is sparse. diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 2df574c46909..b7dea1015bdb 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -231,23 +231,18 @@ struct MultiSGDKernel { if ((size_t)i < param.sizes[index]) { MPDType w = has_mixed_precision ? param.weights32[index][i] : MPDType(param.weights[index][i]); - MPDType mom = has_momentum ? param.mom[index][i] : MPDType(0); + MPDType rescale_grad = param.rescale_grad * static_cast(param.grads[index][i]); if (param.clip_gradient >= 0.0f) { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index] - *mshadow_op::clip::Map(param.rescale_grad * - static_cast(param.grads[index][i]), - param.clip_gradient); - } else { - mom = param.momentum*mom - - param.lrs[index]*param.wds[index]*w - - param.lrs[index]*param.rescale_grad*static_cast(param.grads[index][i]); + rescale_grad = mshadow_op::clip::Map(rescale_grad, param.clip_gradient); } + rescale_grad += param.wds[index] * w; if (has_momentum) { - param.mom[index][i] = mom; + param.mom[index][i] *= param.momentum; + param.mom[index][i] -= param.lrs[index] * rescale_grad; + w = w + param.mom[index][i]; + } else { + w -= param.lrs[index] * rescale_grad; } - w = w + mom; if (has_mixed_precision) { param.weights32[index][i] = w; } @@ -385,16 +380,12 @@ struct SGDKernel { const DType* grad_data, const DType param_clip_gradient, const DType param_lr, const DType param_wd, const DType param_rescale_grad, const OpReqType req) { + DType rescale_grad = param_rescale_grad * grad_data[i]; if (param_clip_gradient >= 0.0f) { - KERNEL_ASSIGN(out_data[i], req, - (1.f-param_lr*param_wd)*weight_data[i] - - (param_lr) - * mshadow_op::clip::Map(param_rescale_grad*grad_data[i], param_clip_gradient)); - } else { - KERNEL_ASSIGN(out_data[i], req, - (1.f-param_lr*param_wd)*weight_data[i] - - (param_lr*param_rescale_grad)*grad_data[i]); + rescale_grad = mshadow_op::clip::Map(rescale_grad, param_clip_gradient); } + rescale_grad += param_wd * weight_data[i]; + KERNEL_ASSIGN(out_data[i], req, weight_data[i] - (param_lr * rescale_grad)); } }; @@ -439,13 +430,12 @@ struct SGDDnsRspKernel { const dim_t col_id = i % row_length; const dim_t row_offset = grad_idx[row_id] * row_length; const dim_t data_i = row_offset + col_id; + DType grad_rescaled = rescale_grad * grad_val[i]; if (clip_gradient >= 
0.0f) { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr) * mshadow_op::clip::Map(rescale_grad * grad_val[i], clip_gradient)); - } else { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr * rescale_grad) * grad_val[i]); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight[data_i]; + KERNEL_ASSIGN(out[data_i], req, weight[data_i] - (lr * grad_rescaled)); } }; @@ -464,13 +454,12 @@ struct SGDDnsRspKernel { for (index_t j = 0; j < row_length; j++) { index_t data_i = grad_idx[i] * row_length + j; index_t grad_i = i * row_length + j; + DType grad_rescaled = rescale_grad * grad_val[grad_i]; if (clip_gradient >= 0.0f) { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr) * mshadow_op::clip::Map(rescale_grad * grad_val[grad_i], clip_gradient)); - } else { - KERNEL_ASSIGN(out[data_i], req, (1.f - lr * wd) * weight[data_i] - - (lr * rescale_grad) * grad_val[grad_i]); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight[data_i]; + KERNEL_ASSIGN(out[data_i], req, weight[data_i] - (lr * grad_rescaled)); } } }; @@ -505,7 +494,7 @@ inline void SGDUpdateDnsRspImpl(const SGDParam& param, // apply standard weight decay if not lazy update if (!param.lazy_update) { Kernel, xpu>::Launch(s, weight.Size(), - weight_data, weight_data, static_cast(1 - param.lr * param.wd)); + weight_data, weight_data, static_cast(1 - param.lr * param.wd)); wd = 0; } if (!grad.storage_initialized()) return; @@ -604,16 +593,13 @@ struct SGDMomKernel { const DType* grad_data, const DType param_clip_gradient, const DType param_momentum, const DType param_lr, const DType param_wd, const DType param_rescale_grad, const OpReqType req) { + DType rescale_grad = param_rescale_grad * grad_data[i]; if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i] - - param_lr*param_wd*weight_data[i] - - param_lr - *mshadow_op::clip::Map(param_rescale_grad*grad_data[i], param_clip_gradient); - } else { - mom_data[i] = param_momentum*mom_data[i] - - param_lr*param_wd*weight_data[i] - - param_lr*param_rescale_grad*grad_data[i]; + rescale_grad = mshadow_op::clip::Map(rescale_grad, param_clip_gradient); } + rescale_grad += param_wd * weight_data[i]; + mom_data[i] *= param_momentum; + mom_data[i] -= param_lr * rescale_grad; KERNEL_ASSIGN(out_data[i], req, weight_data[i] + mom_data[i]); } }; @@ -658,20 +644,15 @@ struct MP_SGDKernel { const DType* grad_data, float* weight32, const float param_clip_gradient, const float param_lr, const float param_wd, const float param_rescale_grad, const OpReqType req) { + float w = weight32[i]; + float rescale_grad = param_rescale_grad * static_cast(grad_data[i]); if (param_clip_gradient >= 0.0f) { - float w = weight32[i]; - w = (1.f - param_lr*param_wd)*w - - (param_lr) * mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, (DType)w); - } else { - float w = weight32[i]; - w = (1.f-param_lr*param_wd)*w - - (param_lr*param_rescale_grad)*static_cast(grad_data[i]); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, (DType)w); + rescale_grad = mshadow_op::clip::Map(rescale_grad, param_clip_gradient); } + rescale_grad += param_wd * w; + w -= param_lr * rescale_grad; + weight32[i] = w; + KERNEL_ASSIGN(out_data[i], req, (DType)w); } }; @@ -704,17 +685,13 @@ struct MP_SGDMomKernel { const float param_wd, const float param_rescale_grad, const 
OpReqType req) { float w = weight32[i]; float mom = mom_data[i]; + float grad_rescaled = param_rescale_grad*static_cast(grad_data[i]); if (param_clip_gradient >= 0.0f) { - mom = param_momentum*mom - - param_lr*param_wd*w - - param_lr - *mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient); - } else { - mom = param_momentum*mom - - param_lr*param_wd*w - - param_lr*param_rescale_grad*static_cast(grad_data[i]); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, param_clip_gradient); } + grad_rescaled += param_wd * w; + mom *= param_momentum; + mom -= param_lr * grad_rescaled; mom_data[i] = mom; w = w + mom; weight32[i] = w; @@ -753,21 +730,16 @@ struct SGDMomDnsRspDnsKernel { DType* mom_data, const DType* weight_data, const IType* grad_idx, const DType* grad_data, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { - const DType rate = lr * wd; for (index_t j = 0; j < row_length; j++) { index_t data_i = grad_idx[i] * row_length + j; index_t grad_i = i * row_length + j; + DType grad_rescaled = rescale_grad * grad_data[grad_i]; if (clip_gradient >= 0.0f) { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * - mshadow_op::clip::Map(rescale_grad * grad_data[grad_i], - clip_gradient); - } else { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * rescale_grad * grad_data[grad_i]; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[data_i]; + mom_data[data_i] *= momentum; + mom_data[data_i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]); } } @@ -781,21 +753,16 @@ struct SGDMomDnsRspDnsKernel { const DType* grad_data, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { using nnvm::dim_t; - const DType rate = lr * wd; const dim_t row_id = i / row_length; const dim_t col_id = i % row_length; const dim_t data_i = grad_idx[row_id] * row_length + col_id; + DType grad_rescaled = rescale_grad * grad_data[i]; if (clip_gradient >= 0.0f) { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * - mshadow_op::clip::Map(rescale_grad * grad_data[i], - clip_gradient); - } else { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * rescale_grad * grad_data[i]; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[data_i]; + mom_data[data_i] *= momentum; + mom_data[data_i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]); } }; @@ -1065,20 +1032,15 @@ struct NAGMomKernel { const DType param_clip_gradient, const DType param_momentum, const DType param_lr, const DType param_wd, const DType param_rescale_grad, const OpReqType req) { + DType grad_rescaled = param_rescale_grad * grad_data[i]; if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i]; - KERNEL_ASSIGN(out_data[i], req, weight_data[i]-mom_data[i]+(param_momentum+1) - *(mom_data[i]-(param_lr*(mshadow_op::clip::Map(param_rescale_grad - *grad_data[i], param_clip_gradient)+(param_wd*weight_data[i]))))); - mom_data[i] = mom_data[i] - (param_lr*((mshadow_op::clip::Map(param_rescale_grad*grad_data[i], - param_clip_gradient))+(param_wd*weight_data[i]))); - } else { - mom_data[i] = param_momentum*mom_data[i]; - KERNEL_ASSIGN(out_data[i], 
req, weight_data[i]-mom_data[i]+(param_momentum+1) - *(mom_data[i]-(param_lr*(param_rescale_grad*grad_data[i]+param_wd*weight_data[i])))); - mom_data[i] = mom_data[i] - param_lr*((param_rescale_grad*grad_data[i]) - +(param_wd*weight_data[i])); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, param_clip_gradient); } + grad_rescaled += param_wd * weight_data[i]; + mom_data[i] *= param_momentum; + mom_data[i] -= param_lr * grad_rescaled; + KERNEL_ASSIGN(out_data[i], req, weight_data[i] + (param_momentum * mom_data[i]) + - (param_lr * grad_rescaled)); } }; @@ -1115,25 +1077,16 @@ struct MP_NAGMomKernel { const float param_wd, const float param_rescale_grad, const OpReqType req) { float w = weight32[i]; + float grad_rescaled = param_rescale_grad * static_cast(grad_data[i]); if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i]; - w = w-mom_data[i]+(param_momentum+1)*(mom_data[i]-param_lr - *(mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient)+(param_wd*w))); - mom_data[i] = mom_data[i] - param_lr - *((mshadow_op::clip::Map(param_rescale_grad*static_cast(grad_data[i]), - param_clip_gradient))+(param_wd*w)); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, w); - } else { - mom_data[i] = param_momentum*mom_data[i]; - w = w-mom_data[i]+(param_momentum+1)*(mom_data[i]-param_lr - *(param_rescale_grad*static_cast(grad_data[i])+(param_wd*w))); - mom_data[i] = mom_data[i] - param_lr - *((param_rescale_grad*static_cast(grad_data[i]))+(param_wd*w)); - weight32[i] = w; - KERNEL_ASSIGN(out_data[i], req, w); + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, param_clip_gradient); } + grad_rescaled += param_wd * w; + mom_data[i] *= param_momentum; + mom_data[i] -= param_lr * grad_rescaled; + w += (param_momentum * mom_data[i]) - (param_lr * grad_rescaled); + weight32[i] = w; + KERNEL_ASSIGN(out_data[i], req, w); } }; @@ -1211,7 +1164,7 @@ struct FTMLKernel { const OpReqType req) { using namespace mshadow_op; const DType grad_i = clip_grad >= 0.0f - ? clip::Map(rescale_grad * grad[i] + wd * weight[i], clip_grad) + ? 
clip::Map(rescale_grad * grad[i], clip_grad) + wd * weight[i] : (rescale_grad * grad[i] + wd * weight[i]); v[i] = beta2 * v[i] + (1 - beta2) * square::Map(grad_i); const DType d_t = (1 - power::Map(beta1, t)) / lr * @@ -1299,10 +1252,11 @@ struct AdamUpdateKernel { const DType epsilon, const OpReqType req) { using namespace mshadow_op; - DType grad_rescaled = grad_data[i] * rescale_grad + weight_data[i] * wd; + DType grad_rescaled = grad_data[i] * rescale_grad; if (clip_gradient >= 0.f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[i] * wd; mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; var_data[i] = beta2 * var_data[i] + @@ -1362,17 +1316,13 @@ struct AdamDnsRspDnsKernel { const dim_t data_i = row_offset + j; // index in grad const dim_t grad_i = i * row_length + j; - const DType grad_rescaled = grad_data[grad_i] * rescale_grad + weight_data[data_i] * wd; + DType grad_rescaled = grad_data[grad_i] * rescale_grad; if (clip_gradient >= 0.0f) { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * - clip::Map(grad_rescaled, clip_gradient); - var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * square::Map( - clip::Map(grad_rescaled, clip_gradient)); - } else { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; - var_data[data_i] = beta2 * var_data[data_i] + - (1.f - beta2) * grad_rescaled * grad_rescaled; + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[data_i] * wd; + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; + var_data[data_i] = beta2 * var_data[data_i] +(1.f - beta2) * grad_rescaled * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] - lr * mean_data[data_i] / (square_root::Map(var_data[data_i]) + epsilon)); } @@ -1395,10 +1345,11 @@ struct AdamDnsRspDnsKernel { // index in data/mean/var const dim_t data_i = row_offset + col_id; // index in grad - DType grad_rescaled = grad_data[i] * rescale_grad + weight_data[data_i] * wd; + DType grad_rescaled = grad_data[i] * rescale_grad; if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[data_i] * wd; mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * grad_rescaled * grad_rescaled; @@ -1914,8 +1865,8 @@ inline void MPLambUpdatePhaseTwo(const nnvm::NodeAttrs& attrs, // by Alex Graves, 2013. 
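The kernel rewrites in this file all converge on a single preprocessing order for the gradient: rescale, then clip, then add the weight-decay term. A small NumPy sketch with made-up numbers shows why the order matters once clipping actually triggers:

    import numpy as np

    grad, weight = 5.0, 2.0
    rescale_grad, clip_gradient, wd = 1.0, 1.0, 0.1

    # previous order (e.g. old FTML/Adam kernels): weight decay was folded in
    # before clipping, so its contribution could be clipped away entirely
    old = np.clip(rescale_grad * grad + wd * weight, -clip_gradient, clip_gradient)  # 1.0
    # new order: clip the rescaled gradient first, then add weight decay
    new = np.clip(rescale_grad * grad, -clip_gradient, clip_gradient) + wd * weight  # 1.2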
struct RMSPropAlexParam : public dmlc::Parameter { float lr; - float gamma1; - float gamma2; + float rho; + float momentum; float epsilon; float wd; float rescale_grad; @@ -1924,9 +1875,9 @@ struct RMSPropAlexParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(RMSPropAlexParam) { DMLC_DECLARE_FIELD(lr) .describe("Learning rate"); - DMLC_DECLARE_FIELD(gamma1).set_default(0.95f) + DMLC_DECLARE_FIELD(rho).set_default(0.95f) .describe("Decay rate."); - DMLC_DECLARE_FIELD(gamma2).set_default(0.9f) + DMLC_DECLARE_FIELD(momentum).set_default(0.9f) .describe("Decay rate."); DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f) .describe("A small constant for numerical stability."); @@ -1956,25 +1907,26 @@ struct RMSPropAlexUpdateKernel { DType* state_n_data, DType* state_g_data, DType* delta_data, const DType* weight_data, const DType* grad_data, const DType clip_gradient, const DType rescale_grad, - const DType gamma1, const DType gamma2, + const DType rho, const DType momentum, const DType lr, const DType wd, const DType clip_weights, const DType epsilon, const OpReqType req) { using namespace mshadow_op; - DType grad_rescaled = rescale_grad * grad_data[i] + wd * weight_data[i]; + DType grad_rescaled = rescale_grad * grad_data[i]; if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[i]; - state_n_data[i] = (1.f - gamma1) * grad_rescaled * grad_rescaled + - gamma1 * state_n_data[i]; - state_g_data[i] = (1.f - gamma1) * grad_rescaled + - gamma1 * state_g_data[i]; - delta_data[i] = gamma2 * delta_data[i] - + state_n_data[i] = (1.f - rho) * square::Map(grad_rescaled) + + rho * state_n_data[i]; + state_g_data[i] = (1.f - rho) * grad_rescaled + + rho * state_g_data[i]; + delta_data[i] = momentum * delta_data[i] - (lr * (grad_rescaled) / (square_root::Map(state_n_data[i] - - state_g_data[i] * state_g_data[i] + epsilon))); + square::Map(state_g_data[i]) + epsilon))); if (clip_weights >= 0.0f) { const DType clipped_weight = clip::Map(weight_data[i] + delta_data[i], clip_weights); @@ -1997,15 +1949,15 @@ inline void RMSPropAlexUpdate(const nnvm::NodeAttrs &attrs, MSHADOW_REAL_TYPE_SWITCH(inputs[0].type_flag_, DType, { DType* weight_data = inputs[0].dptr(); DType* grad_data = inputs[1].dptr(); - DType* state_n_data = inputs[2].dptr(); - DType* state_g_data = inputs[3].dptr(); + DType* state_g_data = inputs[2].dptr(); + DType* state_n_data = inputs[3].dptr(); DType* delta_data = inputs[4].dptr(); DType* out_data = outputs[0].dptr(); Kernel::Launch(s, inputs[0].shape_.Size(), out_data, state_n_data, state_g_data, delta_data, weight_data, grad_data, static_cast(param.clip_gradient), static_cast(param.rescale_grad), - static_cast(param.gamma1), static_cast(param.gamma2), + static_cast(param.rho), static_cast(param.momentum), static_cast(param.lr), static_cast(param.wd), static_cast(param.clip_weights), static_cast(param.epsilon), req[0]); }); @@ -2016,7 +1968,7 @@ inline void RMSPropAlexUpdate(const nnvm::NodeAttrs &attrs, // by Tieleman & Hinton, 2012 struct RMSPropParam : public dmlc::Parameter { float lr; - float gamma1; + float rho; float epsilon; float wd; float rescale_grad; @@ -2025,7 +1977,7 @@ struct RMSPropParam : public dmlc::Parameter { DMLC_DECLARE_PARAMETER(RMSPropParam) { DMLC_DECLARE_FIELD(lr) .describe("Learning rate"); - DMLC_DECLARE_FIELD(gamma1).set_default(0.95f) + DMLC_DECLARE_FIELD(rho).set_default(0.95f) .describe("The decay rate of momentum estimates."); DMLC_DECLARE_FIELD(epsilon).set_default(1e-8f) .describe("A small 
constant for numerical stability."); @@ -2055,20 +2007,21 @@ struct RMSPropUpdateKernel { DType* out_data, DType* state_n_data, const DType* weight_data, const DType* grad_data, const DType clip_gradient, const DType rescale_grad, - const DType gamma1, const DType lr, const DType wd, + const DType rho, const DType lr, const DType wd, const DType clip_weights, const DType epsilon, const OpReqType req) { using namespace mshadow_op; - DType grad_rescaled = rescale_grad * grad_data[i] + wd * weight_data[i]; + DType grad_rescaled = rescale_grad * grad_data[i]; if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[i]; - state_n_data[i] = (1.f - gamma1) * (grad_rescaled * grad_rescaled) + gamma1 * state_n_data[i]; + state_n_data[i] = (1.f - rho) * square::Map(grad_rescaled) + rho * state_n_data[i]; DType weight = weight_data[i] - - lr * (grad_rescaled / square_root::Map(state_n_data[i] + epsilon)); + lr * (grad_rescaled) / (square_root::Map(state_n_data[i]) + epsilon); if (clip_weights >= 0.0f) { weight = clip::Map(weight, clip_weights); } @@ -2093,7 +2046,7 @@ inline void RMSPropUpdate(const nnvm::NodeAttrs &attrs, const OpContext &ctx, Kernel::Launch(s, inputs[0].shape_.Size(), out_data, state_n_data, weight_data, grad_data, static_cast(param.clip_gradient), static_cast(param.rescale_grad), - static_cast(param.gamma1), static_cast(param.lr), static_cast(param.wd), + static_cast(param.rho), static_cast(param.lr), static_cast(param.wd), static_cast(param.clip_weights), static_cast(param.epsilon), req[0]); }); } @@ -2150,10 +2103,9 @@ struct FtrlUpdateKernel { weight_data[i] / lr; n_data[i] += square::Map(grad_rescaled); - KERNEL_ASSIGN(out_data[i], req, - (sign::Map(z_data[i]) * lamda1 - z_data[i]) / - ((beta + square_root::Map(n_data[i])) / lr + wd) * - gt::Map(abs::Map(z_data[i]), lamda1)); + DType d = - sign::Map(z_data[i]) * maximum::Map(abs::Map(z_data[i]) - lamda1, + static_cast(0)); + KERNEL_ASSIGN(out_data[i], req, d / ((beta + square_root::Map(n_data[i])) / lr + wd)); } }; @@ -2197,23 +2149,18 @@ struct FtrlDnsRspDnsKernel { const dim_t data_i = row_offset + j; // index in grad const dim_t grad_i = i * row_length + j; - const DType grad_rescaled = grad_data[grad_i] * rescale_grad; + DType grad_rescaled = grad_data[grad_i] * rescale_grad; if (clip_gradient >= 0.0f) { - z_data[data_i] += clip::Map(grad_rescaled, clip_gradient) - - (square_root::Map(n_data[data_i] + - square::Map(clip::Map(grad_rescaled, clip_gradient))) - - square_root::Map(n_data[data_i])) * weight_data[data_i] / lr; - n_data[data_i] += square::Map(clip::Map(grad_rescaled, clip_gradient)); - } else { - z_data[data_i] += grad_rescaled - (square_root::Map(n_data[data_i] + - square::Map(grad_rescaled)) - square_root::Map(n_data[data_i])) * - weight_data[data_i] / lr; - n_data[data_i] += square::Map(grad_rescaled); + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } - KERNEL_ASSIGN(out_data[data_i], req, - (sign::Map(z_data[data_i]) * lamda1 - z_data[data_i]) / - ((beta + square_root::Map(n_data[data_i])) / lr + wd) * - gt::Map(abs::Map(z_data[data_i]), lamda1)); + z_data[data_i] += grad_rescaled - (square_root::Map(n_data[data_i] + + square::Map(grad_rescaled)) - square_root::Map(n_data[data_i])) * + weight_data[data_i] / lr; + n_data[data_i] += square::Map(grad_rescaled); + + DType d = - sign::Map(z_data[data_i]) * maximum::Map(abs::Map(z_data[data_i]) - lamda1, + static_cast(0)); + KERNEL_ASSIGN(out_data[data_i], req, d / ((beta + 
square_root::Map(n_data[data_i])) / lr + wd)); } } }; @@ -2523,7 +2470,7 @@ struct AdagradDnsRspDnsKernel { } const DType grad_squared = grad_rescaled * grad_rescaled; state_data[data_j] += grad_squared; - const DType div = grad_rescaled / square_root::Map(state_data[data_j] + epsilon); + const DType div = grad_rescaled / (square_root::Map(state_data[data_j]) + epsilon); // No need to use KERNEL_ASSIGN, as we already checked req is kWriteInplace out_data[data_j] = weight_data[data_j] - div * lr; } @@ -2548,7 +2495,7 @@ struct AdagradDnsRspDnsKernel { } const DType grad_squared = grad_rescaled * grad_rescaled; state_data[data_i] += grad_squared; - const DType div = grad_rescaled / square_root::Map(state_data[data_i] + epsilon); + const DType div = grad_rescaled / (square_root::Map(state_data[data_i]) + epsilon); // No need to use KERNEL_ASSIGN, as we already checked req is kWriteInplace out_data[data_i] = weight_data[data_i] - div * lr; } diff --git a/src/operator/optimizer_op.cc b/src/operator/optimizer_op.cc index 93e1267cc8c7..2ac3673e4a09 100644 --- a/src/operator/optimizer_op.cc +++ b/src/operator/optimizer_op.cc @@ -112,7 +112,6 @@ struct SGDMomStdDnsRspDnsKernel { DType* mom_data, const DType* weight_data, const IType* grad_idx, const DType* grad_data, const RType* prefix_sum, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { - const DType rate = lr * wd; const bool non_zero = (i == 0) ? prefix_sum[0] > 0 : prefix_sum[i] > prefix_sum[i-1]; @@ -122,17 +121,13 @@ struct SGDMomStdDnsRspDnsKernel { const index_t data_i = row_i + j; const DType grad = non_zero ? grad_data[grad_i + j] : static_cast(0); + DType grad_rescaled = rescale_grad * grad; if (clip_gradient >= 0.0f) { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * - mshadow_op::clip::Map(rescale_grad * grad, - clip_gradient); - } else { - mom_data[data_i] = momentum * mom_data[data_i] - - rate * weight_data[data_i] - - lr * rescale_grad * grad; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[data_i]; + mom_data[data_i] *= momentum; + mom_data[data_i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] + mom_data[data_i]); } } @@ -208,20 +203,16 @@ struct AdamStdDnsRspDnsKernel { const RType grad_i = (prefix_sum[i]-1) * row_length; for (index_t j = 0; j < row_length; j++) { const index_t data_i = row_i + j; - const DType grad_rescaled = non_zero ? static_cast( - grad_data[grad_i + j] * rescale_grad + - weight_data[data_i] * wd) - : static_cast(weight_data[data_i] * wd); + DType grad_rescaled = non_zero ? 
static_cast( + grad_data[grad_i + j] * rescale_grad) + : static_cast(0); if (clip_gradient >= 0.0f) { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * - clip::Map(grad_rescaled, clip_gradient); - var_data[data_i] = beta2 * var_data[data_i] + (1.f - beta2) * square::Map( - clip::Map(grad_rescaled, clip_gradient)); - } else { - mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; - var_data[data_i] = beta2 * var_data[data_i] + - (1.f - beta2) * square::Map(grad_rescaled); + grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[data_i] * wd; + mean_data[data_i] = beta1 * mean_data[data_i] + (1.f - beta1) * grad_rescaled; + var_data[data_i] = beta2 * var_data[data_i] + + (1.f - beta2) * square::Map(grad_rescaled); KERNEL_ASSIGN(out_data[data_i], req, weight_data[data_i] - lr * mean_data[data_i] / (square_root::Map(var_data[data_i]) + epsilon)); } @@ -780,7 +771,7 @@ gradient and :math:`E[g^2]_t` is the decaying average over past squared gradient The :math:`E[g^2]_t` is given by: .. math:: - E[g^2]_t = \gamma * E[g^2]_{t-1} + (1-\gamma) * g_t^2 + E[g^2]_t = \rho * E[g^2]_{t-1} + (1-\rho) * g_t^2 The update step is @@ -791,7 +782,7 @@ The RMSProp code follows the version in http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf Tieleman & Hinton, 2012. -Hinton suggests the momentum term :math:`\gamma` to be 0.9 and the learning rate +Hinton suggests the momentum term :math:`\rho` to be 0.9 and the learning rate :math:`\eta` to be 0.001. )code" ADD_FILELINE) @@ -819,19 +810,19 @@ Define :math:`E[g^2]_t` is the decaying average over past squared gradient and :math:`E[g]_t` is the decaying average over past gradient. .. math:: - E[g^2]_t = \gamma_1 * E[g^2]_{t-1} + (1 - \gamma_1) * g_t^2\\ - E[g]_t = \gamma_1 * E[g]_{t-1} + (1 - \gamma_1) * g_t\\ - \Delta_t = \gamma_2 * \Delta_{t-1} - \frac{\eta}{\sqrt{E[g^2]_t - E[g]_t^2 + \epsilon}} g_t\\ + E[g^2]_t = \rho * E[g^2]_{t-1} + (1 - \rho) * g_t^2\\ + E[g]_t = \rho * E[g]_{t-1} + (1 - \rho) * g_t\\ + momentum_t = \gamma * momentum_{t-1} - \frac{\eta}{\sqrt{E[g^2]_t - E[g]_t^2 + \epsilon}} g_t\\ The update step is .. math:: - \theta_{t+1} = \theta_t + \Delta_t + \theta_{t+1} = \theta_t + momentum_t The RMSPropAlex code follows the version in http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. -Graves suggests the momentum term :math:`\gamma_1` to be 0.95, :math:`\gamma_2` +Graves suggests the momentum term :math:`\rho` to be 0.95, :math:`\gamma` to be 0.9 and the learning rate :math:`\eta` to be 0.0001. )code" ADD_FILELINE) .set_num_inputs(5) diff --git a/src/operator/optimizer_op.cu b/src/operator/optimizer_op.cu index 6920cb06e482..b67fbf890cbe 100644 --- a/src/operator/optimizer_op.cu +++ b/src/operator/optimizer_op.cu @@ -37,7 +37,6 @@ struct SGDMomStdDnsRspDnsKernel { const DType* grad_data, const RType* prefix_sum, const DType clip_gradient, const DType momentum, const DType lr, const DType wd, const DType rescale_grad) { using nnvm::dim_t; - const DType rate = lr * wd; const dim_t row_id = i / row_length; const dim_t col_id = i % row_length; const dim_t nnr = prefix_sum[row_id]; @@ -46,14 +45,13 @@ struct SGDMomStdDnsRspDnsKernel { const RType grad_i = (nnr - 1) * row_length + col_id; const DType grad = non_zero ? 
grad_data[grad_i] : static_cast(0); + DType grad_rescaled = rescale_grad * grad; if (clip_gradient >= 0.0f) { - mom_data[i] = momentum * mom_data[i] - - rate * weight_data[i] - - lr * mshadow_op::clip::Map(rescale_grad * grad, clip_gradient); - } else { - mom_data[i] = momentum * mom_data[i] - - rate * weight_data[i] - lr * rescale_grad * grad; + grad_rescaled = mshadow_op::clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += wd * weight_data[i]; + mom_data[i] *= momentum; + mom_data[i] -= lr * grad_rescaled; KERNEL_ASSIGN(out_data[i], req, weight_data[i] + mom_data[i]); } }; @@ -139,12 +137,12 @@ struct AdamStdDnsRspDnsKernel { const bool non_zero = (row_id == 0) ? prefix_sum[0] > 0 : prefix_sum[row_id] > prefix_sum[row_id - 1]; const RType grad_offset = (prefix_sum[row_id] - 1) * row_length + col_id; - DType grad_rescaled = non_zero ? static_cast(grad_data[grad_offset] * rescale_grad - + weight_data[i] * wd) - : static_cast(weight_data[i] * wd); + DType grad_rescaled = non_zero ? static_cast(grad_data[grad_offset] * rescale_grad) + : static_cast(0); if (clip_gradient >= 0.0f) { grad_rescaled = clip::Map(grad_rescaled, clip_gradient); } + grad_rescaled += weight_data[i] * wd; mean_data[i] = beta1 * mean_data[i] + (1.f - beta1) * grad_rescaled; var_data[i] = beta2 * var_data[i] + (1.f - beta2) * square::Map(grad_rescaled); diff --git a/src/optimizer/sgd-inl.h b/src/optimizer/sgd-inl.h index 12738f8e4053..00afe2ad079a 100644 --- a/src/optimizer/sgd-inl.h +++ b/src/optimizer/sgd-inl.h @@ -82,7 +82,7 @@ void sgd_mom_update(RunContext ctx, TBlob weight, const TBlob grad, TBlob mom, Tensor grad2d = grad.FlatTo2D(s); if (param.clip_gradient > 0.0f) { mom2d = param.momentum*mom2d - - lr*(param.rescale_grad*F(grad2d, param.clip_gradient) + wd*weight2d); + lr*(F(param.rescale_grad * grad2d, param.clip_gradient) + wd*weight2d); } else { mom2d = param.momentum*mom2d - lr*(param.rescale_grad*grad2d + wd*weight2d); } @@ -98,7 +98,7 @@ void sgd_update(RunContext ctx, TBlob weight, const TBlob grad, Tensor weight2d = weight.FlatTo2D(s); Tensor grad2d = grad.FlatTo2D(s); if (param.clip_gradient >= 0.0f) { - weight2d -= lr*(param.rescale_grad*F(grad2d, param.clip_gradient) + + weight2d -= lr*(F(param.rescale_grad * grad2d, param.clip_gradient) + wd*weight2d); } else { weight2d -= lr*(param.rescale_grad*grad2d + wd*weight2d); diff --git a/tests/python/unittest/test_contrib_optimizer.py b/tests/python/unittest/test_contrib_optimizer.py index 7cfd0217aa31..5f7c51f257b3 100644 --- a/tests/python/unittest/test_contrib_optimizer.py +++ b/tests/python/unittest/test_contrib_optimizer.py @@ -25,76 +25,39 @@ from common import with_seed -# * GroupAdaGrad -class PyGroupAdaGrad(mx.optimizer.Optimizer): - """The python reference of Group AdaGrad optimizer. - - Parameters - ---------- - eps: float, optional - Small value to avoid division by 0. 
- - """ - - def __init__(self, eps=1e-5, **kwargs): - super(PyGroupAdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps - - def create_state(self, index, weight): - assert len(weight.shape) == 2 - history = mx.nd.zeros( - (weight.shape[0], 1), weight.context, stype=weight.stype) - return history - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - assert wd == 0 - - history = state - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += mx.nd.mean(mx.nd.square(grad), axis=1, keepdims=True) - div = lr * grad / mx.nd.sqrt(history + self.float_stable_eps) - weight[:] -= div - - def test_group_adagrad(): mx.random.seed(0) - opt1 = PyGroupAdaGrad + opt1 = mx.optimizer.contrib.GroupAdaGrad opt2 = mx.optimizer.contrib.GroupAdaGrad - shape = (3, 4) - eps_options = [{}, {'eps': 1e-8}] + shapes = [(3, 4), [5, 6]] + eps_options = [{}, {'epsilon': 1e-8}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float32]: - for options in itertools.product(eps_options, cg_options, rg_options): + for options in itertools.product(eps_options, cg_options, rg_options, agg_options): kwarg = dict(wd=0.0) for option in options: kwarg.update(option) compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, - dtype, - compare_states=False) + opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, + dtype) compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, + opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, w_stype='row_sparse', - g_stype='row_sparse', - compare_states=False) + g_stype='row_sparse') compare_optimizer( - opt1(**kwarg), - opt2(**kwarg), - shape, + opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, - g_stype='row_sparse', - compare_states=False) + g_stype='row_sparse') @with_seed() diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 6d7cf40f29f7..6137fd9d65df 100755 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -72,145 +72,51 @@ def test_lr_wd_mult(): args2 = {k: v.asnumpy() for k, v in args2.items()} assert mod._optimizer.lr_mult == {'fc1_bias': 1.0, 'fc1_weight': 0.0} - assert mod._optimizer.wd_mult == {'fc2_bias': 0.5, 'fc2_weight': 0.5, 'fc1_bias': 0.0} + assert mod._optimizer.wd_mult == {'fc2_bias': 0.5, 'fc2_weight': 0.5} assert mx.test_utils.almost_equal(args1['fc1_weight'], args2['fc1_weight'], 1e-10) assert not mx.test_utils.almost_equal(args1['fc1_bias'], args2['fc1_bias'], 1e-1) assert not mx.test_utils.almost_equal(args1['fc2_weight'], args2['fc2_weight'], 1e-1) -# SGD - -class PySGD(mx.optimizer.Optimizer): - """python reference implemenation of sgd""" - def __init__(self, learning_rate=0.01, momentum=0.0, multi_precision=False, **kwargs): - super(PySGD, self).__init__(learning_rate=learning_rate, **kwargs) - self.momentum = momentum - self.multi_precision = multi_precision - - def create_state(self, index, weight): - """Create additional optimizer state: momentum - - Parameters - ---------- - weight : NDArray - The weight data - - """ - momentum = None - weight_master_copy = 
None - do_multi_precision = self.multi_precision and weight.dtype == np.float16 - if do_multi_precision: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=np.float32) - weight_master_copy = array(weight, ctx=weight.context, dtype=np.float32) - return (momentum, weight_master_copy) - else: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) - return momentum - - def create_state_multi_precision(self, index, weight): - return self.create_state(index, weight) - - def update(self, index, weight, grad, state): - """Update the parameters. - - Parameters - ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray - - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. - """ - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - use_multi_precision = isinstance(state, list) or isinstance(state, tuple) - - if not use_multi_precision: - if self.momentum == 0.0: - if self.clip_gradient is not None: - weight[:] = ((1 - lr*wd)*weight - - lr*mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - else: - weight[:] = (1 - lr*wd)*weight - lr*self.rescale_grad*grad - else: - mom = state - if self.clip_gradient is not None: - mom[:] = (self.momentum*mom - lr*wd*weight - - lr*mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - weight += mom - else: - mom[:] = self.momentum*mom - lr*wd*weight - lr*self.rescale_grad*grad - weight += mom - else: - grad32 = array(grad, ctx=grad.context, dtype=np.float32) - mom = state[0] - weight32 = state[1] - if self.momentum == 0.0: - if self.clip_gradient is not None: - weight32[:] = ((1 - lr*wd)*weight32 - - lr*mx.nd.clip(grad32*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - else: - weight32[:] = (1 - lr*wd)*weight32 - lr*self.rescale_grad*grad32 - else: - if self.clip_gradient is not None: - mom[:] = (self.momentum*mom - lr*wd*weight32 - - lr*mx.nd.clip(grad32*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - weight32 += mom - else: - mom[:] = self.momentum*mom - lr*wd*weight32 - lr*self.rescale_grad*grad32 - weight32 += mom - tmp = weight32.astype(weight.dtype) - tmp.copyto(weight) - - def update_multi_precision(self, index, weight, grad, state): - self.update(index, weight, grad, state) @with_seed() def test_sgd(): - opt1 = PySGD + opt1 = mx.optimizer.SGD opt2 = mx.optimizer.SGD - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - for dtype in [np.float16, np.float32, np.float64]: - for mom_option in mom_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(mom_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - if dtype == np.float16: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3) - else: - compare_optimizer(opt1(**kwarg), 
opt2(**kwarg), shape, dtype) - # test operator fallback on cpu - if dtype != np.float16: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape[:2], - dtype, w_stype='csr', g_stype='csr') + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + + for dtype in [np.float16, np.float32]: + for params in itertools.product(mom_options, cg_options, rg_options, + wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + if dtype == np.float16: + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-4) + else: + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype) + # test operator fallback on cpu + if dtype != np.float16: + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + [shapes[0][:2], shapes[1]], + dtype, w_stype='csr', g_stype='csr') + class PySparseSGD(mx.optimizer.Optimizer): """python reference implemenation of sgd""" - def __init__(self, learning_rate=0.01, momentum=0.0, **kwargs): + def __init__(self, learning_rate=0.1, momentum=0.0, **kwargs): super(PySparseSGD, self).__init__(learning_rate=learning_rate, **kwargs) self.momentum = momentum @@ -228,478 +134,240 @@ def create_state(self, index, weight): else: return mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) - def update(self, index, weight, grad, state): - """Update the parameters. + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. Parameters ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray - - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
""" - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - num_rows = weight.shape[0] - if self.momentum == 0.0: - # Update on a per row basis, skip all-zero rows - for row in range(num_rows): - grad_row = grad[row].asnumpy() - all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) - if all_zeros: - continue - if self.clip_gradient is not None: - weight[row] = ((1 - lr*wd)*weight[row] - - lr*mx.nd.clip(grad[row]*self.rescale_grad, - -self.clip_gradient, self.clip_gradient)) - else: - weight[row] = (1 - lr*wd)*weight[row] - lr*self.rescale_grad*grad[row] - else: - mom = state - for row in range(num_rows): - grad_row = grad[row].asnumpy() - all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) - if all_zeros: - continue - if self.clip_gradient is not None: - mom[row] = (self.momentum*mom[row] - lr*wd*weight[row] - - lr*mx.nd.clip(grad[row]*self.rescale_grad, -self.clip_gradient, self.clip_gradient)) - weight[row] += mom[row] - else: - mom[row] = self.momentum*mom[row] - lr*wd*weight[row] - lr*self.rescale_grad*grad[row] - weight[row] += mom[row] + for index, weight, grad, state in zip(indices, weights, grads, states): + lr = self._get_lr(index) + wd = self._get_wd(index) + self._update_count(index) + num_rows = weight.shape[0] + if self.momentum == 0.0: + # Update on a per row basis, skip all-zero rows + for row in range(num_rows): + grad_row = grad[row].asnumpy() + all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + grad[row] = mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient) + grad[row] += wd * weight[row] + weight[row] -= lr * grad[row] + else: + mom = state + for row in range(num_rows): + grad_row = grad[row].asnumpy() + all_zeros = mx.test_utils.almost_equal(grad_row, np.zeros_like(grad_row)) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + grad[row] = mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient) + grad[row] += wd * weight[row] + mom[row] *= self.momentum + mom[row] -= lr * grad[row] + weight[row] += mom[row] + @with_seed() def test_sparse_sgd(): opt1 = PySparseSGD opt2 = mx.optimizer.SGD - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float32]: - for mom_option in mom_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(mom_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - w_stype='row_sparse', g_stype='row_sparse') - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - w_stype='default', g_stype='row_sparse') + for params in itertools.product(mom_options, cg_options, rg_options, + wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + compare_optimizer(opt1(**kwarg), + opt2(use_fused_step=True, 
lazy_update=True, **kwarg), shapes, dtype, + w_stype='row_sparse', g_stype='row_sparse') + compare_optimizer(opt1(**kwarg), + opt2(use_fused_step=True, lazy_update=True, **kwarg), shapes, dtype, + w_stype='default', g_stype='row_sparse') @with_seed() def test_std_sparse_sgd(): - opt1 = PySGD + opt1 = mx.optimizer.SGD opt2 = mx.optimizer.SGD - shape = (3, 4, 5) - mom_options = [{'momentum': 0.0}, {'momentum': 0.9}] + shapes = [(3, 4, 5), (10, 4), (7,)] + mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - for dtype in [np.float32]: - for mom_option in mom_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - kwarg = {} - kwarg.update(mom_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - compare_optimizer(opt1(**kwarg), opt2(lazy_update=False, **kwarg), shape, dtype, - w_stype='row_sparse', g_stype='row_sparse') - compare_optimizer(opt1(**kwarg), opt2(lazy_update=False, **kwarg), shape, dtype, - w_stype='default', g_stype='row_sparse') - - -class PyNAG(PySGD): - def __init__(self, **kwargs): - super(PyNAG, self).__init__(**kwargs) + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] - def create_state(self, index, weight): - """Create additional optimizer state: momentum - - Parameters - ---------- - weight : NDArray - The weight data - - """ - momentum = None - weight_master_copy = None - do_multi_precision = self.multi_precision and weight.dtype == np.float16 - if do_multi_precision: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=np.float32) - weight_master_copy = array(weight, ctx=weight.context, dtype=np.float32) - return (momentum, weight_master_copy) - else: - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype) - return momentum - - def create_state_multi_precision(self, index, weight): - return self.create_state(index, weight) - - def update(self, index, weight, grad, state): - """Update the parameters. - - Parameters - ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray + for dtype in [np.float32]: + for params in itertools.product(mom_options, cg_options, rg_options, + wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, dtype, + w_stype='row_sparse', g_stype='row_sparse') + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, dtype, + w_stype='default', g_stype='row_sparse') - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. 
- """ - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - use_multi_precision = isinstance(state, list) or isinstance(state, tuple) - if not use_multi_precision: - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - if self.momentum == 0.0: - weight[:] += -lr * (grad + wd * weight) - else: - mom = state - weight[:] += (self.momentum**2 * mom) - lr*(self.momentum + 1)*(grad + wd*weight) - mom[:] = (self.momentum*mom) - lr*(grad + wd*weight) - else: - grad32 = array(grad, ctx=grad.context, dtype=np.float32) - grad32 = grad32 * self.rescale_grad - if self.clip_gradient is not None: - grad32 = mx.nd.clip(grad32, -self.clip_gradient, self.clip_gradient) - mom = state[0] - weight32 = state[1] - if self.momentum == 0.0: - weight32[:] += -lr * (grad32 + wd * weight32) - else: - weight32[:] += (self.momentum**2 * mom) - lr*(self.momentum+1)*(grad32 + wd*weight32) - mom[:] = (self.momentum*mom) - lr*(grad32 + wd*weight32) - tmp = weight32.astype(weight.dtype) - tmp.copyto(weight) @with_seed() def test_nag(): - opt1 = PyNAG + opt1 = mx.optimizer.NAG opt2 = mx.optimizer.NAG - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] - for dtype in [np.float16, np.float32, np.float64]: + for dtype in [np.float16, np.float32]: for params in itertools.product(mom_options, cg_options, rg_options, - wd_options, mp_options): + wd_options, mp_options, agg_options): kwarg = {k: v for param in params for k, v in param.items()} if (dtype == np.float16 and ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): + not kwarg['multi_precision'])): continue - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3, atol=1e-4) + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-4) -# LAMB optimizer -class PyLAMB(mx.optimizer.Optimizer): - """ - Python reference implementation of LAMB optimizer. - """ - def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-6, - lower_bound=None, upper_bound=None, bias_correction=True, **kwargs): - super(PyLAMB, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - self.lower_bound = lower_bound - self.upper_bound = upper_bound - self.bias_correction = bias_correction - - def create_state(self, index, weight): - stype = weight.stype - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype), - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, stype=stype)) - - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - mean, var = state - grad *= self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - - mean[:] = self.beta1 * mean + (1. - self.beta1) * grad - var[:] = self.beta2 * var + (1. 
- self.beta2) * mx.nd.square(grad) - - mean_hat = mean - var_hat = var - r1 = weight.norm() - if self.lower_bound: - r1 = mx.nd.maximum(r1, self.lower_bound) - if self.upper_bound: - r1 = mx.nd.minimum(r1, self.upper_bound) - if self.bias_correction: - mean_hat = mean / (1. - mx.nd.power(self.beta1, t)) - var_hat = var / (1. - mx.nd.power(self.beta2, t)) - - g = mean_hat / (mx.nd.sqrt(var_hat) + self.epsilon) + wd * weight - - r2 = g.norm() - # calculate lamb_trust_ratio - r = 1. if r1 == 0. or r2 == 0. else r1 / r2 - lr *= r - # update weight - weight[:] -= lr * g +@with_seed() +def test_lars(): + opt1 = mx.optimizer.LARS + opt2 = mx.optimizer.LARS + shapes = [(3, 4, 5), (10, 4), (7,)] + eta_options = [{}, {'eta': 0.002}, {'eta': 0.01}] + mom_options = [{}, {'momentum': 0.0}, {'momentum': 0.9}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(eta_options, mom_options, cg_options, rg_options, + wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-3) @with_seed() def test_lamb(): - opt1 = PyLAMB + opt1 = mx.optimizer.LAMB opt2 = mx.optimizer.LAMB - shape = (3, 4, 5) + + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] bc_options = [{}, {'bias_correction': False}, {'bias_correction': True}] lb_options = [{}, {'lower_bound': None}, {'lower_bound': 1e-3}] ub_options = [{}, {'upper_bound': None}, {'upper_bound': 10}] - for params in itertools.product(cg_options, rg_options, wd_options, bc_options, lb_options, ub_options): - kwarg = {k: v for param in params for k, v in param.items()} - kwarg['multi_precision'] = False - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) - kwarg['multi_precision'] = True - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float16, rtol=1e-3, atol=1e-3) - -@with_seed() -def test_multilamb(): - opt1 = PyLAMB - opt2 = mx.optimizer.LAMB - - # shapes as Bert-large - dims_x = [1024, 4096, 1024, 1024] - dims_y = [1, 1, 1024, 4096] - dims_occurrences = [9, 1, 4, 2] - nlayers = 4 # 24 - # extra_dims_x=[30522, 512, 30522] - # extra_dims_y=[1, 1024, 1024] - shapes=[] - for l in range(nlayers): - for i, (dx,dy) in enumerate(zip(dims_x, dims_y)): - for j in range(dims_occurrences[i]): - shapes.append((dx,dy)) - # for dx,dy in zip(extra_dims_x, extra_dims_y): - # shapes.append((dx,dy)) - - cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] - rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - bias_options = [{'bias_correction': False}, {'bias_correction': True}] - - for dtype in 
[np.float16, np.float32, np.float64]: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for bias_option in bias_options: - kwarg = {} - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(bias_option) - if (dtype == np.float16): - kwarg.update({'multi_precision': True}) - atol = 1e-3 - rtol = 1e-6 - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype, - rtol=rtol, atol=atol, ntensors=len(shapes)) - -#SGLD -class PySGLD(mx.optimizer.Optimizer): - """python reference implementation of SGLD""" - - def __init__(self, **kwargs): - super(PySGLD, self).__init__(**kwargs) - - def create_state(self, index, weight): - return None - - def update(self, index, weight, grad, state): - assert(isinstance(weight, mx.nd.NDArray)) - assert(isinstance(grad, mx.nd.NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - weight[:] += - lr/2 * (grad + wd * weight) + mx.random.normal(0, math.sqrt(lr), shape=weight.shape, - dtype=weight.dtype, ctx=weight.context) - + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, rg_options, + wd_options, bc_options, lb_options, ub_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-3) @with_seed() def test_sgld(): - opt1 = PySGLD + opt1 = mx.optimizer.SGLD opt2 = mx.optimizer.SGLD - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] ns_options = [1234, 42] - cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - - - def compare_optimizer_noise_seeded(opt1, opt2, shape, dtype, noise_seed, - w_stype='default', g_stype='default', - rtol=1e-4, atol=1e-5, compare_states=True): - """Compare opt1 and opt2 with the added functionality that the seed for generating random noise - in the SGLD optimizer update is set so that the same noise is used in opt1 and opt2. 
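The essential trick in this helper is reseeding the RNG immediately before each optimizer's update so both implementations draw identical Gaussian noise; the relocated helper itself is not shown in this hunk. A minimal sketch of that seeding pattern, assuming the list-based `update()` call convention this refactor introduces and that `SGLD.create_state` simply returns an empty state (shapes and seed are made up):

    import mxnet as mx

    noise_seed = 1234
    opt1 = mx.optimizer.SGLD(learning_rate=0.01)
    opt2 = mx.optimizer.SGLD(learning_rate=0.01)
    w1 = mx.nd.random.uniform(shape=(3, 4)); w2 = w1.copy()
    g1 = mx.nd.random.uniform(shape=(3, 4)); g2 = g1.copy()
    s1 = opt1.create_state(0, w1)
    s2 = opt2.create_state(0, w2)

    mx.random.seed(noise_seed)               # same seed ...
    opt1.update([0], [w1], [g1], [s1])
    mx.random.seed(noise_seed)               # ... reseeded, so identical noise
    opt2.update([0], [w2], [g2], [s2])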
- - """ - if w_stype == 'default': - w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - w1 = w2.copyto(default_context()) - elif w_stype == 'row_sparse' or w_stype == 'csr': - w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) - w1 = w2.copyto(default_context()).tostype('default') - else: - raise Exception("type not supported yet") - if g_stype == 'default': - g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) - g1 = g2.copyto(default_context()) - elif g_stype == 'row_sparse' or g_stype == 'csr': - g2 = rand_ndarray(shape, g_stype, dtype=dtype) - g1 = g2.copyto(default_context()).tostype('default') - else: - raise Exception("type not supported yet") - - state1 = opt1.create_state_multi_precision(0, w1) - state2 = opt2.create_state_multi_precision(0, w2) - if compare_states: - compare_ndarray_tuple(state1, state2) - - # set seed for Gaussian noise replication - mx.random.seed(noise_seed) - opt1.update_multi_precision(0, w1, g1, state1) - mx.random.seed(noise_seed) - opt2.update_multi_precision(0, w2, g2, state2) - if compare_states: - compare_ndarray_tuple(state1, state2, rtol=rtol, atol=atol) - assert_almost_equal(w1.asnumpy(), w2.asnumpy(), rtol=rtol, atol=atol) + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for seed in ns_options: - for dtype in [np.float16, np.float32, np.float64]: - for params in itertools.product(cg_options, wd_options, mp_options): + for dtype in [np.float16, np.float32]: + for params in itertools.product(cg_options, wd_options, mp_options, agg_options): kwarg = {k: v for param in params for k, v in param.items()} - if (dtype == np.float16 and ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): + if (dtype == np.float16 and ('multi_precision' not in kwarg + or not kwarg['multi_precision'])): continue atol = 1e-2 if dtype == np.float16 else 1e-3 rtol = 1e-4 if dtype == np.float16 else 1e-5 - compare_optimizer_noise_seeded(opt1(**kwarg), opt2(**kwarg), shape, dtype, seed, atol=atol, rtol=rtol) - - - -# FTML + compare_optimizer_noise_seeded(opt1(**kwarg), + opt2(**kwarg), + shapes, dtype, seed, atol=atol, rtol=rtol) -class PyFTML(mx.optimizer.Optimizer): - """python reference implemenation of FTML""" - def __init__(self, beta1=0.6, beta2=0.999, epsilon=1e-8, **kwargs): - super(PyFTML, self).__init__(**kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - self.epsilon = epsilon - - def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # d_0 - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # v_0 - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # z_0 - - def update(self, index, weight, grad, state): - assert(isinstance(weight, mx.nd. 
NDArray)) - assert(isinstance(grad, mx.nd.NDArray)) - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - t = self._index_update_count[index] - - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - # get previous states - prev_d, prev_v, prev_z = state - # compute states - v_t = self.beta2 * prev_v + (1 - self.beta2) * mx.nd.square(grad) - d_t = (1 - pow(self.beta1, t)) / lr * (mx.nd.sqrt(v_t / (1 - pow(self.beta2, t))) + self.epsilon) - sigma_t = d_t - self.beta1 * prev_d - z_t = self.beta1 * prev_z + (1 - self.beta1) * grad - sigma_t * weight - # update weight - weight[:] = - z_t / d_t - # update states - prev_d[:] = d_t - prev_v[:] = v_t - prev_z[:] = z_t @with_seed() def test_ftml(): - opt1 = PyFTML + opt1 = mx.optimizer.FTML opt2 = mx.optimizer.FTML - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - for dtype in [np.float32]: - for beta1_option in beta1_options: - for beta2_option in beta2_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - kwarg = {} - kwarg.update(beta1_option) - kwarg.update(beta2_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=1e-3, atol=1e-4) - - -# ADAM -class PyAdam(mx.optimizer.Optimizer): - """python reference implemenation of adam""" + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, rtol=1e-3, atol=1e-4) + + +# Sparse ADAM +class PySparseAdam(mx.optimizer.Optimizer): + """python reference implemenation of sparse adam""" def __init__(self, learning_rate=0.001, beta1=0.9, beta2=0.999, epsilon=1e-8, - lazy_update=True, **kwargs): - super(PyAdam, self).__init__(learning_rate=learning_rate, **kwargs) + lazy_update=False, **kwargs): + super(PySparseAdam, self).__init__(learning_rate=learning_rate, **kwargs) self.beta1 = beta1 self.beta2 = beta2 self.epsilon = epsilon @@ -717,391 +385,212 @@ def create_state(self, index, weight): return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # mean mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - def update(self, index, weight, grad, state): - """Update the parameters. + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. Parameters ---------- - index : int - An unique integer key used to index the parameters + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. 
Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + lr = self._get_lr(index) + wd = self._get_wd(index) + t = self._index_update_count[index] - weight : NDArray - weight ndarray + mean, variance = state + num_rows = weight.shape[0] - grad : NDArray - grad ndarray + coef1 = 1. - self.beta1 ** t + coef2 = 1. - self.beta2 ** t + lr *= math.sqrt(coef2) / coef1 - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. - """ - lr = self._get_lr(index) - self._update_count(index) - - t = self._index_update_count[index] - mean, variance = state - - wd = self._get_wd(index) - num_rows = weight.shape[0] - coef1 = 1. - self.beta1**t - coef2 = 1. - self.beta2**t - lr *= math.sqrt(coef2)/coef1 - for row in range(num_rows): - # check row slices of all zeros - all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) - # skip zeros during lazy update - if all_zeros and self.lazy_update: - continue - grad[row] = grad[row] * self.rescale_grad + wd * weight[row] - # clip gradients - if self.clip_gradient is not None: - mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) - # update mean - mean[row] *= self.beta1 - mean[row] += grad[row] * (1. - self.beta1) - # update variance - variance[row] *= self.beta2 - variance[row] += (1 - self.beta2) * mx.nd.square(grad[row], out=grad[row]) - # update weight - weight[row] -= lr*mean[row]/(mx.nd.sqrt(variance[row]) + self.epsilon) + for row in range(num_rows): + # check row slices of all zeros + all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), + np.zeros_like(grad[row].asnumpy())) + # skip zeros during lazy update + if all_zeros and self.lazy_update: + continue + grad[row] *= self.rescale_grad + # clip gradients + if self.clip_gradient is not None: + mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + grad[row] += wd * weight[row] + # update mean + mean[row] *= self.beta1 + mean[row] += grad[row] * (1. 
- self.beta1) + # update variance + variance[row] *= self.beta2 + variance[row] += (1 - self.beta2) * mx.nd.square(grad[row], out=grad[row]) + # update weight + weight[row] -= lr * mean[row] / (mx.nd.sqrt(variance[row]) + self.epsilon) @with_seed() def test_adam(): - opt1 = PyAdam + opt1 = mx.optimizer.Adam opt2 = mx.optimizer.Adam - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - for dtype in [np.float16, np.float32, np.float64]: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - # atol 2e-5 needed to pass with seed 1248389097 - compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(**kwarg), shape, dtype, - rtol=1e-4, atol=2e-5) - # atol 2e-5 needed to pass with seed 781809840 - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, - dtype, w_stype='row_sparse', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(lazy_update=False, **kwarg), shape, - dtype, w_stype='row_sparse', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, - dtype, w_stype='default', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - compare_optimizer(opt1(lazy_update=False, **kwarg), opt2(lazy_update=False, **kwarg), shape, - dtype, w_stype='default', g_stype='row_sparse', - rtol=1e-4, atol=2e-5) - - -# AdaMax -class PyAdamax(mx.optimizer.Optimizer): - """The python reference of AdaMax optimizer. - - This class implements the AdaMax optimizer, one variant of Adam based on the infinity norm, - available at http://arxiv.org/abs/1412.6980 Section 7. - - The optimizer updates the weight by:: - grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - m = beta1 * m_t + (1 - beta1) * grad - u = maximum(beta2 * u, abs(grad)) - weight -= lr / (1 - beta1**t) * m / u - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - beta1 : float, optional - Exponential decay rate for the first moment estimates. - beta2 : float, optional - Exponential decay rate for the second moment estimates. - """ - def __init__(self, learning_rate=0.002, beta1=0.9, beta2=0.999, **kwargs): - super(PyAdamax, self).__init__(learning_rate=learning_rate, **kwargs) - self.beta1 = beta1 - self.beta2 = beta2 - - def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # mean - mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # variance - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - t = self._index_update_count[index] - lr /= (1. 
- self.beta1**t) - - # preprocess grad - grad = grad * self.rescale_grad + wd * weight - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + # atol 2e-5 needed to pass with seed 1248389097 + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=1e-4, atol=2e-5) - # update m_t and u_t - m_t, u_t = state - m_t[:] = self.beta1 * m_t + (1. - self.beta1) * grad - u_t[:] = mx.nd.maximum(self.beta2 * u_t, mx.nd.abs(grad)) - # update weight - weight[:] -= lr * m_t / u_t +@with_seed() +def test_sparse_adam(): + opt1 = PySparseAdam + opt2 = mx.optimizer.Adam + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and ('multi_precision' not in kwarg or + not kwarg['multi_precision'])): + continue + # atol 2e-5 needed to pass with seed 1248389097 + compare_optimizer(opt1(lazy_update=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, dtype, + rtol=1e-4, atol=2e-5) + # atol 2e-5 needed to pass with seed 781809840 + compare_optimizer(opt1(lazy_update=True, **kwarg), + opt2(use_fused_step=True, lazy_update=True, **kwarg), shapes, + dtype, w_stype='row_sparse', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) + compare_optimizer(opt1(lazy_update=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, + dtype, w_stype='row_sparse', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) + compare_optimizer(opt1(lazy_update=True, **kwarg), + opt2(use_fused_step=True, lazy_update=True, **kwarg), shapes, + dtype, w_stype='default', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) + compare_optimizer(opt1(lazy_update=False, **kwarg), + opt2(use_fused_step=True, lazy_update=False, **kwarg), shapes, + dtype, w_stype='default', g_stype='row_sparse', + rtol=1e-4, atol=2e-5) @with_seed() def test_adamax(): - opt1 = PyAdamax + opt1 = mx.optimizer.Adamax opt2 = mx.optimizer.Adamax - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] mp_options = [{}, {'multi_precision': False}, 
{'multi_precision': True}] - for dtype in [np.float16, np.float32, np.float64]: - for params in itertools.product(cg_options, rg_options, wd_options, mp_options): + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + rg_options, wd_options, mp_options, agg_options): kwarg = {k: v for param in params for k, v in param.items()} if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): continue - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - - -# Signum -class PySignum(mx.optimizer.Optimizer): - """The python reference of Signum optimizer. - - The optimizer updates the weight by: - - rescaled_grad = rescale_grad * clip(grad, clip_gradient) + wd * weight - state = momentum * state + (1-momentum)*rescaled_grad - weight = (1 - lr * wd_lh) * weight - lr * sign(state) + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype) - See the original paper at: https://jeremybernste.in/projects/amazon/signum.pdf - - For details of the update algorithm see - :class:`~mxnet.ndarray.signsgd_update` and :class:`~mxnet.ndarray.signum_update`. - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - momentum : float, optional - The momentum value. - wd_lh : float, optitional - The amount of decoupled weight decay regularization. - """ - def __init__(self, learning_rate=0.01, momentum=0.9, wd_lh = 0.0, **kwargs): - super(PySignum, self).__init__(learning_rate = learning_rate, **kwargs) - self.momentum = momentum - self.wd_lh = wd_lh - - def create_state(self, index, weight): - momentum = None - if self.momentum != 0.0: - momentum = mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype, stype=weight.stype) - return momentum - - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) - - if state is not None: - mom = state - if self.clip_gradient is not None: - mom[:] = (self.momentum*mom - (1-self.momentum)*(wd*weight + - mx.nd.clip(grad*self.rescale_grad, -self.clip_gradient, self.clip_gradient))) - else: - mom[:] = self.momentum*mom - (1-self.momentum)*wd*weight - (1-self.momentum)*self.rescale_grad*grad - weight[:] = (1 - lr*self.wd_lh)*weight + lr*mx.nd.sign(mom) - else: - weight[:] = (1 - lr*(wd+self.wd_lh))*weight - lr*mx.nd.sign(grad) @with_seed() def test_signum(): - opt1 = PySignum + opt1 = mx.optimizer.Signum opt2 = mx.optimizer.Signum - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] wd_lh_options = [{}, {'wd_lh': 0.015}, {'wd_lh': 0.0}] mom_options = [{}, {'momentum': 0.9}] lr_options = [{'learning_rate': 0.05},{'learning_rate': 0.01}] - for dtype in [np.float32, np.float64]: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in wd_lh_options: - for lr_option in lr_options: - for mom_option in mom_options: - kwarg = {} - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - kwarg.update(lr_option) - kwarg.update(mom_option) - 
compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - - -# RMSProp -class PyRMSProp(mx.optimizer.Optimizer): - """RMSProp optimizer of Tieleman & Hinton, 2012, - - For centered=False, the code follows the version in - http://www.cs.toronto.edu/~tijmen/csc321/slides/lecture_slides_lec6.pdf by - Tieleman & Hinton, 2012 - - For centered=True, the code follows the version in - http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. - - Parameters - ---------- - learning_rate : float, optional - Step size. - Default value is set to 0.001. - gamma1: float, optional - decay factor of moving average for gradient, gradient^2. - Default value is set to 0.9. - gamma2: float, optional - "momentum" factor. - Default value if set to 0.9. - Only used if centered=True - epsilon : float, optional - Default value is set to 1e-8. - centered : boolean, optional - Use Graves or Tielemans & Hintons version of RMSProp - wd : float, optional - L2 regularization coefficient add to all the weights - rescale_grad : float, optional - rescaling factor of gradient. - clip_gradient : float, optional - clip gradient in range [-clip_gradient, clip_gradient] - clip_weights : float, optional - clip weights in range [-clip_weights, clip_weights] - - """ - def __init__(self, learning_rate=0.001, gamma1=0.9, gamma2=0.9, - epsilon=1e-8, centered=False, clip_weights=None, **kwargs): - super(PyRMSProp, self).__init__(learning_rate=learning_rate, **kwargs) - self.centered = centered - self.gamma1 = gamma1 - self.gamma2 = gamma2 - self.epsilon = epsilon - self.clip_weights = clip_weights - - def create_state(self, index, weight): - """Create additional optimizer state. - - For centered=False: n - For centered=True: n, g, delta - - Parameters - ---------- - weight : NDArray - The weight data - """ - if self.centered: - return (mx.nd.zeros(weight.shape, weight.context), # n - mx.nd.zeros(weight.shape, weight.context), # g - mx.nd.zeros(weight.shape, weight.context)) # delta - else: - return (mx.nd.zeros(weight.shape, weight.context), ) # n - - def update(self, index, weight, grad, state): - """Update the parameters. - - Parameters - ---------- - index : int - An unique integer key used to index the parameters - - weight : NDArray - weight ndarray - - grad : NDArray - grad ndarray - - state : NDArray or other objects returned by init_state - The auxiliary state used in optimization. 
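For reference, the update rule that `test_signum` above now checks against the fused kernels, restated from the reference implementation this patch removes (gradient clipping omitted; the driver values below are arbitrary):

    import mxnet as mx

    def signum_step(weight, grad, mom, lr=0.01, wd=0.0, wd_lh=0.0,
                    momentum=0.9, rescale_grad=1.0):
        # the momentum buffer is an EMA of the weight-decayed, rescaled gradient;
        # only its sign is applied to the weight
        rescaled = rescale_grad * grad + wd * weight
        mom[:] = momentum * mom + (1 - momentum) * rescaled
        weight[:] = (1 - lr * wd_lh) * weight - lr * mx.nd.sign(mom)

    w = mx.nd.random.uniform(shape=(3, 4))
    g = mx.nd.random.uniform(shape=(3, 4))
    m = mx.nd.zeros_like(w)
    signum_step(w, g, m)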
- """ - lr = self._get_lr(index) - wd = self._get_wd(index) - self._update_count(index) - grad = grad * self.rescale_grad + wd * weight - - if not self.centered: - (n, ) = state - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n - weight[:] -= lr * grad/(mx.nd.sqrt(n + self.epsilon)) - - else: - n, g, delta = state - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - n[:] = (1 - self.gamma1) * (grad * grad) + self.gamma1 * n - g[:] = (1 - self.gamma1) * grad + self.gamma1 * g - delta[:] = (self.gamma2) * delta - lr * grad/(mx.nd.sqrt(n - g*g + self.epsilon)) - weight[:] += delta + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(cg_options, rg_options, wd_options, + wd_lh_options, mom_options, lr_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype) - if self.clip_weights: - mx.ndarray.clip(weight, -self.clip_weights, self.clip_weights, out=weight) @with_seed() def test_rms(): - opt1 = PyRMSProp + opt1 = mx.optimizer.RMSProp opt2 = mx.optimizer.RMSProp - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] + rho_options = [{}, {'rho': 0.5}, {'rho': 0.7}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] cw_options = [{}, {'clip_weights': 0.01}] center_options = [{}, {'centered': False}, {'centered': True}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mom_options = [{}, {'momentum': 0.0}, {'momentum': 0.9}] mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float16, np.float32]: # Reduce foating point compare tolerance to avoid flaky test failure. rtol, atol = (1e-1, 1e-1) if dtype is np.float16 else (1e-2, 1e-2) - for cw_option in cw_options: - for cg_option in cg_options: - for center_option in center_options: - for rg_option in rg_options: - for wd_option in wd_options: - for mp_option in mp_options: - kwarg = {} - kwarg.update(cw_option) - kwarg.update(cg_option) - kwarg.update(center_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - kwarg.update(mp_option) - if (dtype == np.float16 and - ('multi_precision' not in kwarg or - not kwarg['multi_precision'])): - continue - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, rtol=rtol, atol=atol) - if (default_context() == mx.cpu()): - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, g_stype='row_sparse', rtol=rtol, atol=atol) - -class PyFtrl(mx.optimizer.Optimizer): - """The Ftrl optimizer. 
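The rewritten `test_rms` passes the renamed RMSProp hyper-parameters (`rho` and `momentum`, used in place of the old `gamma1`/`gamma2`) straight through as keyword arguments. Constructing the optimizer directly looks like the following sketch; the particular values are hypothetical:

    import mxnet as mx

    opt = mx.optimizer.RMSProp(learning_rate=0.002, rho=0.95, momentum=0.9,
                               centered=True, epsilon=1e-8, clip_weights=0.01)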
+ for params in itertools.product(rho_options, cg_options, cw_options, + center_options, rg_options, wd_options, + mom_options, mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=rtol, atol=atol) + if default_context() == mx.cpu(): + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), + shapes, dtype, g_stype='row_sparse', rtol=rtol, atol=atol) + + +class PySparseFtrl(mx.optimizer.Optimizer): + """python reference implemenation of sparse Ftrl optimizer. Referenced from *Ad Click Prediction: a View from the Trenches*, available at http://dl.acm.org/citation.cfm?id=2488200. @@ -1119,224 +608,290 @@ class PyFtrl(mx.optimizer.Optimizer): \\eta_{t,i} = \\frac{learningrate}{\\beta+\\sqrt{\\sum_{s=1}^tg_{s,i}^t}} """ - def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, lazy_update=False, **kwargs): - super(PyFtrl, self).__init__(**kwargs) + def __init__(self, lamda1=0.01, learning_rate=0.1, beta=1, **kwargs): + super(PySparseFtrl, self).__init__(**kwargs) self.lamda1 = lamda1 self.beta = beta self.lr = learning_rate - self.lazy_update = lazy_update def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # dn + return (mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype), # z mx.nd.zeros(weight.shape, weight.context, dtype=weight.dtype)) # n - def update(self, index, weight, grad, state): - self._update_count(index) - wd = self._get_wd(index) - lr = self._get_lr(index) - num_rows = weight.shape[0] + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. - dn, n = state - for row in range(num_rows): - all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) - if all_zeros and self.lazy_update: - continue - grad[row] = grad[row] * self.rescale_grad - if self.clip_gradient is not None: - mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. 
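Because the refactor switches the optimizer entry points to parallel lists, the `PySparseFtrl` reference implementation defined in this file is driven the same way as the built-in optimizers: one index, weight, gradient, and state per list slot. A small usage sketch with made-up shapes and hyper-parameters, assuming the class is in scope:

    import mxnet as mx

    opt = PySparseFtrl(learning_rate=0.1, lamda1=0.01)
    w = mx.nd.random.uniform(shape=(3, 4))
    g = mx.nd.random.uniform(shape=(3, 4))
    state = opt.create_state(0, w)      # returns the (z, n) accumulators
    opt.step([0], [w], [g], [state])    # updates w in place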
+ """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + wd = self._get_wd(index) + lr = self._get_lr(index) + num_rows = weight.shape[0] - #update dn, n - dn[row] += grad[row] - (mx.nd.sqrt(n[row] + grad[row] * grad[row]) - mx.nd.sqrt(n[row])) * weight[row] / lr - n[row] += grad[row] * grad[row] + z, n = state + for row in range(num_rows): + all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + + # update z[row], n[row] + sigma = - mx.nd.sqrt(n[row]) + n[row] += mx.nd.square(grad[row]) + denom = mx.nd.sqrt(n[row]) + sigma += denom + sigma /= lr + z[row] += grad[row] - sigma * weight[row] + + # update weight + denom += self.beta + denom /= lr + denom += wd + d = mx.nd.sign(z[row]) * mx.nd.maximum(mx.nd.abs(z[row]) - self.lamda1, 0) + weight[row] = - d / denom - # update weight - weight[row] = (mx.nd.sign(dn[row]) * self.lamda1 - dn[row]) / \ - ((self.beta + mx.nd.sqrt(n[row])) / lr + wd) * (mx.nd.abs(dn[row]) > self.lamda1) @with_seed() def test_ftrl(): - opt1 = PyFtrl + opt1 = mx.optimizer.Ftrl opt2 = mx.optimizer.Ftrl - shape = (3, 4, 5) - kwargs = [{}, - {'clip_gradient': 0.5}, - {'clip_gradient': 0.4, 'rescale_grad': 0.14}, - {'rescale_grad': 0.8}, - {'clip_gradient': 0.5, 'wd': 0.07}, - {'clip_gradient': 0.4, 'rescale_grad': 0.14, 'wd': 0.03}, - {'rescale_grad': 0.8, 'wd': 0.05}, - {'rescale_grad': 0.8, 'wd': 0.05, 'lamda1': 0.01}, - {'clip_gradient': 0.5, 'wd': 0.07, 'lamda1': 1.0}] - for kwarg in kwargs: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, np.float32) - compare_optimizer(opt1(lazy_update=True, **kwarg), opt2(**kwarg), shape, - np.float32, w_stype='row_sparse', g_stype='row_sparse') + shapes = [(3, 4, 5), (10, 4), (7,)] + lamda1_options = [{}, {'lamda1': 0.}, {'lamda1': 0.1}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(lamda1_options, cg_options, + rg_options, wd_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=1e-4, atol=1e-4) + + +@with_seed() +def test_sparse_ftrl(): + opt1 = PySparseFtrl + opt2 = mx.optimizer.Ftrl + shapes = [(3, 4, 5), (10, 4), (7,)] + lamda1_options = [{}, {'lamda1': 0.}, {'lamda1': 0.1}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(lamda1_options, cg_options, + rg_options, 
wd_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, + dtype, w_stype='row_sparse', g_stype='row_sparse', + rtol=1e-4, atol=1e-4) + @with_seed() def test_nadam(): + opt1 = mx.optimizer.Nadam + opt2 = mx.optimizer.Nadam + shapes = [(3, 4, 5), (10, 4), (7,)] + beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}] + beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}] + schedule_decay_options = [{}, {'schedule_decay': 0.008}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(beta1_options, beta2_options, cg_options, + schedule_decay_options, rg_options, wd_options, + mp_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if (dtype == np.float16 and + ('multi_precision' not in kwarg or not kwarg['multi_precision'])): + continue + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype) - def get_net(num_hidden, flatten=True): - data = mx.symbol.Variable('data') - fc1 = mx.symbol.FullyConnected(data, name='fc1', num_hidden=128, flatten=flatten) - act1 = mx.symbol.Activation(fc1, name='relu1', act_type="relu") - fc2 = mx.symbol.FullyConnected(act1, name = 'fc2', num_hidden = 64, flatten=flatten) - act2 = mx.symbol.Activation(fc2, name='relu2', act_type="relu") - fc3 = mx.symbol.FullyConnected(act2, name='fc3', num_hidden=num_hidden, flatten=flatten) - return fc3 - - N = 20 - data = mx.random.uniform(-1, 1, shape=(N, 10)) - label = mx.random.uniform(-1, 1, shape=(N, 1)) - data_iter = mx.io.NDArrayIter(data, label, batch_size=5, label_name='label', shuffle=True) - output = get_net(1) - l = mx.symbol.Variable('label') - Loss = gluon.loss.L1Loss() - loss = Loss(output, l) - loss = mx.sym.make_loss(loss) - mod = mx.mod.Module(loss, data_names=('data',), label_names=('label',)) - mod.fit(data_iter, num_epoch=60, optimizer_params={'learning_rate': 0.001, 'wd': 0.0005}, - initializer=mx.init.Xavier(magnitude=2), eval_metric=mx.metric.Loss(), - optimizer='nadam') - assert mod.score(data_iter, eval_metric=mx.metric.Loss())[0][1] < 0.11 - -# AdaGrad -class PyAdaGrad(mx.optimizer.Optimizer): - """The python reference of AdaGrad optimizer. + +class PySparseAdaGrad(mx.optimizer.Optimizer): + """python reference implemenation of sparse Adagrad optimizer. This class implements the AdaGrad optimizer described in *Adaptive Subgradient Methods for Online Learning and Stochastic Optimization*, and available at http://www.jmlr.org/papers/volume12/duchi11a/duchi11a.pdf. - Updates are applied by:: - - rescaled_grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - history = history + square(rescaled_grad) - w = w - learning_rate * rescaled_grad / sqrt(history + epsilon) - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - Parameters ---------- - eps: float, optional + learning_rate : float, default 0.01 + The initial learning rate. 
If None, the optimization will use the + learning rate from ``lr_scheduler``. If not None, it will overwrite + the learning rate in ``lr_scheduler``. If None and ``lr_scheduler`` + is also None, then it will be set to 0.01 by default. + epsilon : float, default 1e-6 Small value to avoid division by 0. - """ - def __init__(self, eps=1e-7, **kwargs): - super(PyAdaGrad, self).__init__(**kwargs) - self.float_stable_eps = eps + + def __init__(self, learning_rate=0.01, epsilon=1e-6, **kwargs): + super(PySparseAdaGrad, self).__init__(learning_rate=learning_rate, + **kwargs) + self.epsilon = epsilon def create_state(self, index, weight): - return mx.nd.zeros(weight.shape, weight.context, stype=weight.stype) + return mx.nd.zeros(weight.shape, weight.context, stype=weight.stype) # history + + def step(self, indices, weights, grads, states): + """Perform an optimization step using gradients and states. + + Parameters + ---------- + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. + """ + for index, weight, grad, state in zip(indices, weights, grads, states): + self._update_count(index) + wd = self._get_wd(index) + lr = self._get_lr(index) + num_rows = weight.shape[0] + + history = state + for row in range(num_rows): + all_zeros = mx.test_utils.almost_equal(grad[row].asnumpy(), np.zeros_like(grad[row].asnumpy())) + if all_zeros: + continue + grad[row] *= self.rescale_grad + if self.clip_gradient is not None: + mx.nd.clip(grad[row], -self.clip_gradient, self.clip_gradient, out=grad[row]) + grad[row] += wd * weight[row] + + # update history[row] + history[row] += mx.nd.square(grad[row]) + denom = mx.nd.sqrt(history[row]) + denom += self.epsilon - def update(self, index, weight, grad, state): - self._update_count(index) - lr = self._get_lr(index) - wd = self._get_wd(index) + # update weight + weight[row] -= lr * grad[row] / denom - history = state - grad = grad * self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) - history[:] += mx.nd.square(grad) - div = grad / mx.nd.sqrt(history + self.float_stable_eps) - weight[:] += (div + weight * wd) * -lr @with_seed() def test_adagrad(): - opt1 = PyAdaGrad + opt1 = mx.optimizer.AdaGrad opt2 = mx.optimizer.AdaGrad - shape = (3, 4, 5) - eps_options = [{}, {'eps': 1e-8}] + shapes = [(3, 4, 5), (10, 4), (7,)] + eps_options = [{}, {'epsilon': 1e-8}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.0}] - for dtype in [np.float32]: - for eps_option in eps_options: - for cg_option in cg_options: - for rg_option in rg_options: - for wd_option in wd_options: - kwarg = {} - kwarg.update(eps_option) - kwarg.update(cg_option) - kwarg.update(rg_option) - kwarg.update(wd_option) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) - if wd_option.get('wd', 0.0) == 0.0: - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - w_stype='row_sparse', g_stype='row_sparse') - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype, - g_stype='row_sparse') - -# AdaDelta -class 
PyAdaDelta(mx.optimizer.Optimizer): - """The python reference of AdaDelta optimizer. - - This class implements AdaDelta, an optimizer described in *ADADELTA: An adaptive - learning rate method*, available at https://arxiv.org/abs/1212.5701. - - This optimizer updates each weight by:: - - grad = clip(grad * rescale_grad + wd * weight, clip_gradient) - acc_grad = rho * acc_grad + (1. - rho) * grad ** 2 - cur_delta = sqrt(acc_delta + epsilon) / sqrt(acc_grad + epsilon) * grad - acc_delta = rho * acc_delta + (1. - rho) * cur_delta ** 2 - weight -= (cur_delta + wd * weight) - - This optimizer accepts the following parameters in addition to those accepted - by :class:`.Optimizer`. - - Parameters - ---------- - rho: float - Decay rate for both squared gradients and delta. - epsilon : float - Small value to avoid division by 0. - """ - def __init__(self, rho=0.90, epsilon=1e-5, **kwargs): - super(PyAdaDelta, self).__init__(**kwargs) - self.rho = rho - self.epsilon = epsilon - - def create_state(self, index, weight): - return (mx.nd.zeros(weight.shape, weight.context), - mx.nd.zeros(weight.shape, weight.context)) - - def update(self, index, weight, grad, state): - self._update_count(index) - wd = self._get_wd(index) - - grad *= self.rescale_grad - if self.clip_gradient is not None: - grad = mx.nd.clip(grad, -self.clip_gradient, self.clip_gradient) + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(eps_options, cg_options, + rg_options, wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if dtype is np.float16: + kwarg.update({'multi_precision': True}) + compare_optimizer(opt1(use_fused_step=False, **kwarg), + opt2(use_fused_step=True, **kwarg), shapes, dtype) - acc_grad, acc_delta = state - acc_grad[:] = self.rho * acc_grad + (1. - self.rho) * grad ** 2 - current_delta = (mx.nd.sqrt(acc_delta + self.epsilon) / - mx.nd.sqrt(acc_grad + self.epsilon)) * grad - acc_delta[:] = self.rho * acc_delta + (1. 
- self.rho) * current_delta ** 2 +@with_seed() +def test_sparse_adagrad(): + opt1 = PySparseAdaGrad + opt2 = mx.optimizer.AdaGrad + shapes = [(3, 4, 5), (10, 4), (7,)] + eps_options = [{}, {'epsilon': 1e-8}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.0}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(eps_options, cg_options, + rg_options, wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if dtype is np.float16: + kwarg.update({'multi_precision': True}) + if kwarg.get('wd', 0.0) == 0.0: + compare_optimizer(opt1(**kwarg), opt2(use_fused_step=True, **kwarg), shapes, dtype, + w_stype='row_sparse', g_stype='row_sparse') + compare_optimizer(opt1(**kwarg), opt2(use_fused_step=True, **kwarg), shapes, dtype, + g_stype='row_sparse') - # update weight - weight[:] -= current_delta + wd * weight @with_seed() def test_adadelta(): - opt1 = PyAdaDelta + opt1 = mx.optimizer.AdaDelta opt2 = mx.optimizer.AdaDelta - shape = (3, 4, 5) + shapes = [(3, 4, 5), (10, 4), (7,)] rho_options = [{'rho': 0.9}] eps_options = [{}, {'epsilon': 1e-8}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.0}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float16, np.float32]: for params in itertools.product(rho_options, eps_options, cg_options, - rg_options, wd_options): + rg_options, wd_options, agg_options): kwarg = {k: v for param in params for k, v in param.items()} if dtype is np.float16: kwarg.update({'multi_precision': True}) - compare_optimizer(opt1(**kwarg), opt2(**kwarg), shape, dtype) + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype) + + +@with_seed() +def test_dcasgd(): + opt1 = mx.optimizer.DCASGD + opt2 = mx.optimizer.DCASGD + shapes = [(3, 4, 5), (10, 4), (7,)] + lamda_options = [{}, {'lamda': 0.01}, {'lamda': 0.1}] + mom_options = [{}, {'momentum': 0.0}, {'momentum': 0.9}] + cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] + rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] + agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + {'aggregate_num': 4}, {'aggregate_num': np.inf}] + for dtype in [np.float16, np.float32]: + for params in itertools.product(lamda_options, mom_options, cg_options, + rg_options, wd_options, agg_options): + kwarg = {k: v for param in params for k, v in param.items()} + if dtype is np.float16: + kwarg.update({'multi_precision': True}) + compare_optimizer(opt1(**kwarg), opt2(**kwarg), shapes, dtype) def test_factor_scheduler(): @@ -1353,6 +908,7 @@ def test_factor_scheduler(): np.testing.assert_almost_equal(sched(201), base_lr * factor * factor) np.testing.assert_almost_equal(sched(1000), 1e-4) + def test_multifactor_scheduler(): base_lr = 0.1 steps = [15, 25] @@ -1368,6 +924,7 @@ def test_multifactor_scheduler(): np.testing.assert_almost_equal(sched(26), base_lr * factor * factor) np.testing.assert_almost_equal(sched(100), base_lr * factor * factor) + def test_poly_scheduler(): base_lr = 3 final_lr = 0 @@ -1382,6 
+939,7 @@ def test_poly_scheduler(): assert (poly_sched(500) < 1.6) np.testing.assert_almost_equal(poly_sched(steps), final_lr) + def test_cosine_scheduler(): # also tests case without warmup base_lr = 3 @@ -1392,6 +950,8 @@ def test_cosine_scheduler(): np.testing.assert_almost_equal(cosine_sched(steps), final_lr) assert (cosine_sched(500) > 1.5) + if __name__ == '__main__': import nose nose.runmodule() + From b4ae72178b3d8c641d052b1040deb78c1e2b3ce0 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Wed, 22 Jan 2020 22:37:47 +0000 Subject: [PATCH 03/10] fix svrg test --- .../svrg_optimization/svrg_optimizer.py | 59 ++++++++++--------- 1 file changed, 30 insertions(+), 29 deletions(-) diff --git a/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py b/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py index 0f695a1b2ff0..095727a32f88 100644 --- a/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py +++ b/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py @@ -27,24 +27,24 @@ class _AssignmentOptimizer(mx.optimizer.Optimizer): """_AssignmentOptimizer assigns gradients to weights for SVRGModule's full gradients accumulation in the KVStore. It is a helper optimizer that is designed to be used with SVRGModule only. """ - def update(self, index, weight, grad, state): + def update(self, indices, weights, grads, states): """Assign the gradients to weight for accumulating full gradients in the KVStore across all devices and workers. Parameters ---------- - index : int - The unique index of the parameter into the individual learning - rates and weight decays. Learning rates and weight decay - may be set via `set_lr_mult()` and `set_wd_mult()`, respectively. - weight : NDArray - The parameter to be updated. - grad : NDArray - The gradient of the objective with respect to this parameter. - state: any obj + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. + grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj AssignmentOptimizer will not need to be associated with state. """ - - weight[:] = grad + for weight, grad in zip(weights, grads): + weight[:] = grad @mx.optimizer.register @@ -98,31 +98,32 @@ def _check_params(**kwargs): return default_params - def update(self, index, weight, grad, state): + def update(self, indices, weights, grads, states): """Updates the given parameter using the corresponding gradient and state. If key contains 'full', update with `_AssignmentOptimizer` otherwise will use default optimizer. Parameters ---------- - index : int - The unique index of the parameter into the individual learning - rates and weight decays. Learning rates and weight decay - may be set via `set_lr_mult()` and `set_wd_mult()`, respectively. - weight : NDArray - The parameter to be updated. - grad : NDArray - The gradient of the objective with respect to this parameter. - state : any obj - The state returned by `create_state()`. + indices : list of int + List of unique indices of the parameters into the individual learning rates + and weight decays. Learning rates and weight decay may be set via `set_lr_mult()` + and `set_wd_mult()`, respectively. + weights : list of NDArray + List of parameters to be updated. 
+ grads : list of NDArray + List of gradients of the objective with respect to this parameter. + states : List of any obj + List of state returned by `create_state()`. """ - name = self._check_index(index) + for index, weight, grad, state in zip(indices, weights, grads, states): + name = self._check_index(index) - if "full" in name: - self.aux_opt.update(index, weight, grad, state) - else: - # use the default optimizer - self.default_opt.update(index, weight, grad, state) + if "full" in name: + self.aux_opt.update([index], [weight], [grad], [state]) + else: + # use the default optimizer + self.default_opt.update([index], [weight], [grad], [state]) def create_state(self, index, weight): """Creates auxiliary state for a given weight. From 56421a2bd76b50b033dff153234d3ddcfdbe622a Mon Sep 17 00:00:00 2001 From: Zheng Date: Wed, 22 Jan 2020 14:40:47 -0800 Subject: [PATCH 04/10] fix rmsprop param naming --- R-package/R/optimizer.R | 14 +++++----- R-package/tests/testthat/test_optimizer.R | 4 +-- benchmark/opperf/rules/default_params.py | 8 +++--- .../org/apache/clojure_mxnet/optimizer.clj | 28 +++++++++---------- .../apache/clojure_mxnet/optimizer_test.clj | 2 +- cpp-package/include/mxnet-cpp/optimizer.hpp | 4 +-- .../tutorials/packages/optimizer/index.md | 4 +-- example/speech_recognition/deepspeech.cfg | 2 +- example/speech_recognition/default.cfg | 2 +- .../AI-MXNet/lib/AI/MXNet/Optimizer.pm | 13 ++++----- perl-package/AI-MXNet/t/test_optimizers.t | 17 ++++++----- .../org/apache/mxnet/optimizer/RMSProp.scala | 12 ++++---- src/operator/optimizer_op-inl.h | 3 +- .../main/resources/templates/opt_rmsprop.st | 4 +-- 14 files changed, 58 insertions(+), 59 deletions(-) diff --git a/R-package/R/optimizer.R b/R-package/R/optimizer.R index 6f13f7b26ddb..be8d977b2a98 100644 --- a/R-package/R/optimizer.R +++ b/R-package/R/optimizer.R @@ -109,9 +109,9 @@ mx.opt.sgd <- function(learning.rate = 0.01, #' #' @param learning.rate float, default=0.002 #' The initial learning rate. -#' @param gamma1 float, default=0.95 +#' @param rho float, default=0.95 #' decay factor of moving average for gradient, gradient^2. -#' @param gamma2 float, default=0.9 +#' @param momentum float, default=0.9 #' "momentum" factor. 
#' @param epsilon float, default=1e-4 #' @param wd float, default=0.0 @@ -125,8 +125,8 @@ mx.opt.sgd <- function(learning.rate = 0.01, #' mx.opt.rmsprop <- function(learning.rate = 0.002, centered = TRUE, - gamma1 = 0.95, - gamma2 = 0.9, + rho = 0.95, + momentum = 0.9, epsilon = 1e-4, wd = 0, rescale.grad = 1, @@ -158,8 +158,8 @@ mx.opt.rmsprop <- function(learning.rate = 0.002, g, delta, lr = lr, - gamma1 = gamma1, - gamma2 = gamma2, + rho = rho, + momentum = momentum, epsilon = epsilon, wd = wd, rescale_grad = rescale.grad, @@ -174,7 +174,7 @@ mx.opt.rmsprop <- function(learning.rate = 0.002, grad, n, lr = lr, - gamma1 = gamma1, + rho = rho, epsilon = epsilon, wd = wd, rescale_grad = rescale.grad, diff --git a/R-package/tests/testthat/test_optimizer.R b/R-package/tests/testthat/test_optimizer.R index 1eec83f2d46e..cbe9575c90ca 100644 --- a/R-package/tests/testthat/test_optimizer.R +++ b/R-package/tests/testthat/test_optimizer.R @@ -73,8 +73,8 @@ test_that("rmsprop", { fc1_weight = w1, label = y), aux.arrays = NULL, grad.reqs = c("null", "write", "null")) - optimizer <- mx.opt.create("rmsprop", learning.rate = 1, centered = TRUE, gamma1 = 0.95, - gamma2 = 0.9, epsilon = 1e-04, wd = 0, rescale.grad = 1, clip_gradient = -1) + optimizer <- mx.opt.create("rmsprop", learning.rate = 1, centered = TRUE, rho = 0.95, + momentum = 0.9, epsilon = 1e-04, wd = 0, rescale.grad = 1, clip_gradient = -1) updaters <- mx.opt.get.updater(optimizer, exec$ref.arg.arrays, ctx = mx.ctx.default()) diff --git a/benchmark/opperf/rules/default_params.py b/benchmark/opperf/rules/default_params.py index 69b28c6535cc..fa42386ccd39 100644 --- a/benchmark/opperf/rules/default_params.py +++ b/benchmark/opperf/rules/default_params.py @@ -81,8 +81,8 @@ DEFAULT_DELTA = [(1024, 1024), (10000, 1), (10000, 100)] DEFAULT_LRS = [(0.1,0.1)] DEFAULT_LR = [0.1,0.5,0.9] -DEFAULT_GAMMA_1 = [0.1,0.5,0.9] -DEFAULT_GAMMA_2 = [0.1,0.5,0.9] +DEFAULT_RHO = [0.1,0.5,0.9] +DEFAULT_MOMENTUM = [0.1,0.5,0.9] DEFAULT_EPSILON = [1e-08] DEFAULT_BETA_1 = [0.1,0.5,0.9] DEFAULT_BETA_2 = [0.1,0.5,0.9] @@ -139,8 +139,8 @@ "lr" : DEFAULT_LR, "lrs" : DEFAULT_LRS, "wds" : DEFAULT_LRS, - "gamma1" : DEFAULT_GAMMA_1, - "gamma2" : DEFAULT_GAMMA_2, + "rho" : DEFAULT_RHO, + "momentum" : DEFAULT_MOMENTUM, "epsilon" : DEFAULT_EPSILON, "beta1" : DEFAULT_BETA_1, "beta2" : DEFAULT_BETA_2, diff --git a/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj b/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj index 672090a899b3..e94a59879466 100644 --- a/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj +++ b/contrib/clojure-package/src/org/apache/clojure_mxnet/optimizer.clj @@ -96,30 +96,30 @@ ([] (ada-delta {}))) -(s/def gamma1 number?) -(s/def gamma2 number?) -(s/def ::rms-prop-opts (s/keys :opt-un [::learning-rate ::rescale-gradient ::gamma1 ::gamma2 ::wd ::clip-gradient])) +(s/def rho number?) +(s/def momentum number?) +(s/def ::rms-prop-opts (s/keys :opt-un [::learning-rate ::rescale-gradient ::rho ::momentum ::wd ::clip-gradient])) (defn rms-prop "RMSProp optimizer as described in Tieleman & Hinton, 2012. http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. - learningRate Step size. - - gamma1 decay factor of moving average for gradient, gradient^^2. - - gamma2 momentum factor of moving average for gradient. - - rescale-gradient rescaling factor of gradient. 
- - wd L2 regularization coefficient add to all the weights - - clip-gradient clip gradient in range [-clip_gradient, clip_gradient] - - lr-scheduler The learning rate scheduler" - ([{:keys [learning-rate rescale-gradient gamma1 gamma2 wd lr-scheduler clip-gradient] :as opts + - rho decay factor of moving average for gradient, gradient^^2. + - momentum momentum factor of moving average for gradient. + - rescale-gradient rescaling factor of gradient. + - wd L2 regularization coefficient add to all the weights + - clip-gradient clip gradient in range [-clip_gradient, clip_gradient] + - lr-scheduler The learning rate scheduler" + ([{:keys [learning-rate rescale-gradient rho momentum wd lr-scheduler clip-gradient] :as opts :or {learning-rate 0.002 rescale-gradient 1.0 - gamma1 0.95 - gamma2 0.9 + rho 0.95 + momentum 0.9 wd 0.0 clip-gradient 0}}] (util/validate! ::rms-prop-opts opts "Incorrect rms-prop optimizer options") - (new RMSProp (float learning-rate) (float rescale-gradient) (float gamma1) - (float gamma2) (float wd) lr-scheduler (float clip-gradient))) + (new RMSProp (float learning-rate) (float rescale-gradient) (float rho) + (float momentum) (float wd) lr-scheduler (float clip-gradient))) ([] (rms-prop {}))) diff --git a/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj b/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj index 599a0672bea5..f2413dc91101 100644 --- a/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj +++ b/contrib/clojure-package/test/org/apache/clojure_mxnet/optimizer_test.clj @@ -50,7 +50,7 @@ (is (thrown? Exception (optimizer/dcasgd {:lambda 'a}))) (is (thrown? Exception (optimizer/nag {:momentum 'a}))) (is (thrown? Exception (optimizer/ada-delta {:epsilon 'a}))) - (is (thrown? Exception (optimizer/rms-prop {:gamma1 'a}))) + (is (thrown? Exception (optimizer/rms-prop {:rho 'a}))) (is (thrown? Exception (optimizer/ada-grad {:rescale-gradient 'a}))) (is (thrown? Exception (optimizer/adam {:beta1 'a}))) (is (thrown? Exception (optimizer/sgld {:lr-scheduler 0.1})))) \ No newline at end of file diff --git a/cpp-package/include/mxnet-cpp/optimizer.hpp b/cpp-package/include/mxnet-cpp/optimizer.hpp index e9a8bca5f028..b259c7bba61d 100644 --- a/cpp-package/include/mxnet-cpp/optimizer.hpp +++ b/cpp-package/include/mxnet-cpp/optimizer.hpp @@ -270,8 +270,8 @@ inline RMSPropOptimizer::RMSPropOptimizer(unsigned begin_num_update) : Optimizer(begin_num_update) { update_handle_ = op_map()->GetSymbolCreator("rmsprop_update"); alex_update_handle_ = op_map()->GetSymbolCreator("rmspropalex_update"); - SetParam("gamma1", 0.9f); - SetParam("gamma2", 0.9f); + SetParam("rho", 0.9f); + SetParam("momentum", 0.9f); SetParam("epsilon", 1e-8); } diff --git a/docs/python_docs/python/tutorials/packages/optimizer/index.md b/docs/python_docs/python/tutorials/packages/optimizer/index.md index 3350cc6f5a9a..cc5c06d06a0d 100644 --- a/docs/python_docs/python/tutorials/packages/optimizer/index.md +++ b/docs/python_docs/python/tutorials/packages/optimizer/index.md @@ -181,10 +181,10 @@ Here is an example snippet creating the RMSProp optimizer in MXNet. ```python -rmsprop_optimizer = optimizer.RMSProp(learning_rate=0.001, gamma1=0.9, gamma2=0.9, epsilon=1e-07, centered=False) +rmsprop_optimizer = optimizer.RMSProp(learning_rate=0.001, rho=0.9, momentum=0.9, epsilon=1e-07, centered=False) ``` -In the code snippet above, `gamma1` is $\beta$ in the equations above and `gamma2` is $\gamma$, which is only used where `centered=True`. 
+In the code snippet above, `rho` is $\beta$ in the equations above and `momentum` is $\gamma$, which is only used where `centered=True`. ### [AdaDelta](/api/python/docs/api/optimizer/index.html#mxnet.optimizer.AdaDelta) diff --git a/example/speech_recognition/deepspeech.cfg b/example/speech_recognition/deepspeech.cfg index 69894ae7d640..387d4f31eb2b 100644 --- a/example/speech_recognition/deepspeech.cfg +++ b/example/speech_recognition/deepspeech.cfg @@ -112,7 +112,7 @@ optimizer_params_dictionary={"momentum":0.9} # adagrad # optimizer_params_dictionary={"eps":1e-08} # rmsprop -# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08} +# optimizer_params_dictionary={"rho":0.9, "momentum":0.9,"epsilon":1e-08} # adadelta # optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08} # set to 0 to disable gradient clipping diff --git a/example/speech_recognition/default.cfg b/example/speech_recognition/default.cfg index b0869a9dad2e..f583da11c61a 100644 --- a/example/speech_recognition/default.cfg +++ b/example/speech_recognition/default.cfg @@ -109,7 +109,7 @@ optimizer_params_dictionary={"beta1":0.9,"beta2":0.999} # adagrad # optimizer_params_dictionary={"eps":1e-08} # rmsprop -# optimizer_params_dictionary={"gamma1":0.9, "gamma2":0.9,"epsilon":1e-08} +# optimizer_params_dictionary={"rho":0.9, "momentum":0.9,"epsilon":1e-08} # adadelta # optimizer_params_dictionary={"rho":0.95, "epsilon":1e-08} # set to 0 to disable gradient clipping diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm index 7e78cd384220..114ee37afc70 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm @@ -1359,11 +1359,10 @@ __PACKAGE__->register; learning_rate : Num, optional Step size. Default value is set to 0.001. - gamma1: Num, optional + rho: Num, optional decay factor of moving average for gradient^2. Default value is set to 0.9. - gamma2: Num, optional - "momentum" factor. + momentum: Num, optional Default value if set to 0.9. Only used if centered=True epsilon : Num, optional @@ -1386,8 +1385,8 @@ use Mouse; extends 'AI::MXNet::Optimizer'; has '+learning_rate' => (default => 0.001); -has 'gamma1' => (is => "ro", isa => "Num", default => 0.9); -has 'gamma2' => (is => "ro", isa => "Num", default => 0.9); +has 'rho' => (is => "ro", isa => "Num", default => 0.9); +has 'momentum' => (is => "ro", isa => "Num", default => 0.9); has 'epsilon' => (is => "ro", isa => "Num", default => 1e-8); has 'centered' => (is => "ro", isa => "Bool", default => 0); has 'clip_weights' => (is => "ro", isa => "Num"); @@ -1397,12 +1396,12 @@ sub BUILD { my $self = shift; $self->kwargs({ - gamma1 => $self->gamma1, + rho => $self->rho, epsilon => $self->epsilon }); if($self->centered) { - $self->kwargs->{gamma2} = $self->gamma2; + $self->kwargs->{momentum} = $self->momentum; } if($self->clip_gradient) { diff --git a/perl-package/AI-MXNet/t/test_optimizers.t b/perl-package/AI-MXNet/t/test_optimizers.t index af3e54e554f3..bf2dd0665c87 100644 --- a/perl-package/AI-MXNet/t/test_optimizers.t +++ b/perl-package/AI-MXNet/t/test_optimizers.t @@ -109,11 +109,10 @@ method update($index, $weight, $grad, $state) learning_rate : float, optional Step size. Default value is set to 0.001. - gamma1: float, optional + rho: float, optional decay factor of moving average for gradient, gradient^2. Default value is set to 0.9. - gamma2: float, optional - "momentum" factor. + momentum: float, optional Default value if set to 0.9. 
Only used if centered=True epsilon : float, optional @@ -134,8 +133,8 @@ package PerlRMSProp; use Mouse; extends 'AI::MXNet::Optimizer'; has '+learning_rate' => (default => 0.001); -has 'gamma1' => (is => "ro", isa => "Num", default => 0.9); -has 'gamma2' => (is => "ro", isa => "Num", default => 0.9); +has 'rho' => (is => "ro", isa => "Num", default => 0.9); +has 'momentum' => (is => "ro", isa => "Num", default => 0.9); has 'epsilon' => (is => "ro", isa => "Num", default => 1e-8); has 'centered' => (is => "ro", isa => "Bool", default => 0); has 'clip_weights' => (is => "ro", isa => "Num"); @@ -182,7 +181,7 @@ method update($index, $weight, $grad, $state) { $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); } - $n .= (1 - $self->gamma1) * ($grad * $grad) + $self->gamma1 * $n; + $n .= (1 - $self->rho) * ($grad * $grad) + $self->rho * $n; $weight -= $lr * $grad/(mx->nd->sqrt($n + $self->epsilon)); } else @@ -192,9 +191,9 @@ method update($index, $weight, $grad, $state) { $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); } - $n .= (1 - $self->gamma1) * ($grad * $grad) + $self->gamma1 * $n; - $g .= (1 - $self->gamma1) * $grad + $self->gamma1 * $g; - $delta .= ($self->gamma2) * $delta - $lr * $grad/(mx->nd->sqrt($n - $g*$g + $self->epsilon)); + $n .= (1 - $self->rho) * ($grad * $grad) + $self->rho * $n; + $g .= (1 - $self->rho) * $grad + $self->rho * $g; + $delta .= ($self->momentum) * $delta - $lr * $grad/(mx->nd->sqrt($n - $g*$g + $self->epsilon)); $weight += $delta; } if($self->clip_weights) diff --git a/scala-package/core/src/main/scala/org/apache/mxnet/optimizer/RMSProp.scala b/scala-package/core/src/main/scala/org/apache/mxnet/optimizer/RMSProp.scala index 49fca6a1242e..71b20b8c356d 100644 --- a/scala-package/core/src/main/scala/org/apache/mxnet/optimizer/RMSProp.scala +++ b/scala-package/core/src/main/scala/org/apache/mxnet/optimizer/RMSProp.scala @@ -26,15 +26,15 @@ import org.apache.mxnet.NDArrayConversions._ * http://arxiv.org/pdf/1308.0850v5.pdf Eq(38) - Eq(45) by Alex Graves, 2013. * * @param learningRate Float, Step size. - * @param gamma1 Float, decay factor of moving average for gradient, gradient^^2. - * @param gamma2 Float, momentum factor of moving average for gradient. + * @param rho Float, decay factor of moving average for gradient, gradient^^2. + * @param momentum Float, momentum factor of moving average for gradient. * @param rescaleGradient Float, rescaling factor of gradient. 
* @param wd Float, L2 regularization coefficient add to all the weights * @param clipGradient Float, clip gradient in range [-clip_gradient, clip_gradient] * @param lrScheduler The learning rate scheduler */ class RMSProp(val learningRate: Float = 0.002f, rescaleGradient: Float = 1.0f, - gamma1: Float = 0.95f, gamma2: Float = 0.9f, wd: Float = 0.0f, + rho: Float = 0.95f, momentum: Float = 0.9f, wd: Float = 0.0f, lrScheduler: LRScheduler = null, clipGradient: Float = 0f) extends Optimizer { /** @@ -57,18 +57,18 @@ class RMSProp(val learningRate: Float = 0.002f, rescaleGradient: Float = 1.0f, oldResdGrad.dispose() } - val nUpdated = ((1 - this.gamma1) * (resdGrad * resdGrad) + this.gamma1 * n) + val nUpdated = ((1 - this.rho) * (resdGrad * resdGrad) + this.rho * n) .disposeDepsExcept(resdGrad, n) n.set(nUpdated) nUpdated.dispose() - val gUpdated = ((1 - this.gamma1) * resdGrad + this.gamma1 * g) + val gUpdated = ((1 - this.rho) * resdGrad + this.rho * g) .disposeDepsExcept(resdGrad, g) g.set(gUpdated) gUpdated.dispose() val deltaUpdated = - (this.gamma2 * delta - lr * (resdGrad / NDArray.sqrt(n - g * g + 1e-4f) + wd * weight)) + (this.momentum * delta - lr * (resdGrad / NDArray.sqrt(n - g * g + 1e-4f) + wd * weight)) .disposeDepsExcept(delta, resdGrad, n, g, weight) delta.set(deltaUpdated) deltaUpdated.dispose() diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index b7dea1015bdb..c511f8417a72 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -2160,7 +2160,8 @@ struct FtrlDnsRspDnsKernel { DType d = - sign::Map(z_data[data_i]) * maximum::Map(abs::Map(z_data[data_i]) - lamda1, static_cast(0)); - KERNEL_ASSIGN(out_data[data_i], req, d / ((beta + square_root::Map(n_data[data_i])) / lr + wd)); + KERNEL_ASSIGN(out_data[data_i], req, + d / ((beta + square_root::Map(n_data[data_i])) / lr + wd)); } } }; diff --git a/tools/caffe_translator/src/main/resources/templates/opt_rmsprop.st b/tools/caffe_translator/src/main/resources/templates/opt_rmsprop.st index 6baec42951d5..1ace86d36f61 100644 --- a/tools/caffe_translator/src/main/resources/templates/opt_rmsprop.st +++ b/tools/caffe_translator/src/main/resources/templates/opt_rmsprop.st @@ -18,7 +18,7 @@ !> -gamma1 = +rho = epsilon = @@ -26,7 +26,7 @@ epsilon = optimizer_params={'learning_rate':base_lr<\\> , 'wd':wd<\\> -, 'gamma1':gamma1<\\> +, 'rho':rho<\\> , 'epsilon':epsilon}<\\> module.init_optimizer(optimizer='RMSProp', optimizer_params=optimizer_params) From f0519f8d65f7fbd28cbc23438ff552ebad77a353 Mon Sep 17 00:00:00 2001 From: Zheng Date: Wed, 22 Jan 2020 15:08:59 -0800 Subject: [PATCH 05/10] fix signum test --- python/mxnet/optimizer/signum.py | 5 +++-- src/operator/optimizer_op-inl.h | 21 +++++++++------------ 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/python/mxnet/optimizer/signum.py b/python/mxnet/optimizer/signum.py index 16188ccd2fb8..26cbe2a605d1 100644 --- a/python/mxnet/optimizer/signum.py +++ b/python/mxnet/optimizer/signum.py @@ -149,14 +149,15 @@ def fused_step(self, indices, weights, grads, states): kwargs['momentum'] = self.momentum if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient - if self.wd_lh: - kwargs['wd_lh'] = self.wd_lh # update weight with fused kernel if state is not None: + if self.wd_lh: + kwargs['wd_lh'] = self.wd_lh signum_update(weight, grad, state, out=weight, lr=lr, wd=wd, **kwargs) else: + wd += self.wd_lh signsgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) diff --git 
a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index c511f8417a72..952bfc9ecd2e 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -2298,8 +2298,8 @@ struct SignSGDKernel { // param_clip_gradient has no effect for SignSGD KERNEL_ASSIGN(out_data[i], req, - (1.f-param_lr*param_wd)*weight_data[i] - - (param_lr)*((grad_data[i] > 0) - (grad_data[i] < 0))); + (1.f - param_lr * param_wd) * weight_data[i] + - (param_lr) * mshadow_op::sign::Map(grad_data[i])); } }; @@ -2363,18 +2363,15 @@ struct SignumKernel { const DType* grad_data, const DType param_clip_gradient, const DType param_momentum, const DType param_lr, const DType param_wd, const DType param_rescale_grad, const DType param_wd_lh, const OpReqType req) { + DType rescale_grad = param_rescale_grad * grad_data[i]; if (param_clip_gradient >= 0.0f) { - mom_data[i] = param_momentum*mom_data[i] - - (1-param_momentum)*param_wd*weight_data[i] - - (1-param_momentum) - *mshadow_op::clip::Map(param_rescale_grad*grad_data[i], param_clip_gradient); - } else { - mom_data[i] = param_momentum*mom_data[i] - - (1-param_momentum)*param_wd*weight_data[i] - - (1-param_momentum)*param_rescale_grad*grad_data[i]; + rescale_grad = mshadow_op::clip::Map(rescale_grad, param_clip_gradient); } - KERNEL_ASSIGN(out_data[i], req, (1.f-param_lr*param_wd_lh)*weight_data[i] - + (param_lr)*((mom_data[i] > 0) - (mom_data[i] < 0))); + rescale_grad += param_wd * weight_data[i]; + mom_data[i] *= param_momentum; + mom_data[i] -= (1 - param_momentum) * rescale_grad; + KERNEL_ASSIGN(out_data[i], req, (1.f - param_lr * param_wd_lh) * weight_data[i] + + (param_lr) * mshadow_op::sign::Map(mom_data[i])); } }; From a85edaf86e56d69a66ae7bcace471f64d01aec86 Mon Sep 17 00:00:00 2001 From: Zheng Date: Wed, 22 Jan 2020 17:49:42 -0800 Subject: [PATCH 06/10] fix pylint and perl test --- perl-package/AI-MXNet/t/test_optimizers.t | 29 +++++++++++-------- .../svrg_optimization/svrg_optimizer.py | 2 ++ python/mxnet/optimizer/adadelta.py | 3 +- python/mxnet/optimizer/adagrad.py | 4 --- python/mxnet/optimizer/adam.py | 2 -- python/mxnet/optimizer/adamax.py | 5 ++-- python/mxnet/optimizer/contrib.py | 6 +--- python/mxnet/optimizer/dcasgd.py | 5 ++-- python/mxnet/optimizer/ftml.py | 2 -- python/mxnet/optimizer/ftrl.py | 4 --- python/mxnet/optimizer/lamb.py | 6 ++-- python/mxnet/optimizer/lars.py | 9 ++---- python/mxnet/optimizer/nadam.py | 3 +- python/mxnet/optimizer/nag.py | 4 --- python/mxnet/optimizer/optimizer.py | 6 ++-- python/mxnet/optimizer/rmsprop.py | 4 --- python/mxnet/optimizer/sgd.py | 5 ---- python/mxnet/optimizer/sgld.py | 5 ++-- python/mxnet/optimizer/signum.py | 5 ---- python/mxnet/optimizer/updater.py | 8 ++--- python/mxnet/optimizer/utils.py | 2 +- python/mxnet/test_utils.py | 4 +-- 22 files changed, 41 insertions(+), 82 deletions(-) diff --git a/perl-package/AI-MXNet/t/test_optimizers.t b/perl-package/AI-MXNet/t/test_optimizers.t index bf2dd0665c87..d51047cccb89 100644 --- a/perl-package/AI-MXNet/t/test_optimizers.t +++ b/perl-package/AI-MXNet/t/test_optimizers.t @@ -76,11 +76,12 @@ method update($index, $weight, $grad, $state) my $t = $self->_index_update_count->{$index}; my ($mean, $variance) = @$state; my $wd = $self->_get_wd($index); - $grad = $grad * $self->rescale_grad + $wd * $weight; + $grad = $grad * $self->rescale_grad; if($self->clip_gradient) { mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient, out => $grad); } + $grad += $wd * $weight; $mean *= $self->beta1; $mean += $grad * (1 - $self->beta1); @@ 
-173,7 +174,7 @@ method update($index, $weight, $grad, $state) my $lr = $self->_get_lr($index); my $wd = $self->_get_wd($index); $self->_update_count($index); - $grad = $grad * $self->rescale_grad + $wd * $weight; + $grad = $grad * $self->rescale_grad; if(not $self->centered) { my ($n) = @$state; @@ -181,8 +182,9 @@ method update($index, $weight, $grad, $state) { $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); } + $grad += $wd * $weight; $n .= (1 - $self->rho) * ($grad * $grad) + $self->rho * $n; - $weight -= $lr * $grad/(mx->nd->sqrt($n + $self->epsilon)); + $weight -= $lr * $grad/(mx->nd->sqrt($n) + $self->epsilon); } else { @@ -191,6 +193,7 @@ method update($index, $weight, $grad, $state) { $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); } + $grad += $wd * $weight; $n .= (1 - $self->rho) * ($grad * $grad) + $self->rho * $n; $g .= (1 - $self->rho) * $grad + $self->rho * $g; $delta .= ($self->momentum) * $delta - $lr * $grad/(mx->nd->sqrt($n - $g*$g + $self->epsilon)); @@ -442,12 +445,13 @@ method update($index, $weight, $grad, $state) } else { + $grad += $wd * $weight; my $mom = $state; $mom *= $self->momentum; - $grad += $wd * $weight; - $mom += $grad; + $mom -= $lr * $grad; + $grad *= -$lr; $grad += $self->momentum * $mom; - $weight += -$lr * $grad; + $weight += $grad; } } else @@ -466,11 +470,12 @@ method update($index, $weight, $grad, $state) } else { - $mom *= $self->momentum; $grad32 += $wd * $weight32; - $mom += $grad32; + $mom *= $self->momentum; + $mom -= $lr * $grad32; + $grad32 *= -$lr; $grad32 += $self->momentum * $mom; - $weight32 += -$lr * $grad32; + $weight32 += $grad32; } my $tmp = $weight32->astype($weight->dtype); $tmp->copyto($weight); @@ -603,8 +608,8 @@ method update($index, $weight, $grad, $state) $n->at($row) += $grad_row * $grad_row; # update weight - $weight->at($row) .= (mx->nd->sign($dn->at($row)) * $self->lamda1 - $dn->at($row)) / - (($self->beta + mx->nd->sqrt($n->at($row))) / $lr + $wd) * (mx->nd->abs($dn->at($row)) > $self->lamda1); + $weight->at($row) .= - mx->nd->sign($dn->at($row)) * mx->nd->maximum(mx->nd->abs($dn->at($row)) - $self->lamda1, 0) / + (($self->beta + mx->nd->sqrt($n->at($row))) / $lr + $wd); } } @@ -631,7 +636,7 @@ method update($index, $weight, $grad, $state) $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); } $history += mx->nd->square($grad); - my $div = $grad / mx->nd->sqrt($history + $self->eps); + my $div = $grad / (mx->nd->sqrt($history) + $self->eps); $weight += ($div + $weight * $wd) * -$lr; } diff --git a/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py b/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py index 095727a32f88..fba99a0434d7 100644 --- a/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py +++ b/python/mxnet/contrib/svrg_optimization/svrg_optimizer.py @@ -14,6 +14,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. + +# pylint: disable=W0223 """A `_SVRGOptimizer` encapsulates two optimizers to support SVRGModule in single machine and distributed settings. Both `_AssignmentOptimizer` and `_SVRGOptimizer` are designed to be used with SVRGModule only. 
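The `# pylint: disable=W0223` markers added in this commit silence the abstract-method warning for optimizer classes that deliberately implement only part of the refactored base-class interface, for example overriding `update` directly (as the SVRG helpers above do) instead of providing `step`/`fused_step`. A minimal sketch of an optimizer written against the new list-based signature, assuming only that `mxnet` is importable; the class name is made up for illustration:

```python
import mxnet as mx

class AssignLike(mx.optimizer.Optimizer):
    """Hypothetical stateless optimizer using the list-based update API."""

    def create_state(self, index, weight):
        return None  # no per-parameter state needed

    def update(self, indices, weights, grads, states):
        # indices, weights, grads and states are now parallel lists,
        # so the implementation iterates them together.
        for weight, grad in zip(weights, grads):
            weight[:] = grad

# usage sketch: opt.update([0], [w], [g], [None]) with NDArray w and g
```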
""" diff --git a/python/mxnet/optimizer/adadelta.py b/python/mxnet/optimizer/adadelta.py index 0c6fdd02aaca..a8f01401e282 100644 --- a/python/mxnet/optimizer/adadelta.py +++ b/python/mxnet/optimizer/adadelta.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -# pylint: disable=too-many-lines +# pylint: disable=W0223 """AdaDelta optimizer.""" from __future__ import absolute_import from ..ndarray import (zeros, clip, sqrt, square) @@ -89,7 +89,6 @@ def step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] # preprocess grad grad *= self.rescale_grad diff --git a/python/mxnet/optimizer/adagrad.py b/python/mxnet/optimizer/adagrad.py index 8e181fda90a6..aa31abf0030b 100644 --- a/python/mxnet/optimizer/adagrad.py +++ b/python/mxnet/optimizer/adagrad.py @@ -15,8 +15,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# pylint: disable=too-many-lines """AdaGrad optimizer""" from __future__ import absolute_import from ..ndarray import (zeros, clip, sqrt, square) @@ -91,7 +89,6 @@ def step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] # preprocess grad grad *= self.rescale_grad @@ -131,7 +128,6 @@ def fused_step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] kwargs = {'epsilon': self.epsilon, 'rescale_grad': self.rescale_grad} if self.clip_gradient: kwargs['clip_gradient'] = self.clip_gradient diff --git a/python/mxnet/optimizer/adam.py b/python/mxnet/optimizer/adam.py index a08c5c73e6fe..24500c917433 100644 --- a/python/mxnet/optimizer/adam.py +++ b/python/mxnet/optimizer/adam.py @@ -15,8 +15,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# pylint: disable=too-many-lines """Adam optimizer.""" from __future__ import absolute_import import math diff --git a/python/mxnet/optimizer/adamax.py b/python/mxnet/optimizer/adamax.py index 50af82138f43..a2ffd9c68b2c 100644 --- a/python/mxnet/optimizer/adamax.py +++ b/python/mxnet/optimizer/adamax.py @@ -15,10 +15,10 @@ # specific language governing permissions and limitations # under the License. -# pylint: disable=too-many-lines +# pylint: disable=W0223 """Adamax optimizer.""" from __future__ import absolute_import -from ..ndarray import (zeros, clip, sqrt, maximum, abs as NDabs) +from ..ndarray import (zeros, clip, maximum, abs as NDabs) from .optimizer import Optimizer, register __all__ = ['Adamax'] @@ -109,4 +109,3 @@ def step(self, indices, weights, grads, states): # update weight d = mean / var weight[:] -= lr * d - diff --git a/python/mxnet/optimizer/contrib.py b/python/mxnet/optimizer/contrib.py index 1092db3979ea..71cda70098be 100644 --- a/python/mxnet/optimizer/contrib.py +++ b/python/mxnet/optimizer/contrib.py @@ -15,8 +15,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
- -# pylint: disable=too-many-lines """Contrib optimizers.""" from ..ndarray import (clip, contrib, mean, sqrt, square, zeros) from .optimizer import Optimizer, register @@ -94,7 +92,6 @@ def step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' # preprocess grad @@ -134,7 +131,6 @@ def fused_step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] assert wd == 0, 'Weight decay is not supported for GroupAdaGrad' kwargs = {'epsilon': self.epsilon, 'rescale_grad': self.rescale_grad} @@ -153,4 +149,4 @@ def fused_step(self, indices, weights, grads, states): **kwargs) else: # When the grad is not sparse, the func step is called to update weight and state - self.step([index], [weight], [grad], [state]) \ No newline at end of file + self.step([index], [weight], [grad], [state]) diff --git a/python/mxnet/optimizer/dcasgd.py b/python/mxnet/optimizer/dcasgd.py index f9ef2624c982..789ceeb03cd7 100644 --- a/python/mxnet/optimizer/dcasgd.py +++ b/python/mxnet/optimizer/dcasgd.py @@ -16,10 +16,10 @@ # specific language governing permissions and limitations # under the License. -# pylint: disable=too-many-lines +# pylint: disable=W0223 """DCASGD optimizer.""" from __future__ import absolute_import -from ..ndarray import (zeros, clip, sqrt, square) +from ..ndarray import (zeros, clip, square) from .optimizer import Optimizer, register __all__ = ['DCASGD'] @@ -88,7 +88,6 @@ def step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] # preprocess grad grad *= self.rescale_grad diff --git a/python/mxnet/optimizer/ftml.py b/python/mxnet/optimizer/ftml.py index 9b5aec3054d4..d9f1ccb5d080 100644 --- a/python/mxnet/optimizer/ftml.py +++ b/python/mxnet/optimizer/ftml.py @@ -15,8 +15,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# pylint: disable=too-many-lines """FTML optimizer.""" from __future__ import absolute_import from ..ndarray import (zeros, clip, sqrt, square) diff --git a/python/mxnet/optimizer/ftrl.py b/python/mxnet/optimizer/ftrl.py index b0e484b8f971..a14a1a79b2ee 100644 --- a/python/mxnet/optimizer/ftrl.py +++ b/python/mxnet/optimizer/ftrl.py @@ -14,8 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
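The `GroupAdaGrad.fused_step` hunk above keeps the fused kernel path for sparse gradients and falls back to the pure-Python `step` for dense ones. The dispatch pattern, reduced to a sketch (the placeholder `_sparse_update` stands in for the real fused-operator call and its kwargs):

```python
def fused_step(self, indices, weights, grads, states):
    for index, weight, grad, state in zip(indices, weights, grads, states):
        self._update_count(index)
        if grad.stype == 'row_sparse':
            # sparse gradients go through the fused C++ kernel
            self._sparse_update(index, weight, grad, state)  # placeholder name
        else:
            # dense gradients reuse the reference Python implementation
            self.step([index], [weight], [grad], [state])
```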
- -# pylint: disable=too-many-lines """FTRL optimizer.""" from __future__ import absolute_import from ..ndarray import (zeros, clip, sqrt, square, sign, maximum, abs as NDabs) @@ -115,7 +113,6 @@ def step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] # preprocess grad grad *= self.rescale_grad @@ -160,7 +157,6 @@ def fused_step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] kwargs = {'lamda1': self.lamda1, 'beta': self.beta, 'rescale_grad': self.rescale_grad} if self.clip_gradient: diff --git a/python/mxnet/optimizer/lamb.py b/python/mxnet/optimizer/lamb.py index 11b7e18c0bf3..f1f7e1347f4b 100644 --- a/python/mxnet/optimizer/lamb.py +++ b/python/mxnet/optimizer/lamb.py @@ -15,8 +15,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# pylint: disable=too-many-lines """Lamb optimizer.""" from __future__ import absolute_import import numpy @@ -239,7 +237,8 @@ def fused_step(self, indices, weights, grads, states): kwargs['upper_bound'] = self.upper_bound r_1 = weight32.norm() r_2 = g.norm() - mp_lamb_update_phase2(weight, g, r_1, r_2, weight32, lr=lr, out=weight, **kwargs) + mp_lamb_update_phase2(weight, g, r_1, r_2, weight32, lr=lr, + out=weight, **kwargs) else: mean, var = state g = lamb_update_phase1(weight, grad, mean, var, wd=wd, **kwargs) @@ -260,4 +259,3 @@ def update_multi_precision(self, indices, weights, grads, states): self.update(indices, weights, grads, states) else: super(LAMB, self).update_multi_precision(indices, weights, grads, states) - diff --git a/python/mxnet/optimizer/lars.py b/python/mxnet/optimizer/lars.py index 1cd746c6dd32..9492a9380018 100644 --- a/python/mxnet/optimizer/lars.py +++ b/python/mxnet/optimizer/lars.py @@ -15,14 +15,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# pylint: disable=too-many-lines """LARS optimizer.""" from __future__ import absolute_import -import os import numpy -from ..ndarray import (zeros, clip, sqrt, array, - multi_sum_sq, multi_lars, norm as NDnorm, +from ..ndarray import (zeros, clip, array, + multi_sum_sq, multi_lars, + norm as NDnorm, where, ones_like) from ..ndarray import (sgd_update, sgd_mom_update, mp_sgd_update, mp_sgd_mom_update, @@ -145,7 +143,6 @@ def step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] # compute lars # clip grad + wd * weight is performed after computing lars diff --git a/python/mxnet/optimizer/nadam.py b/python/mxnet/optimizer/nadam.py index 483a44a8cc46..a0e298696842 100644 --- a/python/mxnet/optimizer/nadam.py +++ b/python/mxnet/optimizer/nadam.py @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. 
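In the LAMB hunk above, `r_1` and `r_2` are the norms of the parameter block and of the Adam-style update `g`; phase 2 rescales the update by their ratio before applying the learning rate. A NumPy sketch of that trust-ratio idea, ignoring the `lower_bound`/`upper_bound` options and multi-precision handling:

```python
import numpy as np

def lamb_phase2(weight, g, lr):
    """Scale the update by ||w|| / ||g|| before the learning-rate step (sketch)."""
    r_1 = np.linalg.norm(weight)   # norm of the parameter block
    r_2 = np.linalg.norm(g)        # norm of the Adam-style update
    ratio = r_1 / r_2 if r_1 > 0 and r_2 > 0 else 1.0
    return weight - lr * ratio * g

w = np.array([0.5, -0.3, 0.8])
update = np.array([0.01, 0.02, -0.01])
print(lamb_phase2(w, update, lr=0.1))
```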
-# pylint: disable=too-many-lines +# pylint: disable=W0223 """Nadam optimizer.""" from __future__ import absolute_import from ..ndarray import (zeros, clip, sqrt, square) @@ -122,4 +122,3 @@ def step(self, indices, weights, grads, states): # update weight d = mean_bar / (sqrt(var_prime) + self.epsilon) weight[:] -= lr * d - diff --git a/python/mxnet/optimizer/nag.py b/python/mxnet/optimizer/nag.py index 463f7949a9e9..8b816a729637 100644 --- a/python/mxnet/optimizer/nag.py +++ b/python/mxnet/optimizer/nag.py @@ -15,8 +15,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# pylint: disable=too-many-lines """NAG optimizer.""" from __future__ import absolute_import import numpy @@ -91,7 +89,6 @@ def step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] # preprocess grad grad *= self.rescale_grad @@ -132,7 +129,6 @@ def fused_step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] kwargs = {'rescale_grad': self.rescale_grad} if self.momentum > 0: diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index b5e8c2468304..a40bd2575d28 100755 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -340,7 +340,7 @@ def update_multi_precision(self, indices, weights, grads, states): weights_master_copy = [] original_states = [] grads32 = [] - for index, weight, grad, state in zip(indices, weights, grads, states): + for weight, grad, state in zip(weights, grads, states): if self.multi_precision and weight.dtype == numpy.float16: weights_master_copy.append(state[0]) original_states.append(state[1]) @@ -559,7 +559,7 @@ def __setstate__(self, state): # convenience wrapper for Optimizer.Register register = Optimizer.register # pylint: disable=invalid-name - +# pylint: disable=W0223 @register class Test(Optimizer): """The Test optimizer""" @@ -576,10 +576,8 @@ def step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] grad = self.rescale_grad * grad weight[:] -= lr * (grad + wd * weight) create = Optimizer.create_optimizer # pylint: disable=invalid-name - diff --git a/python/mxnet/optimizer/rmsprop.py b/python/mxnet/optimizer/rmsprop.py index b57c82130b4e..2d4b2d618d64 100644 --- a/python/mxnet/optimizer/rmsprop.py +++ b/python/mxnet/optimizer/rmsprop.py @@ -14,8 +14,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
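The `update_multi_precision` hunk above unpacks, for float16 parameters, a float32 master copy and the real optimizer state out of `state`, casts the gradient to float32, updates the master copy, and writes the result back into the float16 weight. A NumPy stand-in for that flow, with plain SGD in place of the actual update rule:

```python
import numpy as np

def update_multi_precision(weight16, grad16, state, lr):
    """Sketch: update the float32 master copy, then cast back to float16."""
    weight32, inner_state = state            # master copy + the real optimizer state
    grad32 = grad16.astype(np.float32)
    weight32 -= lr * grad32                  # plain SGD stands in for the real rule
    weight16[:] = weight32.astype(np.float16)

w16 = np.array([0.1, 0.2], dtype=np.float16)
g16 = np.array([0.01, 0.01], dtype=np.float16)
state = (w16.astype(np.float32), None)
update_multi_precision(w16, g16, state, lr=0.5)
print(w16)
```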
- -# pylint: disable=too-many-lines """RMSProp optimizer.""" from __future__ import absolute_import from ..ndarray import (zeros, clip, sqrt, square) @@ -110,7 +108,6 @@ def step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] # preprocess grad grad *= self.rescale_grad @@ -164,7 +161,6 @@ def fused_step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] kwargs = {'rho': self.rho, 'epsilon': self.epsilon, 'rescale_grad': self.rescale_grad} diff --git a/python/mxnet/optimizer/sgd.py b/python/mxnet/optimizer/sgd.py index 3e0f74928182..7b2905710806 100644 --- a/python/mxnet/optimizer/sgd.py +++ b/python/mxnet/optimizer/sgd.py @@ -15,11 +15,8 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# pylint: disable=too-many-lines """SGD optimizer""" from __future__ import absolute_import -import os import numpy from ..ndarray import (zeros, clip) from ..ndarray import (sgd_update, sgd_mom_update, @@ -138,7 +135,6 @@ def step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] # preprocess grad grad *= self.rescale_grad @@ -244,4 +240,3 @@ def update_multi_precision(self, indices, weights, grads, states): self.update(indices, weights, grads, states) else: super(SGD, self).update_multi_precision(indices, weights, grads, states) - diff --git a/python/mxnet/optimizer/sgld.py b/python/mxnet/optimizer/sgld.py index 8a99d8f977d7..cc97fa5ebcc5 100644 --- a/python/mxnet/optimizer/sgld.py +++ b/python/mxnet/optimizer/sgld.py @@ -16,7 +16,7 @@ # specific language governing permissions and limitations # under the License. -# pylint: disable=too-many-lines +# pylint: disable=W0223 """SGLD optimizer.""" from __future__ import absolute_import import math @@ -71,11 +71,10 @@ def step(self, indices, weights, grads, states): states : List of any obj List of state returned by `create_state()`. """ - for index, weight, grad, state in zip(indices, weights, grads, states): + for index, weight, grad in zip(indices, weights, grads): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] # preprocess grad grad *= self.rescale_grad diff --git a/python/mxnet/optimizer/signum.py b/python/mxnet/optimizer/signum.py index 26cbe2a605d1..136f8240e334 100644 --- a/python/mxnet/optimizer/signum.py +++ b/python/mxnet/optimizer/signum.py @@ -15,8 +15,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
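Tying back to the earlier `gamma1`→`rho`, `gamma2`→`momentum` renaming: `rho` decays the running squared-gradient (and, when `centered=True`, the running mean-gradient) estimates, while `momentum` only enters the centered variant. A NumPy sketch of one centered RMSProp step under the new names, matching the reference update shown in the Perl hunks above (illustration, not the fused kernel):

```python
import numpy as np

def centered_rmsprop_step(weight, grad, n, g, delta, lr,
                          rho=0.95, momentum=0.9, epsilon=1e-8):
    """One centered RMSProp step; n, g, delta are running state arrays (sketch)."""
    n[:] = (1 - rho) * grad ** 2 + rho * n        # second-moment estimate
    g[:] = (1 - rho) * grad + rho * g             # first-moment estimate
    delta[:] = momentum * delta - lr * grad / np.sqrt(n - g ** 2 + epsilon)
    weight += delta
    return weight
```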
- -# pylint: disable=too-many-lines """Signum optimizer.""" from __future__ import absolute_import from ..ndarray import (zeros, clip, sign) @@ -99,7 +97,6 @@ def step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] if state is not None: # preprocess grad @@ -142,7 +139,6 @@ def fused_step(self, indices, weights, grads, states): self._update_count(index) lr = self._get_lr(index) wd = self._get_wd(index) - t = self._index_update_count[index] kwargs = {'rescale_grad': self.rescale_grad} if self.momentum > 0: @@ -160,4 +156,3 @@ def fused_step(self, indices, weights, grads, states): wd += self.wd_lh signsgd_update(weight, grad, out=weight, lr=lr, wd=wd, **kwargs) - diff --git a/python/mxnet/optimizer/updater.py b/python/mxnet/optimizer/updater.py index 03398396c449..62b700455075 100644 --- a/python/mxnet/optimizer/updater.py +++ b/python/mxnet/optimizer/updater.py @@ -15,8 +15,6 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. - -# pylint: disable=too-many-lines """Updater class.""" from __future__ import absolute_import import pickle @@ -86,11 +84,11 @@ def __call__(self, index, grad, weight): states) current_index += self.optimizer.aggregate_num else: - states = [self.states[index] for index in indices] + states = [self.states[i] for i in indices] self.optimizer.update_multi_precision(indices, weights, grads, states) else: - for index, weight, grad in zip(indices, weights, grads): - self.optimizer.update_multi_precision([index], [weight], [grad], [self.states[index]]) + for i, w, g in zip(indices, weights, grads): + self.optimizer.update_multi_precision([i], [w], [g], [self.states[i]]) def sync_state_context(self, state, context): """sync state context.""" diff --git a/python/mxnet/optimizer/utils.py b/python/mxnet/optimizer/utils.py index af95a53ccae5..f7dc136c10c7 100644 --- a/python/mxnet/optimizer/utils.py +++ b/python/mxnet/optimizer/utils.py @@ -40,4 +40,4 @@ def _as_classic(a, allow_np): return a.as_nd_ndarray() else: raise ValueError('Converting np.ndarray to mx.nd.NDArray is not allowed') - return a \ No newline at end of file + return a diff --git a/python/mxnet/test_utils.py b/python/mxnet/test_utils.py index d3b056c6efdc..20975b5d3ec9 100755 --- a/python/mxnet/test_utils.py +++ b/python/mxnet/test_utils.py @@ -2316,7 +2316,7 @@ def compare_optimizer_noise_seeded(opt1, opt2, shapes, dtype, noise_seed, if w_stype == 'default': w2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) w1 = w2.copyto(default_context()) - elif w_stype == 'row_sparse' or w_stype == 'csr': + elif w_stype in ('row_sparse', 'csr'): w2 = rand_ndarray(shape, w_stype, density=1, dtype=dtype) w1 = w2.copyto(default_context()).tostype('default') else: @@ -2324,7 +2324,7 @@ def compare_optimizer_noise_seeded(opt1, opt2, shapes, dtype, noise_seed, if g_stype == 'default': g2 = mx.random.uniform(shape=shape, ctx=default_context(), dtype=dtype) g1 = g2.copyto(default_context()) - elif g_stype == 'row_sparse' or g_stype == 'csr': + elif g_stype in ('row_sparse', 'csr'): g2 = rand_ndarray(shape, g_stype, dtype=dtype) g1 = g2.copyto(default_context()).tostype('default') else: From 2ff1ad4af8f5e2c4e15ac15e1898a8ebe068d4ef Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 23 Jan 2020 06:04:05 +0000 Subject: [PATCH 07/10] fix perl test and signsgd test --- .../AI-MXNet/lib/AI/MXNet/Optimizer.pm | 24 
++++++++++--------- perl-package/AI-MXNet/t/test_optimizers.t | 24 ++++++++++--------- python/mxnet/optimizer/signum.py | 6 ++--- src/operator/optimizer_op-inl.h | 4 ++-- tests/python/unittest/test_optimizer.py | 4 +++- 5 files changed, 34 insertions(+), 28 deletions(-) diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm index 114ee37afc70..be29d31f5219 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm @@ -1037,12 +1037,13 @@ method update($index, $weight, $grad, $state) } else { + $grad += $wd * $weight; my $mom = $state; $mom *= $self->momentum; - $grad += $wd * $weight; - $mom += $grad; + $mom -= $lr * $grad; + $grad *= -$lr; $grad += $self->momentum * $mom; - $weight += -$lr * $grad; + $weight += $grad; } } else @@ -1061,11 +1062,12 @@ method update($index, $weight, $grad, $state) } else { + $grad32 += $wd * $weight32; $mom *= $self->momentum; - $grad32 += $wd * $weight32; - $mom += $grad32; + $mom -= $lr * $grad32; + $grad32 *= -$lr; $grad32 += $self->momentum * $mom; - $weight32 += -$lr * $grad32; + $weight32 += $grad32; } my $tmp = $weight32->astype($weight->dtype); $tmp->copyto($weight); @@ -1276,7 +1278,7 @@ __PACKAGE__->register; rescale_grad : Num, optional rescaling factor of gradient. Normally should be 1/batch_size. - eps: Num, optional + epsilon: Num, optional A small float number to make the updating processing stable Default value is set to 1e-7. @@ -1288,7 +1290,7 @@ use Mouse; extends 'AI::MXNet::Optimizer'; -has 'eps' => (is => "rw", isa => "Num", default => 1e-7); +has 'epsilon' => (is => "rw", isa => "Num", default => 1e-7); method create_state(Index $index, AI::MXNet::NDArray $weight) { @@ -1314,7 +1316,7 @@ method update( if($is_sparse) { my %kwargs = ( - epsilon => $self->eps, + epsilon => $self->epsilon, rescale_grad => $self->rescale_grad ); if($self->clip_gradient) @@ -1331,7 +1333,7 @@ method update( $grad = AI::MXNet::NDArray->clip($grad, -$self->clip_gradient, $self->clip_gradient); } $history += $grad->square; - my $div = $grad / ($history + $self->eps)->sqrt; + my $div = $grad / (($history)->sqrt + $self->epsilon); $weight += ($div + $weight * $wd) * -$lr; } } @@ -1460,7 +1462,7 @@ method update( if($self->centered) { AI::MXNet::NDArray->rmspropalex_update( - $weight, $grad, $n, $g, $delta, + $weight, $grad, $g, $n, $delta, { out => $weight, lr => $lr, diff --git a/perl-package/AI-MXNet/t/test_optimizers.t b/perl-package/AI-MXNet/t/test_optimizers.t index d51047cccb89..5b944e7c2a08 100644 --- a/perl-package/AI-MXNet/t/test_optimizers.t +++ b/perl-package/AI-MXNet/t/test_optimizers.t @@ -503,11 +503,12 @@ method update($index, $weight, $grad, $state) my $wd = $self->_get_wd($index); my $t = $self->_index_update_count->{$index}; - my $grad = $grad * $self->rescale_grad + $wd * $weight; + my $grad = $grad * $self->rescale_grad; if(defined $self->clip_gradient) { $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); } + $grad += $wd * $weight; # get previous states my ($prev_d, $prev_v, $prev_z) = @{ $state }; # compute states @@ -608,7 +609,7 @@ method update($index, $weight, $grad, $state) $n->at($row) += $grad_row * $grad_row; # update weight - $weight->at($row) .= - mx->nd->sign($dn->at($row)) * mx->nd->maximum(mx->nd->abs($dn->at($row)) - $self->lamda1, 0) / + $weight->at($row) .= - mx->nd->sign($dn->at($row)) * (mx->nd->abs($dn->at($row)) - $self->lamda1)->maximum(0) / (($self->beta + mx->nd->sqrt($n->at($row))) / 
$lr + $wd); } } @@ -617,7 +618,7 @@ package PerlAdaGrad; use Mouse; extends 'AI::MXNet::Optimizer'; -has 'eps' => (is => 'rw', default => 1e-7); +has 'epsilon' => (is => 'rw', default => 1e-7); method create_state($index, $weight) { mx->nd->zeros($weight->shape, ctx => $weight->context, stype => $weight->stype); @@ -635,9 +636,10 @@ method update($index, $weight, $grad, $state) { $grad = mx->nd->clip($grad, -$self->clip_gradient, $self->clip_gradient); } + $grad += $wd * $weight; $history += mx->nd->square($grad); - my $div = $grad / (mx->nd->sqrt($history) + $self->eps); - $weight += ($div + $weight * $wd) * -$lr; + my $div = $grad / (mx->nd->sqrt($history) + $self->epsilon); + $weight -= $lr * $div; } package main; @@ -1056,7 +1058,7 @@ sub test_adagrad my $opt1 = 'PerlAdaGrad'; my $opt2 = mx->optimizer->AdaGrad; my $shape = [3, 4, 5]; - my @eps_options= ({}, {eps => 1e-9}); + my @eps_options= ({epsilon => 1e-9}); my @cg_options = ({}, {clip_gradient => 0.4}, {clip_gradient => 0.5}); my @rg_options = ({}, {rescale_grad => 0.14}, {rescale_grad => 0.8}); my @wd_options = ({}, {wd => 0}); @@ -1076,11 +1078,11 @@ sub test_adagrad %kwarg = (%kwarg, %$rg_option); %kwarg = (%kwarg, %$wd_option); compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype); - if(($wd_option->{wd}//0) == 0) - { - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype, 'row_sparse', 'row_sparse'); - compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype, 'default', 'row_sparse'); - } + if($wd_option->{wd} == 0) + { + compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype, 'row_sparse', 'row_sparse'); + compare_optimizer($opt1->new(%kwarg), $opt2->new(%kwarg), $shape, $dtype, 'default', 'row_sparse'); + } } } } diff --git a/python/mxnet/optimizer/signum.py b/python/mxnet/optimizer/signum.py index 136f8240e334..b8dece961f5c 100644 --- a/python/mxnet/optimizer/signum.py +++ b/python/mxnet/optimizer/signum.py @@ -109,14 +109,14 @@ def step(self, indices, weights, grads, states): mom = state mom[:] *= self.momentum mom[:] -= (1 - self.momentum) * grad - weight[:] *= 1 - lr * self.wd_lh # update weight - weight[:] += lr * sign(mom) + weight[:] *= 1 - lr * self.wd_lh + weight[:] += lr * ((mom > 0) - (mom < 0)) else: # update weight weight[:] *= 1 - lr * (wd + self.wd_lh) - weight[:] -= lr * sign(grad) + weight[:] -= lr * ((grad > 0) - (grad < 0)) def fused_step(self, indices, weights, grads, states): """Perform a fused optimization step using gradients and states. 
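The Python `step` above now writes the sign as `(x > 0) - (x < 0)` so it matches the C++ kernels changed in the same commit, and keeps `wd_lh` as a decoupled decay applied directly to the weight. A NumPy sketch of the Signum rule as written in that hunk (the boolean casts just reproduce the `(x > 0) - (x < 0)` expression):

```python
import numpy as np

def signum_step(weight, grad, mom, lr, momentum=0.9, wd=0.0, wd_lh=0.0):
    """Signum update (sketch): step in the sign of the momentum buffer."""
    grad = grad + wd * weight                       # ordinary weight decay folded into the gradient
    mom[:] = momentum * mom - (1 - momentum) * grad
    sign = (mom > 0).astype(weight.dtype) - (mom < 0).astype(weight.dtype)
    weight *= 1 - lr * wd_lh                        # decoupled decay (wd_lh)
    weight += lr * sign
    return weight
```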
diff --git a/src/operator/optimizer_op-inl.h b/src/operator/optimizer_op-inl.h index 952bfc9ecd2e..59e3351e669c 100644 --- a/src/operator/optimizer_op-inl.h +++ b/src/operator/optimizer_op-inl.h @@ -2299,7 +2299,7 @@ struct SignSGDKernel { // param_clip_gradient has no effect for SignSGD KERNEL_ASSIGN(out_data[i], req, (1.f - param_lr * param_wd) * weight_data[i] - - (param_lr) * mshadow_op::sign::Map(grad_data[i])); + - (param_lr) * ((grad_data[i] > 0) - (grad_data[i] < 0))); } }; @@ -2371,7 +2371,7 @@ struct SignumKernel { mom_data[i] *= param_momentum; mom_data[i] -= (1 - param_momentum) * rescale_grad; KERNEL_ASSIGN(out_data[i], req, (1.f - param_lr * param_wd_lh) * weight_data[i] - + (param_lr) * mshadow_op::sign::Map(mom_data[i])); + + (param_lr) * ((mom_data[i] > 0) - (mom_data[i] < 0))); } }; diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index 6137fd9d65df..cdc845d49696 100755 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -550,8 +550,10 @@ def test_signum(): if (dtype == np.float16 and ('multi_precision' not in kwarg or not kwarg['multi_precision'])): continue + rtol, atol = (1e-3, 1e-4) if dtype is np.float16 else (1e-4, 1e-5) compare_optimizer(opt1(use_fused_step=False, **kwarg), - opt2(use_fused_step=True, **kwarg), shapes, dtype) + opt2(use_fused_step=True, **kwarg), shapes, dtype, + rtol=rtol, atol=atol) @with_seed() From 2b182de4def74ea4ac35ee28fc554009a549b9d3 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 23 Jan 2020 22:35:24 +0000 Subject: [PATCH 08/10] fix --- perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm | 3 ++- perl-package/AI-MXNet/t/test_optimizers.t | 2 +- python/mxnet/optimizer/signum.py | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm b/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm index be29d31f5219..e4d8b5abde0b 100644 --- a/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm +++ b/perl-package/AI-MXNet/lib/AI/MXNet/Optimizer.pm @@ -1332,9 +1332,10 @@ method update( { $grad = AI::MXNet::NDArray->clip($grad, -$self->clip_gradient, $self->clip_gradient); } + $grad += $wd * $weight; $history += $grad->square; my $div = $grad / (($history)->sqrt + $self->epsilon); - $weight += ($div + $weight * $wd) * -$lr; + $weight += $div * -$lr; } } diff --git a/perl-package/AI-MXNet/t/test_optimizers.t b/perl-package/AI-MXNet/t/test_optimizers.t index 5b944e7c2a08..26a87cdd75ba 100644 --- a/perl-package/AI-MXNet/t/test_optimizers.t +++ b/perl-package/AI-MXNet/t/test_optimizers.t @@ -1058,7 +1058,7 @@ sub test_adagrad my $opt1 = 'PerlAdaGrad'; my $opt2 = mx->optimizer->AdaGrad; my $shape = [3, 4, 5]; - my @eps_options= ({epsilon => 1e-9}); + my @eps_options= ({}, {epsilon => 1e-9}); my @cg_options = ({}, {clip_gradient => 0.4}, {clip_gradient => 0.5}); my @rg_options = ({}, {rescale_grad => 0.14}, {rescale_grad => 0.8}); my @wd_options = ({}, {wd => 0}); diff --git a/python/mxnet/optimizer/signum.py b/python/mxnet/optimizer/signum.py index b8dece961f5c..0bb44f9bb4cd 100644 --- a/python/mxnet/optimizer/signum.py +++ b/python/mxnet/optimizer/signum.py @@ -17,7 +17,7 @@ # under the License. 
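The Perl AdaGrad hunk above moves `epsilon` outside the square root, dividing by `sqrt(history) + epsilon` instead of `sqrt(history + epsilon)`. The two forms agree once the accumulated squared gradients are large but diverge sharply near zero, which is why the reference and fused implementations must use the same convention. A quick NumPy comparison:

```python
import numpy as np

history = np.array([0.0, 1e-4, 4.0])
grad = np.array([0.1, 0.1, 0.1])
eps = 1e-7

div_new = grad / (np.sqrt(history) + eps)   # epsilon outside the sqrt (this series)
div_old = grad / np.sqrt(history + eps)     # epsilon inside the sqrt (previous form)
print(div_new)   # largest difference where history is ~0
print(div_old)
```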
"""Signum optimizer.""" from __future__ import absolute_import -from ..ndarray import (zeros, clip, sign) +from ..ndarray import (zeros, clip) from ..ndarray import (signsgd_update, signum_update) from .optimizer import Optimizer, register From 092f6069e8ea5bbd439bf500319d01c90ba03fff Mon Sep 17 00:00:00 2001 From: Zheng Date: Fri, 21 Feb 2020 13:59:35 -0800 Subject: [PATCH 09/10] retrigger ci --- python/mxnet/optimizer/optimizer.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/mxnet/optimizer/optimizer.py b/python/mxnet/optimizer/optimizer.py index 91b5b9b5d863..2fef62eb8319 100755 --- a/python/mxnet/optimizer/optimizer.py +++ b/python/mxnet/optimizer/optimizer.py @@ -23,9 +23,7 @@ from ..ndarray import (NDArray, zeros, cast) from ..util import is_np_array -__all__ = [ - 'Optimizer', 'Test', 'create', 'register' -] +__all__ = ['Optimizer', 'Test', 'create', 'register'] class Optimizer(object): From 54333e50b5be64f02c4fb79d3c02e61df52d18dd Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Mon, 24 Feb 2020 23:22:25 +0000 Subject: [PATCH 10/10] reduce ci overheads --- tests/python/unittest/test_optimizer.py | 126 ++++++++++++------------ 1 file changed, 63 insertions(+), 63 deletions(-) diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py index cdc845d49696..2a15e3407862 100755 --- a/tests/python/unittest/test_optimizer.py +++ b/tests/python/unittest/test_optimizer.py @@ -87,8 +87,8 @@ def test_sgd(): cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + mp_options = [{'multi_precision': False}, {'multi_precision': True}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float16, np.float32]: @@ -191,8 +191,8 @@ def test_sparse_sgd(): mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}] + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float32]: for params in itertools.product(mom_options, cg_options, rg_options, @@ -215,7 +215,7 @@ def test_std_sparse_sgd(): cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1}, {'aggregate_num': 4}, {'aggregate_num': np.inf}] for dtype in [np.float32]: @@ -238,9 +238,9 @@ def test_nag(): mom_options = [{}, {'momentum': 0.9}] cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}] rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}] - wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}] - mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}] - agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1}, + wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}] + mp_options = 

diff --git a/tests/python/unittest/test_optimizer.py b/tests/python/unittest/test_optimizer.py
index cdc845d49696..2a15e3407862 100755
--- a/tests/python/unittest/test_optimizer.py
+++ b/tests/python/unittest/test_optimizer.py
@@ -87,8 +87,8 @@ def test_sgd():
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
     wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]

     for dtype in [np.float16, np.float32]:
@@ -191,8 +191,8 @@ def test_sparse_sgd():
     mom_options = [{}, {'momentum': 0.9}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float32]:
         for params in itertools.product(mom_options, cg_options, rg_options,
@@ -215,7 +215,7 @@ def test_std_sparse_sgd():
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
     wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]

     for dtype in [np.float32]:
@@ -238,9 +238,9 @@ def test_nag():
     mom_options = [{}, {'momentum': 0.9}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}]
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]

     for dtype in [np.float16, np.float32]:
@@ -261,12 +261,12 @@ def test_lars():
     opt2 = mx.optimizer.LARS
     shapes = [(3, 4, 5), (10, 4), (7,)]
     eta_options = [{}, {'eta': 0.002}, {'eta': 0.01}]
-    mom_options = [{}, {'momentum': 0.0}, {'momentum': 0.9}]
-    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
-    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    mom_options = [{'momentum': 0.0}, {'momentum': 0.9}]
+    cg_options = [{}, {'clip_gradient': 0.4}]
+    rg_options = [{}, {'rescale_grad': 0.14}]
+    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}]
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(eta_options, mom_options, cg_options, rg_options,
@@ -286,15 +286,15 @@ def test_lamb():
     opt2 = mx.optimizer.LAMB
     shapes = [(3, 4, 5), (10, 4), (7,)]
-    beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}]
-    beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}]
-    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
-    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    bc_options = [{}, {'bias_correction': False}, {'bias_correction': True}]
-    lb_options = [{}, {'lower_bound': None}, {'lower_bound': 1e-3}]
-    ub_options = [{}, {'upper_bound': None}, {'upper_bound': 10}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
+    beta1_options = [{}, {'beta1': 0.5}]
+    beta2_options = [{}, {'beta2': 0.8}]
+    cg_options = [{}, {'clip_gradient': 0.4}]
+    rg_options = [{}, {'rescale_grad': 0.14}]
+    wd_options = [{}, {'wd': 0.03}]
+    bc_options = [{'bias_correction': False}, {'bias_correction': True}]
+    lb_options = [{'lower_bound': None}, {'lower_bound': 1e-3}]
+    ub_options = [{'upper_bound': None}, {'upper_bound': 10}]
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
     agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}]

     for dtype in [np.float16, np.float32]:
@@ -318,8 +318,8 @@ def test_sgld():
     ns_options = [1234, 42]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]

     for seed in ns_options:
@@ -346,8 +346,8 @@ def test_ftml():
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
     wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]

     for dtype in [np.float16, np.float32]:
@@ -446,8 +446,8 @@ def test_adam():
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
     wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(beta1_options, beta2_options, cg_options,
@@ -467,13 +467,13 @@ def test_sparse_adam():
     opt1 = PySparseAdam
     opt2 = mx.optimizer.Adam
     shapes = [(3, 4, 5), (10, 4), (7,)]
-    beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}]
-    beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}]
-    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
-    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    beta1_options = [{}, {'beta1': 0.5}]
+    beta2_options = [{}, {'beta2': 0.8}]
+    cg_options = [{}, {'clip_gradient': 0.4}]
+    rg_options = [{}, {'rescale_grad': 0.14}]
+    wd_options = [{}, {'wd': 0.03}]
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(beta1_options, beta2_options, cg_options,
@@ -515,8 +515,8 @@ def test_adamax():
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
     wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(beta1_options, beta2_options, cg_options,
@@ -539,8 +539,8 @@ def test_signum():
     wd_lh_options = [{}, {'wd_lh': 0.015}, {'wd_lh': 0.0}]
     mom_options = [{}, {'momentum': 0.9}]
     lr_options = [{'learning_rate': 0.05},{'learning_rate': 0.01}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(cg_options, rg_options, wd_options,
@@ -561,15 +561,15 @@ def test_rms():
     opt1 = mx.optimizer.RMSProp
     opt2 = mx.optimizer.RMSProp
     shapes = [(3, 4, 5), (10, 4), (7,)]
-    rho_options = [{}, {'rho': 0.5}, {'rho': 0.7}]
-    cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
+    rho_options = [{}, {'rho': 0.5}]
+    cg_options = [{}, {'clip_gradient': 0.4}]
     cw_options = [{}, {'clip_weights': 0.01}]
-    center_options = [{}, {'centered': False}, {'centered': True}]
-    rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mom_options = [{}, {'momentum': 0.0}, {'momentum': 0.9}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    center_options = [{'centered': False}, {'centered': True}]
+    rg_options = [{}, {'rescale_grad': 0.14}]
+    wd_options = [{}, {'wd': 0.03}]
+    mom_options = [{'momentum': 0.0}, {'momentum': 0.9}]
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         # Reduce foating point compare tolerance to avoid flaky test failure.
@@ -672,12 +672,12 @@ def test_ftrl():
     opt1 = mx.optimizer.Ftrl
     opt2 = mx.optimizer.Ftrl
     shapes = [(3, 4, 5), (10, 4), (7,)]
-    lamda1_options = [{}, {'lamda1': 0.}, {'lamda1': 0.1}]
+    lamda1_options = [{'lamda1': 0.}, {'lamda1': 0.1}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
     wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(lamda1_options, cg_options,
@@ -697,12 +697,12 @@ def test_sparse_ftrl():
     opt1 = PySparseFtrl
     opt2 = mx.optimizer.Ftrl
     shapes = [(3, 4, 5), (10, 4), (7,)]
-    lamda1_options = [{}, {'lamda1': 0.}, {'lamda1': 0.1}]
+    lamda1_options = [{'lamda1': 0.}, {'lamda1': 0.1}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
     wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(lamda1_options, cg_options,
@@ -722,14 +722,14 @@ def test_nadam():
     opt1 = mx.optimizer.Nadam
     opt2 = mx.optimizer.Nadam
     shapes = [(3, 4, 5), (10, 4), (7,)]
-    beta1_options = [{}, {'beta1': 0.5}, {'beta1': 0.7}]
-    beta2_options = [{}, {'beta2': 0.8}, {'beta2': 0.9}]
+    beta1_options = [{}, {'beta1': 0.5}]
+    beta2_options = [{}, {'beta2': 0.8}]
     schedule_decay_options = [{}, {'schedule_decay': 0.008}]
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
-    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    mp_options = [{}, {'multi_precision': False}, {'multi_precision': True}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}]
+    mp_options = [{'multi_precision': False}, {'multi_precision': True}]
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(beta1_options, beta2_options, cg_options,
@@ -818,7 +818,7 @@ def test_adagrad():
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
     wd_options = [{}, {'wd': 0.0}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(eps_options, cg_options,
@@ -839,7 +839,7 @@ def test_sparse_adagrad():
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
     wd_options = [{}, {'wd': 0.0}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(eps_options, cg_options,
@@ -864,7 +864,7 @@ def test_adadelta():
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
     wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(rho_options, eps_options, cg_options,
@@ -885,7 +885,7 @@ def test_dcasgd():
     cg_options = [{}, {'clip_gradient': 0.4}, {'clip_gradient': 0.5}]
     rg_options = [{}, {'rescale_grad': 0.14}, {'rescale_grad': 0.8}]
     wd_options = [{}, {'wd': 0.03}, {'wd': 0.05}, {'wd': 0.07}]
-    agg_options = [{}, {'aggregate_num': 0}, {'aggregate_num': 1},
+    agg_options = [{'aggregate_num': 0}, {'aggregate_num': 1},
                    {'aggregate_num': 4}, {'aggregate_num': np.inf}]
     for dtype in [np.float16, np.float32]:
         for params in itertools.product(lamda_options, mom_options, cg_options,