support sgd & dot & embedding with rsp #75

Merged (1 commit, Jun 10, 2017)
python/mxnet/optimizer.py (3 additions, 1 deletion)
@@ -4,6 +4,7 @@
import logging
from .ndarray import NDArray, zeros, clip, sqrt, sign
from .ndarray import sgd_update, sgd_mom_update, adam_update, rmsprop_update, rmspropalex_update
from .sparse_ndarray import zeros as sparse_zeros
from .random import normal


@@ -332,7 +333,8 @@ def create_state(self, index, weight):
if self.momentum == 0.0:
return None
else:
return zeros(weight.shape, weight.context, dtype=weight.dtype)
return sparse_zeros(weight.storage_type, weight.shape,
weight.context, dtype=weight.dtype)

def update(self, index, weight, grad, state):
assert(isinstance(weight, NDArray))
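With this change, `create_state` allocates the momentum buffer with the same storage type as the weight, so a `row_sparse` weight gets a `row_sparse` momentum instead of a dense one. A minimal sketch of the resulting behavior (shapes are made up; `mx.sparse_nd.zeros` is the function touched in the next file):

```python
import mxnet as mx

# Hypothetical row_sparse weight; the momentum state mirrors its storage type.
weight = mx.sparse_nd.zeros('row_sparse', (1000, 128))
mom = mx.sparse_nd.zeros(weight.storage_type, weight.shape,
                         weight.context, dtype=weight.dtype)
```

Because a dense weight reports `storage_type == 'default'`, the same call now covers both cases thanks to the fallback added in `sparse_ndarray.zeros` below.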
python/mxnet/sparse_ndarray.py (4 additions, 2 deletions)
@@ -571,7 +571,7 @@ def to_dense(source):
"""
return ndarray.cast_storage(source, storage_type='default')

def zeros(storage_type, shape, ctx=None, dtype=None, aux_types=None):
def zeros(storage_type, shape, ctx=None, dtype=None, aux_types=None, **kwargs):
"""Return a new array of given shape and type, filled with zeros.

Parameters
@@ -599,6 +599,8 @@ def zeros(storage_type, shape, ctx=None, dtype=None, aux_types=None):
>>> mx.sparse_nd.zeros('row_sparse', (1,2), mx.gpu(0), 'float16').asnumpy()
array([[ 0., 0.]], dtype=float16)
"""
if storage_type == 'default':
return ndarray.zeros(shape, ctx, dtype, **kwargs)
if ctx is None:
ctx = Context.default_ctx
dtype = mx_real_t if dtype is None else dtype
@@ -609,7 +611,7 @@
raise Exception("unknown storage type")
assert(len(aux_types) == len(_STORAGE_AUX_TYPES[storage_type]))
out = _ndarray_cls(_new_alloc_handle(storage_type, shape, ctx, True, dtype, aux_types))
return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, out=out)
return _internal._zeros(shape=shape, ctx=ctx, dtype=dtype, out=out, **kwargs)

def _ndarray_cls(handle, writable=True):
stype = _storage_type(handle)
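The new `'default'` branch makes `zeros` callable uniformly for any storage type, which is exactly what `create_state` above relies on. A quick sketch of both paths (assuming the `mx.sparse_nd` module path used in the docstring example):

```python
import mxnet as mx

dense = mx.sparse_nd.zeros('default', (2, 2))    # now delegates to ndarray.zeros
rsp = mx.sparse_nd.zeros('row_sparse', (2, 2))   # sparse allocation path, as before
print(dense.asnumpy())  # [[0. 0.], [0. 0.]]
print(rsp.asnumpy())    # [[0. 0.], [0. 0.]]
```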
src/operator/operator_common.h (31 additions, 0 deletions)
@@ -110,6 +110,19 @@ inline std::string type_string(const int& x) {
return "unknown";
}

/*! \brief get string representation of storage_type */
inline std::string stype_string(const int& x) {
switch (x) {
case kDefaultStorage:
return "default";
case kCSRStorage:
return "csr";
case kRowSparseStorage:
return "row_sparse";
}
return "unknown";
}

/*!
* \brief Assign x to y. Checks for compatibility when y is not empty.
* Allow missing dim in both x and y (as 0).
@@ -186,6 +199,24 @@ inline bool type_assign(int *y, const int& x) {
} \
}

/*!
 * \brief macro to assign a storage type to out if out is unknown (-1),
 * otherwise check consistency.
 * Use a macro so the offending file shows up more clearly in the error.
 * \param type_array the storage type array to store the result
 * \param index the index of the entry in the array
 * \param type the inferred storage type
 */
#define STORAGE_TYPE_ASSIGN_CHECK(type_array, index, type) \
{ \
if (!type_assign(&(type_array)[index], type)) { \
std::ostringstream os; \
os << "Storage type inconsistent, Provided=" \
<< stype_string((type_array)[index]) << ',' \
<< " inferred storage type=" << stype_string(type); \
throw ::mxnet::op::InferTypeError(os.str(), index); \
} \
}

// helper macro to implement bind dispatch
#if MXNET_USE_CUDA
#define DO_BIND_DISPATCH(Method, ...) \
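For reference, here is a Python restatement (illustration only, not code from this PR) of the assign-or-check logic that `STORAGE_TYPE_ASSIGN_CHECK` builds on top of `type_assign`, where `-1` means "unknown":

```python
UNKNOWN = -1  # mirrors the -1 "unknown" convention of type_assign

def storage_type_assign(type_array, index, inferred):
    # Adopt the inferred type if nothing was known; otherwise require agreement.
    if type_array[index] == UNKNOWN:
        type_array[index] = inferred
        return True
    return type_array[index] == inferred or inferred == UNKNOWN

stypes = [UNKNOWN]
assert storage_type_assign(stypes, 0, 2)      # unknown: adopt the inferred type
assert storage_type_assign(stypes, 0, 2)      # consistent: ok
assert not storage_type_assign(stypes, 0, 1)  # inconsistent: the macro throws InferTypeError
```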
src/operator/optimizer_op-inl.h (118 additions, 41 deletions)
@@ -112,32 +112,31 @@ struct SGDDnsRspKernel {

template<typename xpu>
inline void SGDUpdateDnsRspImpl(const SGDParam& param,
const OpContext &ctx,
const std::vector<NDArray> &inputs,
const std::vector<OpReqType> &req,
const std::vector<NDArray> &outputs) {
const OpContext &ctx,
const TBlob& weight,
const NDArray& grad,
const OpReqType& req,
TBlob *out) {
using namespace mshadow;
using namespace mshadow::expr;
using namespace mshadow_op;
using namespace mxnet_op;
Stream<xpu>* s = ctx.get_stream<xpu>();
auto &weight = inputs[0];
auto &grad = inputs[1];
auto &out = outputs[0];
CHECK_EQ(weight.storage_type(), kDefaultStorage);
CHECK_EQ(grad.storage_type(), kRowSparseStorage);
if (!grad.storage_initialized()) return;
// if gradients are zeros, no weights are updated
if (!grad.storage_initialized() || req == kNullOp) return;
CHECK_GT(weight.shape_.Size(), 0);

MSHADOW_REAL_TYPE_SWITCH(weight.dtype(), DType, {
MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, {
MSHADOW_INT_TYPE_SWITCH(grad.aux_type(rowsparse::kIdx), IType, {
MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
auto weight_data = weight.data().FlatTo2D<xpu, DType>(s);
auto grad_idx = grad.aux_data(rowsparse::kIdx).FlatTo1D<xpu, IType>(s);
auto grad_val = grad.data().FlatTo2D<xpu, DType>(s);
auto out_data = out.data().FlatTo2D<xpu, DType>(s);
MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
auto weight_data = weight.dptr<DType>();
auto grad_idx = grad.aux_data(rowsparse::kIdx).dptr<IType>();
auto grad_val = grad.data().dptr<DType>();
auto num_rows = grad.aux_shape(rowsparse::kIdx)[0];
auto width = weight.shape().ProdShape(1, weight.shape().ndim());
mxnet_op::Kernel<SGDDnsRspKernel<req_type>, xpu>::Launch(s, num_rows, width,
out_data.dptr_, weight_data.dptr_, grad_idx.dptr_, grad_val.dptr_,
auto width = weight.shape_.ProdShape(1, weight.ndim());
Kernel<SGDDnsRspKernel<req_type>, xpu>::Launch(s, num_rows, width,
out->dptr<DType>(), weight_data, grad_idx, grad_val,
static_cast<DType>(param.clip_gradient),
static_cast<DType>(param.lr), static_cast<DType>(param.wd),
static_cast<DType>(param.rescale_grad));
@@ -146,6 +145,29 @@ inline void SGDUpdateDnsRspImpl(const SGDParam& param,
});
}

template<typename xpu>
inline void SGDUpdateRspRspImpl(const SGDParam& param,
const OpContext& ctx,
const NDArray& weight,
const NDArray& grad,
const OpReqType& req,
NDArray *out) {
if (weight.storage_shape()[0] == weight.shape()[0] &&
out->storage_shape()[0] == out->shape()[0]) {
// TODO(haibin) this is a temporary solution, since imperative_invoke only
// feeds in kWriteTo as req for all operators.
// For sgd we don't want to assign zeros to the output values when req == kWriteTo
auto out_req = req;
if (out_req == kWriteTo) out_req = kWriteInplace;
// reuse dns rsp implementation when storage_shape == shape
TBlob out_blob = out->data();
SGDUpdateDnsRspImpl<xpu>(param, ctx, weight.data(), grad, out_req, &out_blob);
} else {
LOG(FATAL) << "SGDUpdate for RowSparse weights is only implemented when "
<< "weights.values.shape == weights.shape";
}
}

template<typename xpu>
inline void SGDUpdateEx(const nnvm::NodeAttrs& attrs,
const OpContext &ctx,
@@ -159,7 +181,11 @@ inline void SGDUpdateEx(const nnvm::NodeAttrs& attrs,
auto weight_stype = inputs[0].storage_type();
auto grad_stype = inputs[1].storage_type();
if (weight_stype == kDefaultStorage && grad_stype == kRowSparseStorage) {
SGDUpdateDnsRspImpl<xpu>(param, ctx, inputs, req, outputs);
TBlob out = outputs[0].data();
SGDUpdateDnsRspImpl<xpu>(param, ctx, inputs[0].data(), inputs[1], req[0], &out);
} else if (weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage) {
NDArray out = outputs[0];
SGDUpdateRspRspImpl<xpu>(param, ctx, inputs[0], inputs[1], req[0], &out);
} else if (weight_stype == kDefaultStorage && grad_stype == kDefaultStorage) {
FCompExFallback<xpu>(attrs, ctx, inputs, req, outputs, SGDUpdate<xpu>, "SGDUpdate");
}
@@ -262,30 +288,31 @@ struct SGDMomDnsRspDnsKernel {

template<typename xpu>
inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param,
const OpContext &ctx,
const std::vector<NDArray> &inputs,
const std::vector<OpReqType> &req,
const std::vector<NDArray> &outputs) {
const OpContext& ctx,
const TBlob& weight,
const NDArray& grad,
const TBlob& mom,
const OpReqType& req,
TBlob *out) {
using namespace mxnet_op;
using namespace rowsparse;
Stream<xpu>* s = ctx.get_stream<xpu>();
auto &weight = inputs[0];
auto &grad = inputs[1];
auto &mom = inputs[2];
auto &out = outputs[0];
if (!grad.storage_initialized()) return;
if (!grad.storage_initialized() || req == kNullOp) return;
CHECK_GT(weight.shape_.Size(), 0);
CHECK_GT(mom.shape_.Size(), 0);

MSHADOW_REAL_TYPE_SWITCH(weight.dtype(), DType, {
MSHADOW_INT_TYPE_SWITCH(grad.aux_type(rowsparse::kIdx), IType, {
MXNET_ASSIGN_REQ_SWITCH(req[0], req_type, {
auto weight_data = weight.data().FlatTo2D<xpu, DType>(s);
auto grad_idx = grad.aux_data(rowsparse::kIdx).FlatTo1D<xpu, IType>(s);
auto grad_val = grad.data().FlatTo2D<xpu, DType>(s);
auto mom_data = mom.data().FlatTo2D<xpu, DType>(s);
auto out_data = out.data().FlatTo2D<xpu, DType>(s);
auto num_rows = grad.aux_shape(rowsparse::kIdx)[0];
auto width = weight.shape().ProdShape(1, weight.shape().ndim());
MSHADOW_REAL_TYPE_SWITCH(weight.type_flag_, DType, {
MSHADOW_INT_TYPE_SWITCH(grad.aux_type(kIdx), IType, {
MXNET_ASSIGN_REQ_SWITCH(req, req_type, {
auto weight_data = weight.dptr<DType>();
auto grad_idx = grad.aux_data(kIdx).dptr<IType>();
auto grad_val = grad.data().dptr<DType>();
auto mom_data = mom.dptr<DType>();
auto out_data = out->dptr<DType>();
auto num_rows = grad.aux_shape(kIdx)[0];
auto width = weight.shape_.ProdShape(1, weight.ndim());
Kernel<SGDMomDnsRspDnsKernel<req_type>, xpu>::Launch(s, num_rows, width,
out_data.dptr_, mom_data.dptr_, weight_data.dptr_, grad_idx.dptr_, grad_val.dptr_,
out_data, mom_data, weight_data, grad_idx, grad_val,
static_cast<DType>(param.clip_gradient), static_cast<DType>(param.momentum),
static_cast<DType>(param.lr), static_cast<DType>(param.wd),
static_cast<DType>(param.rescale_grad));
@@ -294,6 +321,50 @@ inline void SGDMomUpdateDnsRspDnsImpl(const SGDMomParam& param,
});
}

template<typename xpu>
inline void SGDMomUpdateRspRspRspImpl(const SGDMomParam& param,
const OpContext& ctx,
const NDArray& weight,
const NDArray& grad,
const NDArray& mom,
const OpReqType& req,
NDArray *out) {
using namespace mshadow;
using namespace mshadow::expr;
using namespace mxnet_op;
using namespace rowsparse;
if (weight.storage_shape()[0] == weight.shape()[0] &&
out->storage_shape()[0] == out->shape()[0]) {
Stream<xpu>* s = ctx.get_stream<xpu>();
// fill mom with zero values in order to reuse the sgd mom dns impl
if (!mom.storage_initialized()) {
MSHADOW_REAL_TYPE_SWITCH(mom.dtype(), DType, {
MSHADOW_INT_TYPE_SWITCH(mom.aux_type(kIdx), IType, {
auto num_rows = mom.shape()[0];
mom.CheckAndAlloc({Shape1(num_rows)});
auto mom_idx = mom.aux_data(kIdx).FlatTo1D<xpu, IType>(s);
auto mom_val = mom.data();
// TODO(haibin) this is single-thread execution
Kernel<set_zero, xpu>::Launch(s, mom_val.Size(), mom_val.dptr<DType>());
ASSIGN_DISPATCH(mom_idx, kWriteTo, range<IType>(0, num_rows, 1, 1))
});
});
}
// TODO(haibin) this is a temporary solution, since imperative_invoke only
// feeds in kWriteTo as req for all operators.
// For sgd we don't want to assign zeros to the output values when req == kWriteTo
auto out_req = req;
if (out_req == kWriteTo) out_req = kWriteInplace;
TBlob out_blob = out->data();
// reuse dns rsp implementation when storage_shape == shape
SGDMomUpdateDnsRspDnsImpl<xpu>(param, ctx, weight.data(), grad,
mom.data(), out_req, &out_blob);
} else {
LOG(FATAL) << "SGDMomUpdate for RowSparse weights is only implemented when "
<< "weights.values.shape == weights.shape";
}
}

template<typename xpu>
inline void SGDMomUpdateEx(const nnvm::NodeAttrs& attrs,
const OpContext &ctx,
@@ -305,10 +376,16 @@ inline void SGDMomUpdateEx(const nnvm::NodeAttrs& attrs,
auto weight_stype = inputs[0].storage_type();
auto grad_stype = inputs[1].storage_type();
auto mom_stype = inputs[2].storage_type();

if (weight_stype == kDefaultStorage && grad_stype == kRowSparseStorage &&
mom_stype == kDefaultStorage) {
SGDMomUpdateDnsRspDnsImpl<xpu>(param, ctx, inputs, req, outputs);
TBlob out = outputs[0].data();
SGDMomUpdateDnsRspDnsImpl<xpu>(param, ctx, inputs[0].data(), inputs[1],
inputs[2].data(), req[0], &out);
} else if (weight_stype == kRowSparseStorage && grad_stype == kRowSparseStorage &&
mom_stype == kRowSparseStorage) {
NDArray out = outputs[0];
SGDMomUpdateRspRspRspImpl<xpu>(param, ctx, inputs[0], inputs[1],
inputs[2], req[0], &out);
} else if (weight_stype == kDefaultStorage && grad_stype == kDefaultStorage &&
mom_stype == kDefaultStorage) {
FCompExFallback<xpu>(attrs, ctx, inputs, req, outputs,
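At the Python level, the new `kRowSparseStorage` branches mean `sgd_update` and `sgd_mom_update` now accept `row_sparse` weights paired with `row_sparse` gradients (and a `row_sparse` momentum). A hedged usage sketch, with the gradient fabricated via `cast_storage` as used elsewhere in this diff:

```python
import mxnet as mx
from mxnet.ndarray import sgd_update, sgd_mom_update

weight = mx.sparse_nd.zeros('row_sparse', (100, 8))
mom = mx.sparse_nd.zeros('row_sparse', (100, 8))
# Build a row_sparse gradient from a dense array, for illustration only.
grad = mx.nd.cast_storage(mx.nd.ones((100, 8)), storage_type='row_sparse')

sgd_update(weight, grad, out=weight, lr=0.1)
sgd_mom_update(weight, grad, mom, out=weight, lr=0.1, momentum=0.9)
```

Note the `kWriteTo`-to-`kWriteInplace` rewrite above only applies when the row-sparse weight is fully materialized (`storage_shape == shape`); other layouts still hit the `LOG(FATAL)` path.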
src/operator/tensor/elemwise_unary_op.h (3 additions, 6 deletions)
@@ -324,10 +324,8 @@ inline void CastStorageDnsRspImpl(mshadow::Stream<cpu>* s, const TBlob& dns, NDA
struct CastStorageRspDnsKernel {
template<typename DType, typename IType>
MSHADOW_XINLINE static void Map(int i, const index_t width, const IType* idx, const DType *data,
DType* dns, const index_t invalid_rid) {
DType* dns) {
auto rid = idx[i];
// skip invalid rows
if (rid == invalid_rid) return;
auto dns_offset = rid * width;
auto rsp_offset = i * width;
for (size_t col = 0; col < width; col++) {
@@ -356,10 +354,9 @@ void CastStorageRspDnsImpl(mshadow::Stream<xpu>* s, const NDArray& rsp, TBlob* d
auto out_data = dns->FlatTo2D<xpu, DType>(s).dptr_;
auto num_rows = rsp.aux_shape(rowsparse::kIdx).Size();
auto rsp_shape = rsp.shape();
auto invalid_rid = rsp_shape[0];
auto width = rsp_shape.ProdShape(1, rsp_shape.ndim());
mxnet_op::Kernel<CastStorageRspDnsKernel, xpu>::Launch(s, num_rows, width, in_idx, in_data,
out_data, invalid_rid);
mxnet_op::Kernel<CastStorageRspDnsKernel, xpu>::Launch(s, num_rows, width, in_idx,
in_data, out_data);
}
});
});
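Dropping the `invalid_rid` check assumes every stored row index in the rsp tensor is valid, which simplifies the scatter loop. The cast this kernel implements is exercised from Python like so (the `storage_type` keyword matches `to_dense()` earlier in this diff):

```python
import mxnet as mx

dense = mx.nd.array([[0, 0], [1, 2], [0, 0]])
rsp = mx.nd.cast_storage(dense, storage_type='row_sparse')  # keeps only row 1
back = mx.nd.cast_storage(rsp, storage_type='default')      # runs the kernel above
assert (dense.asnumpy() == back.asnumpy()).all()
```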
src/operator/tensor/indexing_op.cc (17 additions, 9 deletions)
@@ -87,39 +87,47 @@ NNVM_REGISTER_OP(_backward_Embedding)
.set_attr<FCompute>("FCompute<cpu>", EmbeddingOpBackward<cpu>);

NNVM_REGISTER_OP(SparseEmbedding)
.describe(R"code(Maps integer indices to vector representations (embeddings) with sparse weight update
)code" ADD_FILELINE)
.describe(R"doc(Represents words or other sparse inputs by dense continuous vectors.
It assumes that the input is in one-hot form. E.g., for a vocabulary size of 10,000,
each input vector is expected to have dimension 10,000.
The index of the non-zero entry is the index of the word or item it represents.

The corresponding embedding vectors are stored as rows of a matrix.
Hence, mapping an input word to its embedding is implemented as a matrix product.

The gradient of an embedding matrix has the form of gradient vectors that are only
non-zero for words seen in a minibatch.
)doc" ADD_FILELINE)
.set_num_inputs(2)
.set_num_outputs(1)
.set_attr_parser(ParamParser<EmbeddingParam>)
.set_attr<nnvm::FListInputNames>("FListInputNames",
[](const NodeAttrs& attrs) {
return std::vector<std::string>{"data", "weight"};
})
.set_attr<nnvm::FInferShape>("FInferShape", EmbeddingOpShape)
.set_attr<nnvm::FInferShape>("FInferShape", SparseEmbeddingShape)
.set_attr<nnvm::FInferType>("FInferType", EmbeddingOpType)
.set_attr<nnvm::FInferStorageType>("FInferStorageType", SparseEmbeddingForwardStorageType)
.set_attr<FResourceRequest>("FResourceRequest",
[](const NodeAttrs& attrs) {
return std::vector<ResourceRequest>{ResourceRequest::kTempSpace};
})
.set_attr<FCompute>("FCompute<cpu>", EmbeddingOpForward<cpu>)
.set_attr<FComputeEx>(FCOMP_EX_CPU, SparseEmbeddingForwardEx<cpu>)
.set_attr<nnvm::FGradient>("FGradient",
[](const nnvm::NodePtr& n, const std::vector<nnvm::NodeEntry>& ograds) {
return MakeNonlossGradNode("_backward_SparseEmbedding", n, ograds,
{n->inputs[0]}, n->attrs.dict);
})
.add_argument("data", "NDArray-or-Symbol", "The input array to the embedding operator.")
.add_argument("data", "NDArray-or-Symbol",
"The input array to the sparse embedding operator.")
.add_argument("weight", "NDArray-or-Symbol", "The embedding weight matrix.")
.add_arguments(EmbeddingParam::__FIELDS__());

NNVM_REGISTER_OP(_backward_SparseEmbedding)
.set_num_inputs(2)
.set_num_outputs(2)
.set_attr<nnvm::TIsBackward>("TIsBackward", true)
.set_attr<nnvm::FInferStorageType>("FInferStorageType", SparseEmbeddingBackwardStorageType)
.set_attr<FComputeEx>("FComputeEx<cpu>", SparseEmbeddingOpBackwardEx<cpu>);
// TODO(haibin) handle dense case
// .set_attr<FCompute>("FCompute<cpu>", EmbeddingOpBackward<cpu>);
.set_attr<FComputeEx>("FComputeEx<cpu>", SparseEmbeddingBackwardEx<cpu>);

NNVM_REGISTER_OP(take)
.describe(R"code(Takes elements from an input array along the given axis.
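A hedged sketch of how the registered operator is used from the symbolic API (`SparseEmbedding` is the symbol name generated from the registration above; `input_dim`/`output_dim` are the standard `EmbeddingParam` fields):

```python
import mxnet as mx

data = mx.sym.Variable('data')      # integer indices, one per word
weight = mx.sym.Variable('weight')  # embedding matrix, intended to be row_sparse
embed = mx.sym.SparseEmbedding(data=data, weight=weight,
                               input_dim=10000, output_dim=128)
# The backward op registered above produces a row_sparse gradient for
# `weight`, which pairs with the sparse sgd updates in optimizer_op-inl.h.
```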