From eee4372e34200b26ade3880067767985ca5d949c Mon Sep 17 00:00:00 2001 From: Youssef Kashef Date: Fri, 29 Jan 2016 19:21:48 +0100 Subject: [PATCH] transpose parameter added to IP layer to support tied weights in an autoencoder. Arguments to matrix mul. are conditioned on this parameter, no actual transposing takes place. --- include/caffe/layers/inner_product_layer.hpp | 1 + src/caffe/layers/inner_product_layer.cpp | 42 +++- src/caffe/layers/inner_product_layer.cu | 31 ++- src/caffe/proto/caffe.proto | 5 + src/caffe/test/test_inner_product_layer.cpp | 213 +++++++++++++++++++ 5 files changed, 277 insertions(+), 15 deletions(-) diff --git a/include/caffe/layers/inner_product_layer.hpp b/include/caffe/layers/inner_product_layer.hpp index 250576a4817..18d0d6192eb 100644 --- a/include/caffe/layers/inner_product_layer.hpp +++ b/include/caffe/layers/inner_product_layer.hpp @@ -44,6 +44,7 @@ class InnerProductLayer : public Layer { int N_; bool bias_term_; Blob bias_multiplier_; + bool transpose_; ///< if true, assume transposed weights }; } // namespace caffe diff --git a/src/caffe/layers/inner_product_layer.cpp b/src/caffe/layers/inner_product_layer.cpp index d9088805501..e65349f0055 100644 --- a/src/caffe/layers/inner_product_layer.cpp +++ b/src/caffe/layers/inner_product_layer.cpp @@ -11,6 +11,7 @@ void InnerProductLayer::LayerSetUp(const vector*>& bottom, const vector*>& top) { const int num_output = this->layer_param_.inner_product_param().num_output(); bias_term_ = this->layer_param_.inner_product_param().bias_term(); + transpose_ = this->layer_param_.inner_product_param().transpose(); N_ = num_output; const int axis = bottom[0]->CanonicalAxisIndex( this->layer_param_.inner_product_param().axis()); @@ -27,10 +28,15 @@ void InnerProductLayer::LayerSetUp(const vector*>& bottom, } else { this->blobs_.resize(1); } - // Intialize the weight + // Initialize the weights vector weight_shape(2); - weight_shape[0] = N_; - weight_shape[1] = K_; + if (transpose_) { + 
weight_shape[0] = K_; + weight_shape[1] = N_; + } else { + weight_shape[0] = N_; + weight_shape[1] = K_; + } this->blobs_[0].reset(new Blob(weight_shape)); // fill the weights shared_ptr > weight_filler(GetFiller( @@ -80,7 +86,8 @@ void InnerProductLayer::Forward_cpu(const vector*>& bottom, const Dtype* bottom_data = bottom[0]->cpu_data(); Dtype* top_data = top[0]->mutable_cpu_data(); const Dtype* weight = this->blobs_[0]->cpu_data(); - caffe_cpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., + caffe_cpu_gemm(CblasNoTrans, transpose_ ? CblasNoTrans : CblasTrans, + M_, N_, K_, (Dtype)1., bottom_data, weight, (Dtype)0., top_data); if (bias_term_) { caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., @@ -97,8 +104,17 @@ void InnerProductLayer::Backward_cpu(const vector*>& top, const Dtype* top_diff = top[0]->cpu_diff(); const Dtype* bottom_data = bottom[0]->cpu_data(); // Gradient with respect to weight - caffe_cpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_cpu_diff()); + if (transpose_) { + caffe_cpu_gemm(CblasTrans, CblasNoTrans, + K_, N_, M_, + (Dtype)1., bottom_data, top_diff, + (Dtype)1., this->blobs_[0]->mutable_cpu_diff()); + } else { + caffe_cpu_gemm(CblasTrans, CblasNoTrans, + N_, K_, M_, + (Dtype)1., top_diff, bottom_data, + (Dtype)1., this->blobs_[0]->mutable_cpu_diff()); + } } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->cpu_diff(); @@ -110,9 +126,17 @@ void InnerProductLayer::Backward_cpu(const vector*>& top, if (propagate_down[0]) { const Dtype* top_diff = top[0]->cpu_diff(); // Gradient with respect to bottom data - caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->cpu_data(), (Dtype)0., - bottom[0]->mutable_cpu_diff()); + if (transpose_) { + caffe_cpu_gemm(CblasNoTrans, CblasTrans, + M_, K_, N_, + (Dtype)1., top_diff, this->blobs_[0]->cpu_data(), + (Dtype)0., 
bottom[0]->mutable_cpu_diff()); + } else { + caffe_cpu_gemm(CblasNoTrans, CblasNoTrans, + M_, K_, N_, + (Dtype)1., top_diff, this->blobs_[0]->cpu_data(), + (Dtype)0., bottom[0]->mutable_cpu_diff()); + } } } diff --git a/src/caffe/layers/inner_product_layer.cu b/src/caffe/layers/inner_product_layer.cu index dc25aa33bd1..a58b56e3281 100644 --- a/src/caffe/layers/inner_product_layer.cu +++ b/src/caffe/layers/inner_product_layer.cu @@ -19,7 +19,9 @@ void InnerProductLayer::Forward_gpu(const vector*>& bottom, caffe_gpu_axpy(N_, bias_multiplier_.cpu_data()[0], this->blobs_[1]->gpu_data(), top_data); } else { - caffe_gpu_gemm(CblasNoTrans, CblasTrans, M_, N_, K_, (Dtype)1., + caffe_gpu_gemm(CblasNoTrans, + transpose_ ? CblasNoTrans : CblasTrans, + M_, N_, K_, (Dtype)1., bottom_data, weight, (Dtype)0., top_data); if (bias_term_) caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, N_, 1, (Dtype)1., @@ -36,8 +38,17 @@ void InnerProductLayer::Backward_gpu(const vector*>& top, const Dtype* top_diff = top[0]->gpu_diff(); const Dtype* bottom_data = bottom[0]->gpu_data(); // Gradient with respect to weight - caffe_gpu_gemm(CblasTrans, CblasNoTrans, N_, K_, M_, (Dtype)1., - top_diff, bottom_data, (Dtype)1., this->blobs_[0]->mutable_gpu_diff()); + if (transpose_) { + caffe_gpu_gemm(CblasTrans, CblasNoTrans, + K_, N_, M_, + (Dtype)1., bottom_data, top_diff, + (Dtype)1., this->blobs_[0]->mutable_gpu_diff()); + } else { + caffe_gpu_gemm(CblasTrans, CblasNoTrans, + N_, K_, M_, + (Dtype)1., top_diff, bottom_data, + (Dtype)1., this->blobs_[0]->mutable_gpu_diff()); + } } if (bias_term_ && this->param_propagate_down_[1]) { const Dtype* top_diff = top[0]->gpu_diff(); @@ -49,9 +60,17 @@ void InnerProductLayer::Backward_gpu(const vector*>& top, if (propagate_down[0]) { const Dtype* top_diff = top[0]->gpu_diff(); // Gradient with respect to bottom data - caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, M_, K_, N_, (Dtype)1., - top_diff, this->blobs_[0]->gpu_data(), (Dtype)0., - 
bottom[0]->mutable_gpu_diff()); + if (transpose_) { + caffe_gpu_gemm(CblasNoTrans, CblasTrans, + M_, K_, N_, + (Dtype)1., top_diff, this->blobs_[0]->gpu_data(), + (Dtype)0., bottom[0]->mutable_gpu_diff()); + } else { + caffe_gpu_gemm(CblasNoTrans, CblasNoTrans, + M_, K_, N_, + (Dtype)1., top_diff, this->blobs_[0]->gpu_data(), + (Dtype)0., bottom[0]->mutable_gpu_diff()); + } } } diff --git a/src/caffe/proto/caffe.proto b/src/caffe/proto/caffe.proto index 6493a72d778..7edb6ae87e0 100644 --- a/src/caffe/proto/caffe.proto +++ b/src/caffe/proto/caffe.proto @@ -786,6 +786,11 @@ message InnerProductParameter { // all preceding axes are retained in the output. // May be negative to index from the end (e.g., -1 for the last axis). optional int32 axis = 5 [default = 1]; + // Specify whether to transpose the weight matrix or not. + // If transpose == true, any operations will be performed on the transpose + // of the weight matrix. The weight matrix itself is not going to be transposed + // but rather the transpose flag of operations will be toggled accordingly. 
+ optional bool transpose = 6 [default = false]; } // Message that stores parameters used by LogLayer diff --git a/src/caffe/test/test_inner_product_layer.cpp b/src/caffe/test/test_inner_product_layer.cpp index b888b510318..7eeaa922078 100644 --- a/src/caffe/test/test_inner_product_layer.cpp +++ b/src/caffe/test/test_inner_product_layer.cpp @@ -60,6 +60,50 @@ TYPED_TEST(InnerProductLayerTest, TestSetUp) { EXPECT_EQ(this->blob_top_->channels(), 10); } +/** @brief TestSetUp with the transpose flag set to false + */ +TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeFalse) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_); + LayerParameter layer_param; + InnerProductParameter* inner_product_param = + layer_param.mutable_inner_product_param(); + inner_product_param->set_num_output(10); + inner_product_param->set_transpose(false); + shared_ptr > layer( + new InnerProductLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(2, this->blob_top_->num()); + EXPECT_EQ(1, this->blob_top_->height()); + EXPECT_EQ(1, this->blob_top_->width()); + EXPECT_EQ(10, this->blob_top_->channels()); + EXPECT_EQ(2, layer->blobs()[0]->num_axes()); + EXPECT_EQ(10, layer->blobs()[0]->shape(0)); + EXPECT_EQ(60, layer->blobs()[0]->shape(1)); +} + +/** @brief TestSetUp with the transpose flag set to true + */ +TYPED_TEST(InnerProductLayerTest, TestSetUpTranposeTrue) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_); + LayerParameter layer_param; + InnerProductParameter* inner_product_param = + layer_param.mutable_inner_product_param(); + inner_product_param->set_num_output(10); + inner_product_param->set_transpose(true); + shared_ptr > layer( + new InnerProductLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(2, this->blob_top_->num()); + EXPECT_EQ(1, this->blob_top_->height()); + EXPECT_EQ(1, this->blob_top_->width()); + 
EXPECT_EQ(10, this->blob_top_->channels()); + EXPECT_EQ(2, layer->blobs()[0]->num_axes()); + EXPECT_EQ(60, layer->blobs()[0]->shape(0)); + EXPECT_EQ(10, layer->blobs()[0]->shape(1)); +} + TYPED_TEST(InnerProductLayerTest, TestForward) { typedef typename TypeParam::Dtype Dtype; this->blob_bottom_vec_.push_back(this->blob_bottom_); @@ -91,6 +135,79 @@ TYPED_TEST(InnerProductLayerTest, TestForward) { } } +/** + * @brief Init. an IP layer without transpose + random weights, + * run Forward, save the result. + * Init. another IP layer with transpose. + * manually copy and transpose the weights from the first IP layer, + * then run Forward on the same input and check that the result is the same + */ +TYPED_TEST(InnerProductLayerTest, TestForwardTranspose) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_); + bool IS_VALID_CUDA = false; +#ifndef CPU_ONLY + IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; +#endif + if (Caffe::mode() == Caffe::CPU || + sizeof(Dtype) == 4 || IS_VALID_CUDA) { + LayerParameter layer_param; + InnerProductParameter* inner_product_param = + layer_param.mutable_inner_product_param(); + inner_product_param->set_num_output(10); + inner_product_param->mutable_weight_filler()->set_type("uniform"); + inner_product_param->mutable_bias_filler()->set_type("uniform"); + inner_product_param->mutable_bias_filler()->set_min(1); + inner_product_param->mutable_bias_filler()->set_max(2); + inner_product_param->set_transpose(false); + shared_ptr > layer( + new InnerProductLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + const int count = this->blob_top_->count(); + Blob* const top = new Blob(); + top->ReshapeLike(*this->blob_top_); + caffe_copy(count, this->blob_top_->cpu_data(), top->mutable_cpu_data()); + this->blob_top_vec_.clear(); + this->blob_top_vec_.push_back(new Blob()); + 
inner_product_param->set_transpose(true); + shared_ptr > ip_t( + new InnerProductLayer(layer_param)); + ip_t->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + const int count_w = layer->blobs()[0]->count(); + EXPECT_EQ(count_w, ip_t->blobs()[0]->count()); + // manually copy and transpose the weights from 1st IP layer into 2nd + const Dtype* w = layer->blobs()[0]->cpu_data(); + Dtype* w_t = ip_t->blobs()[0]->mutable_cpu_data(); + const int width = layer->blobs()[0]->shape(1); + const int width_t = ip_t->blobs()[0]->shape(1); + for (int i = 0; i < count_w; ++i) { + int r = i / width; + int c = i % width; + w_t[c*width_t+r] = w[r*width+c]; // copy while transposing + } + // copy bias from 1st IP layer to 2nd IP layer + ASSERT_EQ(layer->blobs()[1]->count(), ip_t->blobs()[1]->count()); + caffe_copy(layer->blobs()[1]->count(), layer->blobs()[1]->cpu_data(), + ip_t->blobs()[1]->mutable_cpu_data()); + ip_t->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + EXPECT_EQ(count, this->blob_top_->count()) + << "Invalid count for top blob for IP with transpose."; + Blob* const top_t = new Blob();\ + top_t->ReshapeLike(*this->blob_top_vec_[0]); + caffe_copy(count, + this->blob_top_vec_[0]->cpu_data(), + top_t->mutable_cpu_data()); + const Dtype* data = top->cpu_data(); + const Dtype* data_t = top_t->cpu_data(); + for (int i = 0; i < count; ++i) { + EXPECT_FLOAT_EQ(data[i], data_t[i]); + } + } else { + LOG(ERROR) << "Skipping test due to old architecture."; + } +} + TYPED_TEST(InnerProductLayerTest, TestForwardNoBatch) { typedef typename TypeParam::Dtype Dtype; this->blob_bottom_vec_.push_back(this->blob_bottom_nobatch_); @@ -148,4 +265,100 @@ TYPED_TEST(InnerProductLayerTest, TestGradient) { } } +TYPED_TEST(InnerProductLayerTest, TestBackwardTranspose) { + typedef typename TypeParam::Dtype Dtype; + this->blob_bottom_vec_.push_back(this->blob_bottom_); + bool IS_VALID_CUDA = false; +#ifndef CPU_ONLY + IS_VALID_CUDA = CAFFE_TEST_CUDA_PROP.major >= 2; +#endif + if 
(Caffe::mode() == Caffe::CPU || + sizeof(Dtype) == 4 || IS_VALID_CUDA) { + LayerParameter layer_param; + InnerProductParameter* inner_product_param = + layer_param.mutable_inner_product_param(); + inner_product_param->set_num_output(10); + inner_product_param->mutable_weight_filler()->set_type("uniform"); + inner_product_param->mutable_bias_filler()->set_type("uniform"); + inner_product_param->mutable_bias_filler()->set_min(1); + inner_product_param->mutable_bias_filler()->set_max(2); + inner_product_param->set_transpose(false); + shared_ptr > layer( + new InnerProductLayer(layer_param)); + layer->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + layer->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + // copy top blob + Blob* const top = new Blob(); + top->CopyFrom(*this->blob_top_, false, true); + // fake top diff + Blob* const diff = new Blob(); + diff->ReshapeLike(*this->blob_top_); + { + FillerParameter filler_param; + UniformFiller filler(filler_param); + filler.Fill(diff); + } + caffe_copy(this->blob_top_vec_[0]->count(), + diff->cpu_data(), + this->blob_top_vec_[0]->mutable_cpu_diff()); + vector propagate_down(1, true); + layer->Backward(this->blob_top_vec_, + propagate_down, + this->blob_bottom_vec_); + // copy first ip's weights and their diffs + Blob* const w = new Blob(); + w->CopyFrom(*layer->blobs()[0], false, true); + w->CopyFrom(*layer->blobs()[0], true, true); + // copy bottom diffs + Blob* const bottom_diff = new Blob(); + bottom_diff->CopyFrom(*this->blob_bottom_vec_[0], true, true); + // repeat original top with transposed ip + this->blob_top_vec_.clear(); + this->blob_top_vec_.push_back(new Blob()); + inner_product_param->set_transpose(true); + shared_ptr > ip_t( + new InnerProductLayer(layer_param)); + ip_t->SetUp(this->blob_bottom_vec_, this->blob_top_vec_); + // manually copy and transpose the weights from 1st IP layer into 2nd + { + const Dtype* w_src = w->cpu_data(); + Dtype* w_t = ip_t->blobs()[0]->mutable_cpu_data(); + const int 
width = layer->blobs()[0]->shape(1); + const int width_t = ip_t->blobs()[0]->shape(1); + for (int i = 0; i < layer->blobs()[0]->count(); ++i) { + int r = i / width; + int c = i % width; + w_t[c*width_t+r] = w_src[r*width+c]; // copy while transposing + } + // copy bias from 1st IP layer to 2nd IP layer + ASSERT_EQ(layer->blobs()[1]->count(), ip_t->blobs()[1]->count()); + caffe_copy(layer->blobs()[1]->count(), layer->blobs()[1]->cpu_data(), + ip_t->blobs()[1]->mutable_cpu_data()); + } + ip_t->Forward(this->blob_bottom_vec_, this->blob_top_vec_); + caffe_copy(this->blob_top_vec_[0]->count(), + diff->cpu_data(), + this->blob_top_vec_[0]->mutable_cpu_diff()); + ip_t->Backward(this->blob_top_vec_, propagate_down, this->blob_bottom_vec_); + const Dtype* data = w->cpu_diff(); + const Dtype* data_t = ip_t->blobs()[0]->cpu_diff(); + const int WIDTH = layer->blobs()[0]->shape(1); + const int WIDTH_T = ip_t->blobs()[0]->shape(1); + for (int i = 0; i < layer->blobs()[0]->count(); ++i) { + int r = i / WIDTH; + int c = i % WIDTH; + EXPECT_NE(Dtype(0.), data[r*WIDTH+c]); + EXPECT_FLOAT_EQ(data[r*WIDTH+c], data_t[c*WIDTH_T+r]); + } + data = bottom_diff->cpu_diff(); + data_t = this->blob_bottom_vec_[0]->cpu_diff(); + for (int i = 0; i < this->blob_bottom_vec_[0]->count(); ++i) { + EXPECT_NE(Dtype(0.), data[i]); + EXPECT_FLOAT_EQ(data[i], data_t[i]); + } + } else { + LOG(ERROR) << "Skipping test due to old architecture."; + } +} + } // namespace caffe