Weight Sharing #546

Merged on Jun 26, 2014 (3 commits)
10 changes: 9 additions & 1 deletion include/caffe/net.hpp
@@ -6,6 +6,7 @@
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "caffe/blob.hpp"
@@ -14,9 +15,10 @@
#include "caffe/proto/caffe.pb.h"

using std::map;
using std::vector;
using std::pair;
using std::set;
using std::string;
using std::vector;

namespace caffe {

@@ -103,6 +105,7 @@ class Net {
const shared_ptr<Blob<Dtype> > blob_by_name(const string& blob_name);
bool has_layer(const string& layer_name);
const shared_ptr<Layer<Dtype> > layer_by_name(const string& layer_name);
const map<string, int>& param_names_index() { return param_names_index_; }

protected:
// Helpers for Init.
@@ -114,6 +117,8 @@
int AppendBottom(const NetParameter& param, const int layer_id,
const int bottom_id, set<string>* available_blobs,
map<string, int>* blob_name_to_idx);
void AppendParam(const NetParameter& param, const int layer_id,
const int param_id);
// Function to get misc parameters, e.g. the learning rate multiplier and
// weight decay.
void GetLearningRateAndWeightDecay();
@@ -138,6 +143,9 @@
// top_vecs stores the vectors containing the output for each layer
vector<vector<Blob<Dtype>*> > top_vecs_;
vector<vector<int> > top_id_vecs_;
vector<int> param_owners_;
vector<pair<int, int> > layer_param_indices_;
map<string, int> param_names_index_;
// blob indices for the input and the output of the net
vector<int> net_input_blob_indices_;
vector<int> net_output_blob_indices_;
111 changes: 104 additions & 7 deletions src/caffe/net.cpp
@@ -11,10 +11,12 @@
#include "caffe/net.hpp"
#include "caffe/util/io.hpp"
#include "caffe/util/insert_splits.hpp"
#include "caffe/util/math_functions.hpp"
#include "caffe/util/upgrade_proto.hpp"

using std::pair;
using std::make_pair;
using std::map;
using std::pair;
using std::set;

namespace caffe {
@@ -86,8 +88,9 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
}
DLOG(INFO) << "Memory required for data: " << memory_used_ * sizeof(Dtype);
const int blobs_lr_size = layers_[layer_id]->layer_param().blobs_lr_size();
CHECK(blobs_lr_size == layers_[layer_id]->blobs().size() ||
blobs_lr_size == 0) << "Incorrect blobs lr size: should be either 0 "
const int num_param_blobs = layers_[layer_id]->blobs().size();
CHECK(blobs_lr_size == num_param_blobs || blobs_lr_size == 0)
<< "Incorrect blobs lr size: should be either 0 "
<< "or the same as the number of the layer's parameter blobs.";
if (blobs_lr_size) {
// Check if this layer needs backward operation itself
@@ -100,6 +103,17 @@ void Net<Dtype>::Init(const NetParameter& in_param) {
// learning rate to be 1. Thus we will need to perform backward.
need_backward = true;
}
const int param_size = layer_param.param_size();
CHECK(param_size == num_param_blobs || param_size == 0)
<< "Incorrect param size: should be either 0 or the same as "
"the number of the layer's parameter blobs: " << num_param_blobs;
const int blob_share_mode_size = layer_param.blob_share_mode_size();
CHECK(blob_share_mode_size == num_param_blobs || blob_share_mode_size == 0)
<< "Incorrect blob_share_mode size: should be either 0 or the same as "
"the number of the layer's parameter blobs: " << num_param_blobs;
for (int param_id = 0; param_id < num_param_blobs; ++param_id) {
AppendParam(param, layer_id, param_id);
}
// Finally, set the backward flag
layer_need_backward_.push_back(need_backward);
if (need_backward) {
Expand Down Expand Up @@ -217,14 +231,69 @@ int Net<Dtype>::AppendBottom(const NetParameter& param,
return blob_id;
}

template <typename Dtype>
void Net<Dtype>::AppendParam(const NetParameter& param, const int layer_id,
const int param_id) {
const LayerParameter& layer_param = layers_[layer_id]->layer_param();
const int param_size = layer_param.param_size();
string param_name;
if (param_size) {
param_name = layer_param.param(param_id);
}
const int net_param_id = params_.size();
params_.push_back(layers_[layer_id]->blobs()[param_id]);
layer_param_indices_.push_back(make_pair(layer_id, param_id));
if (!param_size || !param_name.size() || (param_name.size() &&
param_names_index_.find(param_name) == param_names_index_.end())) {
// This layer "owns" this parameter blob -- it is either anonymous
// (i.e., not given a param_name) or explicitly given a name that we
// haven't already seen.
param_owners_.push_back(-1);
if (param_size) {
param_names_index_[param_name] = net_param_id;
}
} else {
// Named param blob with name we've seen before: share params
const int owner_net_param_id = param_names_index_[param_name];
param_owners_.push_back(owner_net_param_id);
const pair<int, int>& owner_index =
layer_param_indices_[owner_net_param_id];
const int owner_layer_id = owner_index.first;
const int owner_param_id = owner_index.second;
LOG(INFO) << "Sharing parameters '" << param_name << "' owned by "
<< "layer '" << layer_names_[owner_layer_id] << "', param "
<< "index " << owner_param_id;
Blob<Dtype>* this_blob = layers_[layer_id]->blobs()[param_id].get();
Blob<Dtype>* owner_blob =
layers_[owner_layer_id]->blobs()[owner_param_id].get();
const int blob_share_mode_size = layer_param.blob_share_mode_size();
if (blob_share_mode_size > param_id &&
(layer_param.blob_share_mode(param_id) ==
LayerParameter_DimCheckMode_PERMISSIVE)) {
// Permissive dimension checking -- only check counts are the same.
CHECK_EQ(this_blob->count(), owner_blob->count())
<< "Shared parameter blobs must have the same count.";
} else {
// Strict dimension checking -- all dims must be the same.
CHECK_EQ(this_blob->num(), owner_blob->num())
<< "Shared parameter blobs must have the same num.";
CHECK_EQ(this_blob->channels(), owner_blob->channels())
<< "Shared parameter blobs must have the same channels.";
CHECK_EQ(this_blob->height(), owner_blob->height())
<< "Shared parameter blobs must have the same height.";
CHECK_EQ(this_blob->width(), owner_blob->width())
<< "Shared parameter blobs must have the same width.";
}
layers_[layer_id]->blobs()[param_id]->ShareData(
*layers_[owner_layer_id]->blobs()[owner_param_id]);
}
}

template <typename Dtype>
void Net<Dtype>::GetLearningRateAndWeightDecay() {
LOG(INFO) << "Collecting Learning Rate and Weight Decay.";
for (int i = 0; i < layers_.size(); ++i) {
vector<shared_ptr<Blob<Dtype> > >& layer_blobs = layers_[i]->blobs();
for (int j = 0; j < layer_blobs.size(); ++j) {
params_.push_back(layer_blobs[j]);
}
// push the learning rate multipliers
if (layers_[i]->layer_param().blobs_lr_size()) {
CHECK_EQ(layers_[i]->layer_param().blobs_lr_size(), layer_blobs.size());
@@ -403,8 +472,36 @@ void Net<Dtype>::ToProto(NetParameter* param, bool write_diff) {

template <typename Dtype>
void Net<Dtype>::Update() {
// First, accumulate the diffs of any shared parameters into their owner's
// diff. (Assumes that the learning rate, weight decay, etc. have already been
// accounted for in the current diff.)
for (int i = 0; i < params_.size(); ++i) {
params_[i]->Update();
if (param_owners_[i] < 0) {
continue;
}
const int count = params_[i]->count();
const Dtype* this_diff;
Dtype* owner_diff;
switch (Caffe::mode()) {
case Caffe::CPU:
this_diff = params_[i]->cpu_diff();
owner_diff = params_[param_owners_[i]]->mutable_cpu_diff();
caffe_add(count, this_diff, owner_diff, owner_diff);
break;
case Caffe::GPU:
this_diff = params_[i]->gpu_diff();
owner_diff = params_[param_owners_[i]]->mutable_gpu_diff();
caffe_gpu_add(count, this_diff, owner_diff, owner_diff);
break;
default:
LOG(FATAL) << "Unknown caffe mode: " << Caffe::mode();
}
}
// Now, update the owned parameters.

Contributor:
@shelhamer, if I understood correctly, you add up all the diffs into the owner_diff and then update the parameters accordingly, right? Does that imply that if the layer owning this blob does not contribute to the loss, the diffs from the other layers that use this blob but don't own it will not be used?

Contributor:
We must either change the ownership from the first layer that mentions the param to the first layer that both mentions the param and participates in the loss, or fix layer_need_backward_[layer_id] accordingly. Is my concern valid?

Comment:
@ashafaei Was this fixed in the current version?

Member Author:
@ashafaei @abhi2610 This was never actually an issue, although it was worth raising, since you need to know how the Net, Solver, and Blobs cooperate. The loss / backward logic only decides whether backward is computed for a layer or not. Net::Update() is always called by the solver: all of the shared weight params accumulate their diffs into the owner's in the loop at 478, and then Blob::Update() is called for every weight owner in the loop at 501. This does bring up the question of why Blob::Update() is unconditional when it could be skipped, but that's another matter. Thanks @jeffdonahue for the discussion.

for (int i = 0; i < params_.size(); ++i) {
if (param_owners_[i] < 0) {
params_[i]->Update();
}
}
}

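To make the behavior discussed in the thread above concrete: Net::Update() first folds every shared parameter's diff into its owner's diff, then calls Update() only on the owner blobs, so gradients computed through non-owner layers are applied even when the owning layer itself did no backward pass. Below is a minimal, self-contained C++ toy model of that two-pass flow; SimpleBlob, main(), and the hard-coded values are invented for illustration and are not Caffe code.

#include <cstdio>
#include <vector>

// Toy stand-in for a parameter blob: one scalar weight plus its gradient.
struct SimpleBlob {
  float data;
  float diff;
  void Update() { data -= diff; }  // toy SGD step: data = data - diff
};

int main() {
  // Parameter 0 owns the weight; parameter 1 shares it (owner index 0),
  // mirroring param_owners_, where -1 means "owns its own blob".
  std::vector<SimpleBlob> params = { {1.0f, 0.1f}, {1.0f, 0.3f} };
  std::vector<int> param_owners = { -1, 0 };

  // Pass 1: accumulate each shared parameter's diff into its owner's diff.
  for (size_t i = 0; i < params.size(); ++i) {
    if (param_owners[i] >= 0) {
      params[param_owners[i]].diff += params[i].diff;
    }
  }
  // Pass 2: update only the owners. In the real Net, the non-owner blobs see
  // the new values automatically because their data is shared with the owner.
  for (size_t i = 0; i < params.size(); ++i) {
    if (param_owners[i] < 0) {
      params[i].Update();
    }
  }
  std::printf("owner data after update: %.2f\n", params[0].data);  // 1.0 - (0.1 + 0.3) = 0.60
  return 0;
}

Even if the owning layer had contributed no gradient of its own (diff 0.0 for parameter 0), the 0.3 accumulated from the sharing layer would still be applied in the second pass, which is the point addressed in the thread.
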
16 changes: 14 additions & 2 deletions src/caffe/proto/caffe.proto
@@ -170,6 +170,18 @@ message LayerParameter {

// The blobs containing the numeric parameters of the layer
repeated BlobProto blobs = 6;
// The names of the parameter blobs -- useful for sharing parameters among
// layers (but never required).
repeated string param = 1001;
// Whether to require shared weights to have the same shape, or just the same
// count -- defaults to STRICT if unspecified.
repeated DimCheckMode blob_share_mode = 1002;
enum DimCheckMode {
// STRICT (default) requires that num, channels, height, width each match.
STRICT = 0;
// PERMISSIVE requires only the count (num*channels*height*width) to match.
PERMISSIVE = 1;
}
// The ratio that is multiplied on the global learning rate. If you want to
// set the learning ratio for one blob, you need to set it for all blobs.
repeated float blobs_lr = 7;
@@ -307,9 +319,9 @@ message HDF5OutputParameter {
}

message HingeLossParameter {
enum Norm {
enum Norm {
L1 = 1;
L2 = 2;
L2 = 2;
}
// Specify the Norm to use L1 or L2
optional Norm norm = 1 [default = L1];
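As a usage sketch for the new LayerParameter fields above: listing the same name in two layers' repeated param field makes the later layer share that blob instead of owning it, and blob_share_mode: PERMISSIVE relaxes the strict shape check to a count-only check. The snippet below sets just the sharing-related fields through the protobuf-generated C++ API; the function name, the layer pointers, and the parameter names "shared_weights" / "shared_bias" are made up for illustration, and everything else about the layers (type, bottoms, tops, fillers) is omitted.

#include "caffe/proto/caffe.pb.h"

// Sketch: declare weight sharing between two layers via the new proto fields.
void ConfigureSharedLayers(caffe::LayerParameter* ip1,
                           caffe::LayerParameter* ip2) {
  // The first layer to name these blobs becomes their owner in Net::AppendParam.
  ip1->add_param("shared_weights");
  ip1->add_param("shared_bias");

  // The second layer reuses the same names, so its blobs are shared with ip1's.
  ip2->add_param("shared_weights");
  ip2->add_param("shared_bias");

  // Relax the weight-blob check to count-only; keep the bias check strict.
  // When blob_share_mode is given at all, Net::Init expects one entry per
  // parameter blob (or none).
  ip2->add_blob_share_mode(caffe::LayerParameter_DimCheckMode_PERMISSIVE);
  ip2->add_blob_share_mode(caffe::LayerParameter_DimCheckMode_STRICT);
}

In a .prototxt net definition, the same sharing is expressed by repeating the param: "shared_weights" (and so on) entries inside both layer blocks.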