Commit
Changed send_back_flags implementation
Fix a bug where training would not finish
Refactored code
CoderSherlock committed Oct 21, 2018
1 parent 2a976a5 commit 5a482e2
Showing 6 changed files with 38 additions and 29 deletions.
4 changes: 2 additions & 2 deletions dlib/dnn/syncer/syncer.h
@@ -47,7 +47,7 @@ class dnn_syncer {
 
 
     int verbose = 0;
-    int num_debug = 1;
+    int num_debug = 0;
     int exper = 0;
 
 
@@ -247,7 +247,7 @@ class dnn_async_leader : public dnn_leader<trainer_type> {
     std::vector<std::thread *> recievers;
 
     std::vector<std::vector<resizable_tensor>> send_back_paras;
-    std::vector<int> send_back_flags;
+    volatile int* send_back_flags;
 
     task_queue tq;
 };
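The send_back_flags member changes from a std::vector<int> to a raw volatile int* (allocated with new int[...] in syncer_async.h below), which suggests the flags are written by the leader's receiver threads and busy-polled elsewhere, with volatile used to stop the compiler from caching the polled reads. A minimal sketch of that handshake under those assumptions; the names worker_done, receiver_thread and run_sync are illustrative and not part of the repository:

#include <iostream>
#include <thread>
#include <vector>

// Hypothetical sketch: one flag slot per slave, raised by a receiver thread
// and busy-polled by the synchronisation loop (mirrors send_back_flags).
volatile int* worker_done = nullptr;

void receiver_thread(int slave_index) {
    // ... receive gradients, queue a task, wait for updated parameters ...
    worker_done[slave_index] = 1;   // signal: parameters are ready to send back
}

void run_sync(int num_slaves) {
    for (int i = 0; i < num_slaves; ++i) {
        while (worker_done[i] == 0) { /* spin; volatile forces a re-read */ }
        worker_done[i] = 0;          // consume the flag, then send parameters back
    }
}

int main() {
    const int num_slaves = 2;
    worker_done = new int[num_slaves]();          // zero-initialised flag array
    std::vector<std::thread> receivers;
    for (int i = 0; i < num_slaves; ++i)
        receivers.emplace_back(receiver_thread, i);
    run_sync(num_slaves);
    for (auto& r : receivers) r.join();
    std::cout << "all slaves signalled" << std::endl;
    // Note: volatile is not a synchronisation primitive in standard C++;
    // std::atomic<int> would make this pattern formally race-free.
}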
19 changes: 14 additions & 5 deletions dlib/dnn/syncer/syncer_async.h
@@ -16,7 +16,8 @@ void dnn_async_leader<trainer_type>::init_reciever_pool() {
     });
 
     this->send_back_paras.resize (this->get_running_slaves_num());
-    this->send_back_flags.resize (this->get_running_slaves_num());
+    // this->send_back_flags.resize (this->get_running_slaves_num());
+    this->send_back_flags = new int[this->get_running_slaves_num()];
 
     for (size_t i = 0; i < this->send_back_paras.size(); i++) {
         this->send_back_paras[i].resize (this->trainer->num_computational_layers);
@@ -56,6 +57,8 @@ void dnn_async_leader<trainer_type>::async_thread (int slave_index) {
 
     while (1) {
         this->recieve_gradients_from_one (slave_index, gradients);
+        if (this->slaves_status[slave_index] != slaveStatus::Running)
+            break;
         std::cout << "Recieved from slave " << slave_index << std::endl;
 
         task t (slave_index, 1, gradients);
@@ -74,10 +77,16 @@ template<typename trainer_type>
 int dnn_async_leader<trainer_type>::recieve_gradients_from_one (int slave_index, std::vector<resizable_tensor> &cli_tensors) {
     // std::cout << slave_index << ":" << &this->slaves_conns << std::endl;
 
-    for (size_t i = 0; i < cli_tensors.size(); i++) {
-        if (cli_tensors[i].size() != 0) {
-            network::recieve_compressed_tensor (this->slaves_conns[slave_index], &cli_tensors[i]);
-        }
+    try {
+        for (size_t i = 0; i < cli_tensors.size(); i++) {
+            if (cli_tensors[i].size() != 0) {
+                network::recieve_compressed_tensor (this->slaves_conns[slave_index], &cli_tensors[i]);
+            }
+        }
+    } catch (...) {
+        std::cout << "It seems that slave " << slave_index << " closed" << std::endl;
+        this->slaves_status[slave_index] = slaveStatus::NotConn;
+        close_gracefully(this->slaves_conns[slave_index], 1);
     }
 
     return 1;
@@ -141,7 +150,7 @@ void dnn_async_leader<trainer_type>::sync() {
     while (this->trainer->synchronization_status != 3) { }
 
     visit_layer_parameters (this->trainer->devices[0]->net, [&] (size_t k, tensor & t) {
-        std::cout << "SP get parameteres from" << &t << std::endl;
+        // std::cout << "SP get parameteres from" << &t << std::endl;
         this->send_back_paras[ (*i).slave_index][k] = t;
     });
 
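The try/catch added to recieve_gradients_from_one, together with the slaves_status check added to async_thread, is what lets a receiver thread terminate when its worker drops the connection: a failed receive marks the slave NotConn, and the thread breaks out of its while (1) loop on the next pass instead of blocking forever. A condensed, self-contained sketch of that shutdown path, with placeholder types standing in for dlib's connection and tensor machinery:

#include <iostream>
#include <stdexcept>
#include <vector>

enum class SlaveStatus { Running, NotConn };

// Stand-in for the networking layer: throws once the peer has closed,
// the way a failed receive surfaces as an exception in the real code.
bool peer_closed = false;
void receive_from(int slave_index, std::vector<float>& gradients) {
    if (peer_closed)
        throw std::runtime_error("connection closed");
    gradients.assign(4, 0.5f);   // pretend we received a gradient tensor
}

int main() {
    SlaveStatus status = SlaveStatus::Running;
    std::vector<float> gradients;
    int iterations = 0;

    while (true) {
        try {
            receive_from(0, gradients);
        } catch (...) {
            std::cout << "It seems that slave 0 closed" << std::endl;
            status = SlaveStatus::NotConn;       // mirrors slaves_status[i] = NotConn
        }
        if (status != SlaveStatus::Running)
            break;                               // receiver loop exits cleanly
        if (++iterations == 3)
            peer_closed = true;                  // simulate the worker disconnecting
    }
    std::cout << "receiver loop finished after " << iterations << " receives" << std::endl;
}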
2 changes: 1 addition & 1 deletion dlib/dnn/syncer/syncer_leader_default.h
@@ -149,7 +149,7 @@ void dnn_leader<trainer_type>::send_parameters (connection *slave) {
     tensors.resize (this->trainer->num_computational_layers);
 
     visit_layer_parameters (this->trainer->devices[0]->net, [&] (size_t i, tensor & t) {
-        std::cout << "SP get parameteres from" << &t << std::endl;
+        // std::cout << "SP get parameteres from" << &t << std::endl;
         tensors[i] = &t;
     });
 
2 changes: 1 addition & 1 deletion dlib/dnn/syncer/utils.h
@@ -37,7 +37,7 @@ enum slaveStatus {
 struct task {
 public:
     size_t slave_index = -1;
-    bool ready = 0;
+    volatile bool ready = 0;
     std::vector<resizable_tensor> tensors;
 
     task () = default;
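As with the new volatile int* flag array, marking ready as volatile only keeps the compiler from caching or optimising away the polled reads; in standard C++ it does not make the cross-thread access race-free. A possible alternative, not what this commit does, is std::atomic, which supports the same polling pattern with defined behaviour:

#include <atomic>
#include <cstddef>
#include <vector>

// Sketch of an atomic variant of the task's ready flag; resizable_tensor is
// replaced by a placeholder. The repository's struct keeps `volatile bool ready`.
struct task_sketch {
    std::size_t slave_index = static_cast<std::size_t>(-1);
    std::atomic<bool> ready{false};
    std::vector<float> tensors;

    // Producer: fill tensors, then ready.store(true, std::memory_order_release);
    // Consumer: spin on ready.load(std::memory_order_acquire) before reading tensors.
};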
6 changes: 3 additions & 3 deletions dlib/dnn/trainer.h
@@ -914,9 +914,9 @@ namespace dlib
             for (size_t i = 0; i < devices.size(); ++i)
                 tp[i]->wait_for_all_tasks();
 
-            visit_layer_parameters (devices[0]->net, [&] (size_t j, tensor & t) {
-                std::cout<<"TR get parameteres from" << j << " -- "<<&t << std::endl;
-            });
+            // visit_layer_parameters (devices[0]->net, [&] (size_t j, tensor & t) {
+            //     std::cout<<"TR get parameteres from" << j << " -- "<<&t << std::endl;
+            // });
 
             // Every now and then force all the parameters to be the same just to make
             // sure they aren't drifting apart due to any non-deterministic behavior on
34 changes: 17 additions & 17 deletions examples/dnn_dist_worker.cpp
@@ -148,6 +148,8 @@ int main (int argc, char **argv) try {
     int mark = 0;
     auto time = 0;
 
+    sleep ((unsigned int) (me.number % 2) * 10);
+
     while (true) {
         mark += 1;
         auto epoch_time = system_clock::now(); // HPZ: Counting
@@ -177,7 +179,6 @@
         // accuracy(net, local_training_images, local_training_labels);
         // accuracy(net, testing_images, testing_labels);
 
-        sleep ((unsigned int) me.number);
         auto sync_time = system_clock::now(); // HPZ: Counting
         syncer.sn_sync();
         std::cout << "(sync time " << std::chrono::duration_cast<std::chrono::milliseconds> (system_clock::now() - sync_time).count() << std::endl; // HPZ: Counting
@@ -199,31 +200,30 @@ int main (int argc, char **argv) try {
         // accuracy(net, testing_images, testing_labels);
         //
 
-        if (ismaster) {
-            if (trainer.learning_rate <= 0.001) {
-                std::cout << "---------------------------" << std::endl;
-                std::cout << "|Exit because l_rate |" << std::endl;
-                std::cout << "---------------------------" << std::endl;
-                break;
-            }
-
-            if (epoch >= 60) {
-                std::cout << "---------------------------" << std::endl;
-                std::cout << "|Exit because 60 epochs |" << std::endl;
-                std::cout << "---------------------------" << std::endl;
-                break;
-            }
-        }
+        if (trainer.learning_rate <= 0.001) {
+            std::cout << "---------------------------" << std::endl;
+            std::cout << "|Exit because l_rate |" << std::endl;
+            std::cout << "---------------------------" << std::endl;
+            break;
+        }
+
+        if (epoch >= 60) {
+            std::cout << "---------------------------" << std::endl;
+            std::cout << "|Exit because 60 epochs |" << std::endl;
+            std::cout << "---------------------------" << std::endl;
+            break;
+        }
 
 
     }
 
     // trainer.train(training_images, training_labels);
 
-    local_training.accuracy (net);
-    testing.accuracy (net);
+    // local_training.accuracy (net);
+    // testing.accuracy (net);
     std::cout << "All time: " << time << std::endl;
     std::cout << trainer << std::endl;
+    sleep((unsigned int) 3600);
 
     // At this point our net object should have learned how to classify MNIST images. But
     // before we try it out let's save it to disk. Note that, since the trainer has been
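Moving the learning-rate and epoch checks out of the if (ismaster) guard is presumably the actual fix for training never finishing: previously only the master worker could break out of the while (true) loop, so the other workers kept looping and waiting to synchronise. A stripped-down sketch of the corrected control flow, using stand-in variables rather than the dlib trainer:

#include <iostream>

int main() {
    double learning_rate = 0.01;
    int epoch = 0;

    // Every worker, master or not, now evaluates the stop conditions itself.
    while (true) {
        ++epoch;
        learning_rate *= 0.9;            // pretend the trainer shrinks the rate

        if (learning_rate <= 0.001) {
            std::cout << "Exit because l_rate" << std::endl;
            break;
        }
        if (epoch >= 60) {
            std::cout << "Exit because 60 epochs" << std::endl;
            break;
        }
    }
    std::cout << "worker stopped after " << epoch << " epochs" << std::endl;
}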
