From 5a482e20f9c23ce06d342635956e87de36f46f71 Mon Sep 17 00:00:00 2001
From: CoderSherlock
Date: Sun, 21 Oct 2018 15:14:55 -0400
Subject: [PATCH] Change send_back_flags implementation

Fix a bug that kept training from finishing.
Refactor code.
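send_back_flags used to be a std::vector<int>; it is now a plain array of
volatile int (and task::ready is now volatile bool), since the receiver
threads and the sync loop poll these flags from different threads. The
receiver thread for a slave now exits its loop as soon as the slave stops
Running, and a failed receive marks the slave NotConn and closes the
connection instead of blocking forever, which appears to be why training
never finished. The worker example now staggers start-up with a parity-based
sleep before the training loop and applies the learning-rate and 60-epoch
exit checks on every worker instead of only on the master.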
---
 dlib/dnn/syncer/syncer.h                |  4 +--
 dlib/dnn/syncer/syncer_async.h          | 19 ++++++++++----
 dlib/dnn/syncer/syncer_leader_default.h |  2 +-
 dlib/dnn/syncer/utils.h                 |  2 +-
 dlib/dnn/trainer.h                      |  6 ++---
 examples/dnn_dist_worker.cpp            | 34 ++++++++++++-------------
 6 files changed, 38 insertions(+), 29 deletions(-)

diff --git a/dlib/dnn/syncer/syncer.h b/dlib/dnn/syncer/syncer.h
index 6ec58c6b..6cdfc7e9 100644
--- a/dlib/dnn/syncer/syncer.h
+++ b/dlib/dnn/syncer/syncer.h
@@ -47,7 +47,7 @@ class dnn_syncer {

     int verbose = 0;

-    int num_debug = 1;
+    int num_debug = 0;

     int exper = 0;

@@ -247,7 +247,7 @@ class dnn_async_leader : public dnn_leader<trainer_type> {

     std::vector<std::thread *> recievers;
     std::vector<std::vector<resizable_tensor>> send_back_paras;
-    std::vector<int> send_back_flags;
+    volatile int* send_back_flags;

     task_queue tq;
 };
diff --git a/dlib/dnn/syncer/syncer_async.h b/dlib/dnn/syncer/syncer_async.h
index 6d7da298..53a612c8 100644
--- a/dlib/dnn/syncer/syncer_async.h
+++ b/dlib/dnn/syncer/syncer_async.h
@@ -16,7 +16,8 @@ void dnn_async_leader<trainer_type>::init_reciever_pool() {
     });

     this->send_back_paras.resize (this->get_running_slaves_num());
-    this->send_back_flags.resize (this->get_running_slaves_num());
+    // this->send_back_flags.resize (this->get_running_slaves_num());
+    this->send_back_flags = new int[this->get_running_slaves_num()];

     for (size_t i = 0; i < this->send_back_paras.size(); i++) {
         this->send_back_paras[i].resize (this->trainer->num_computational_layers);
@@ -56,6 +57,8 @@ void dnn_async_leader<trainer_type>::async_thread (int slave_index) {
     while (1) {
         this->recieve_gradients_from_one (slave_index, gradients);
+        if (this->slaves_status[slave_index] != slaveStatus::Running)
+            break;
         std::cout << "Recieved from slave " << slave_index << std::endl;

         task t (slave_index, 1, gradients);
@@ -74,10 +77,16 @@ template <typename trainer_type>
 int dnn_async_leader<trainer_type>::recieve_gradients_from_one (int slave_index, std::vector<resizable_tensor> &cli_tensors) {
     // std::cout << slave_index << ":" << &this->slaves_conns << std::endl;

-    for (size_t i = 0; i < cli_tensors.size(); i++) {
-        if (cli_tensors[i].size() != 0) {
-            network::recieve_compressed_tensor (this->slaves_conns[slave_index], &cli_tensors[i]);
+    try {
+        for (size_t i = 0; i < cli_tensors.size(); i++) {
+            if (cli_tensors[i].size() != 0) {
+                network::recieve_compressed_tensor (this->slaves_conns[slave_index], &cli_tensors[i]);
+            }
         }
+    } catch (...) {
+        std::cout << "It seems that slave " << slave_index << " closed" << std::endl;
+        this->slaves_status[slave_index] = slaveStatus::NotConn;
+        close_gracefully(this->slaves_conns[slave_index], 1);
     }

     return 1;
@@ -141,7 +150,7 @@ void dnn_async_leader<trainer_type>::sync() {
     while (this->trainer->synchronization_status != 3) { }

     visit_layer_parameters (this->trainer->devices[0]->net, [&] (size_t k, tensor & t) {
-        std::cout << "SP get parameteres from" << &t << std::endl;
+        // std::cout << "SP get parameteres from" << &t << std::endl;
         this->send_back_paras[ (*i).slave_index][k] = t;
     });

diff --git a/dlib/dnn/syncer/syncer_leader_default.h b/dlib/dnn/syncer/syncer_leader_default.h
index 4c93f8de..30f238c2 100644
--- a/dlib/dnn/syncer/syncer_leader_default.h
+++ b/dlib/dnn/syncer/syncer_leader_default.h
@@ -149,7 +149,7 @@ void dnn_leader<trainer_type>::send_parameters (connection *slave) {
     tensors.resize (this->trainer->num_computational_layers);

     visit_layer_parameters (this->trainer->devices[0]->net, [&] (size_t i, tensor & t) {
-        std::cout << "SP get parameteres from" << &t << std::endl;
+        // std::cout << "SP get parameteres from" << &t << std::endl;
         tensors[i] = &t;
     });

diff --git a/dlib/dnn/syncer/utils.h b/dlib/dnn/syncer/utils.h
index 1a00cb1b..9e4e26ba 100644
--- a/dlib/dnn/syncer/utils.h
+++ b/dlib/dnn/syncer/utils.h
@@ -37,7 +37,7 @@ enum slaveStatus {
 struct task {
 public:
     size_t slave_index = -1;
-    bool ready = 0;
+    volatile bool ready = 0;
     std::vector<resizable_tensor> tensors;

     task () = default;
diff --git a/dlib/dnn/trainer.h b/dlib/dnn/trainer.h
index 03c213f6..bdc288fe 100644
--- a/dlib/dnn/trainer.h
+++ b/dlib/dnn/trainer.h
@@ -914,9 +914,9 @@
             for (size_t i = 0; i < devices.size(); ++i)
                 tp[i]->wait_for_all_tasks();

-            visit_layer_parameters (devices[0]->net, [&] (size_t j, tensor & t) {
-                std::cout<<"TR get parameteres from" << j << " -- "<<&t << std::endl;
-            });
+            // visit_layer_parameters (devices[0]->net, [&] (size_t j, tensor & t) {
+            //     std::cout<<"TR get parameteres from" << j << " -- "<<&t << std::endl;
+            // });

             // Every now and then force all the parameters to be the same just to make
             // sure they aren't drifting apart due to any non-deterministic behavior on
diff --git a/examples/dnn_dist_worker.cpp b/examples/dnn_dist_worker.cpp
index 75de9bfb..d80f03c5 100644
--- a/examples/dnn_dist_worker.cpp
+++ b/examples/dnn_dist_worker.cpp
@@ -148,6 +148,8 @@ int main (int argc, char **argv) try {
     int mark = 0;
     auto time = 0;

+    sleep ((unsigned int) (me.number % 2) * 10);
+
     while (true) {
         mark += 1;
         auto epoch_time = system_clock::now();  // HPZ: Counting
@@ -177,7 +179,6 @@
         // accuracy(net, local_training_images, local_training_labels);
         // accuracy(net, testing_images, testing_labels);

-        sleep ((unsigned int) me.number);
         auto sync_time = system_clock::now();   // HPZ: Counting
         syncer.sn_sync();
         std::cout << "(sync time " << std::chrono::duration_cast<std::chrono::milliseconds> (system_clock::now() - sync_time).count() << std::endl;  // HPZ: Counting
@@ -199,20 +200,18 @@
         // accuracy(net, testing_images, testing_labels);
         //

-        if (ismaster) {
-            if (trainer.learning_rate <= 0.001) {
-                std::cout << "---------------------------" << std::endl;
-                std::cout << "|Exit because l_rate      |" << std::endl;
-                std::cout << "---------------------------" << std::endl;
-                break;
-            }
-
-            if (epoch >= 60) {
-                std::cout << "---------------------------" << std::endl;
-                std::cout << "|Exit because 60 epochs   |" << std::endl;
-                std::cout << "---------------------------" << std::endl;
-                break;
-            }
+        if (trainer.learning_rate <= 0.001) {
+            std::cout << "---------------------------" << std::endl;
+            std::cout << "|Exit because l_rate      |" << std::endl;
+            std::cout << "---------------------------" << std::endl;
+            break;
+        }
+
+        if (epoch >= 60) {
+            std::cout << "---------------------------" << std::endl;
+            std::cout << "|Exit because 60 epochs   |" << std::endl;
+            std::cout << "---------------------------" << std::endl;
+            break;
         }
     }

@@ -220,10 +219,11 @@
     //  trainer.train(training_images, training_labels);

-    local_training.accuracy (net);
-    testing.accuracy (net);
+    // local_training.accuracy (net);
+    // testing.accuracy (net);
     std::cout << "All time: " << time << std::endl;
     std::cout << trainer << std::endl;
+    sleep((unsigned int) 3600);

     // At this point our net object should have learned how to classify MNIST images.  But
     // before we try it out let's save it to disk.  Note that, since the trainer has been
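
Note (reviewer sketch, not part of the patch): volatile stops the compiler
from caching the flag reads, but concurrent reads and writes to a plain int
are still a data race in standard C++; std::atomic is the portable tool for
this kind of handshake. Below is a minimal, self-contained sketch of the same
leader/receiver flag pattern; the names (num_slaves, the lambda bodies) are
illustrative and not taken from the dlib syncer.

    #include <atomic>
    #include <iostream>
    #include <thread>
    #include <vector>

    int main() {
        const int num_slaves = 4;  // illustrative; stands in for get_running_slaves_num()
        std::vector<std::atomic<int>> send_back_flags(num_slaves);
        for (auto &flag : send_back_flags)
            flag.store(0);

        // One receiver per slave, as in init_reciever_pool().
        std::vector<std::thread> recievers;
        for (int i = 0; i < num_slaves; ++i) {
            recievers.emplace_back([i, &send_back_flags] {
                // Poll until the sync loop publishes this slave's parameters.
                while (send_back_flags[i].load(std::memory_order_acquire) == 0)
                    std::this_thread::yield();
                std::cout << "slave " << i << ": parameters ready to send back\n";
            });
        }

        // Sync loop: publish "ready" once per slave; the release store pairs
        // with the acquire load above, so everything written before the store
        // is visible to the receiver.
        for (int i = 0; i < num_slaves; ++i)
            send_back_flags[i].store(1, std::memory_order_release);

        for (auto &t : recievers)
            t.join();
    }

With this shape the flags clean themselves up and the threads are joined,
whereas the raw new int[...] in the patch would otherwise need a matching
delete[].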