From 417b2a4ac7a0eb76ee55ae248227ace204a53384 Mon Sep 17 00:00:00 2001
From: Roger Waleffe <rogerwaleffe@Rogers-MBP.attlocal.net>
Date: Sat, 18 Nov 2023 20:56:01 -0600
Subject: [PATCH] add per-iteration timing output to pipeline_gpu workers

---
 src/cpp/src/pipeline/pipeline_gpu.cpp | 44 ++++++++++++++++++++++++++-
 1 file changed, 43 insertions(+), 1 deletion(-)
diff --git a/src/cpp/src/pipeline/pipeline_gpu.cpp b/src/cpp/src/pipeline/pipeline_gpu.cpp
index fcb23dde..e4e7de96 100644
--- a/src/cpp/src/pipeline/pipeline_gpu.cpp
+++ b/src/cpp/src/pipeline/pipeline_gpu.cpp
@@ -44,6 +44,8 @@ void updateEvalForBatch(Pipeline* pipeline_, shared_ptr<Batch> batch) {
 void RemoteLoadWorker::run() {
     while (!done_) {
         while (!paused_) {
+            Timer t(false);  // automatic storage: nothing is heap-allocated (or leaked) per loop iteration
+            t.start();
             // NOTE: this "train" is probably not set correctly all the time
             shared_ptr<Batch> batch = std::make_shared<Batch>(pipeline_->dataloader_->train_);
 
@@ -77,6 +79,8 @@ void RemoteLoadWorker::run() {
             } else {
                 ((PipelineCPU *)pipeline_)->loaded_batches_->blocking_push(batch);
             }
+            t.stop();
+            std::cout<<"remote load: "<<t.getDuration()<<"\n";
         }
         nanosleep(&sleep_time_, NULL);
     }
@@ -85,7 +89,12 @@ void RemoteLoadWorker::run() {
 void RemoteToDeviceWorker::run() {
     while (!done_) {
         while (!paused_) {
+            Timer t(false);  // automatic storage: nothing is heap-allocated (or leaked) per loop iteration
+            t.start();
             auto tup = ((PipelineGPU *)pipeline_)->loaded_batches_->blocking_pop();
+            t.stop();
+            std::cout<<"remote to block: "<<t.getDuration()<<"\n";
+            t.start();
             bool popped = std::get<0>(tup);
             shared_ptr<Batch> batch = std::get<1>(tup);
             if (!popped) {
@@ -108,6 +117,8 @@ void RemoteToDeviceWorker::run() {
 
             batch->creator_id_ = pipeline_->model_->pg_gloo_->pg->getRank();
             batch->remoteTo(pipeline_->model_->pg_gloo_->pg, child, tag);
+            t.stop();
+            std::cout<<"remote to: "<<t.getDuration()<<"\n";
         }
         nanosleep(&sleep_time_, NULL);
     }
@@ -120,7 +131,12 @@ void BatchToDeviceWorker::run() {
 
     while (!done_) {
         while (!paused_) {
+            Timer t(false);  // automatic storage: nothing is heap-allocated (or leaked) per loop iteration
+            t.start();
             auto tup = ((PipelineGPU *)pipeline_)->loaded_batches_->blocking_pop();
+            t.stop();
+            std::cout<<"batch to block: "<<t.getDuration()<<"\n";
+            t.start();
             bool popped = std::get<0>(tup);
             shared_ptr<Batch> batch = std::get<1>(tup);
             if (!popped) {
@@ -128,6 +144,8 @@ void BatchToDeviceWorker::run() {
             }
 
             batchToDevice(pipeline_, batch);
+            t.stop();
+            std::cout<<"batch to: "<<t.getDuration()<<"\n";
         }
         nanosleep(&sleep_time_, NULL);
     }
@@ -139,7 +157,12 @@ void ComputeWorkerGPU::run() {
 
     while (!done_) {
         while (!paused_) {
+            Timer t(false);  // automatic storage: nothing is heap-allocated (or leaked) per loop iteration
+            t.start();
             auto tup = ((PipelineGPU *)pipeline_)->device_loaded_batches_[gpu_id_]->blocking_pop();
+            t.stop();
+            std::cout<<"compute block: "<<t.getDuration()<<"\n";
+            t.start();
             bool popped = std::get<0>(tup);
             shared_ptr<Batch> batch = std::get<1>(tup);
             if (!popped) {
@@ -229,6 +252,8 @@ void ComputeWorkerGPU::run() {
                     batch->clear();
                 }
             }
+            t.stop();
+            std::cout<<"compute: "<<t.getDuration()<<"\n";
         }
         nanosleep(&sleep_time_, NULL);
     }
@@ -260,7 +285,12 @@ void EncodeNodesWorkerGPU::run() {
 void BatchToHostWorker::run() {
     while (!done_) {
         while (!paused_) {
+            Timer t(false);  // automatic storage: nothing is heap-allocated (or leaked) per loop iteration
+            t.start();
             auto tup = ((PipelineGPU *)pipeline_)->device_update_batches_[gpu_id_]->blocking_pop();
+            t.stop();
+            std::cout<<"batch to host block: "<<t.getDuration()<<"\n";
+            t.start();
             bool popped = std::get<0>(tup);
             shared_ptr<Batch> batch = std::get<1>(tup);
             if (!popped) {
@@ -285,6 +315,8 @@ void BatchToHostWorker::run() {
             }
 
             ((PipelineGPU *)pipeline_)->update_batches_->blocking_push(batch);
+            t.stop();
+            std::cout<<"batch to host: "<<t.getDuration()<<"\n";
         }
         nanosleep(&sleep_time_, NULL);
     }
@@ -293,7 +325,12 @@ void BatchToHostWorker::run() {
 void RemoteToHostWorker::run() {
     while (!done_) {
         while (!paused_) {
+            Timer t(false);  // automatic storage: nothing is heap-allocated (or leaked) per loop iteration
+            t.start();
             auto tup = ((PipelineGPU *)pipeline_)->update_batches_->blocking_pop();
+            t.stop();
+            std::cout<<"remote to host block: "<<t.getDuration()<<"\n";
+            t.start();
             bool popped = std::get<0>(tup);
             shared_ptr<Batch> batch = std::get<1>(tup);
             if (!popped) {
@@ -330,7 +367,8 @@ void RemoteToHostWorker::run() {
             lock.unlock();
 
             batch->remoteTo(pipeline_->model_->pg_gloo_->pg, parent, tag);
-
+            t.stop();
+            std::cout<<"remote to host: "<<t.getDuration()<<"\n";
         }
         nanosleep(&sleep_time_, NULL);
     }
@@ -339,6 +377,8 @@ void RemoteToHostWorker::run() {
 void RemoteListenForUpdatesWorker::run() {
     while (!done_) {
         while (!paused_) {
+            Timer t(false);  // automatic storage: nothing is heap-allocated (or leaked) per loop iteration
+            t.start();
             // NOTE: this "train" is probably not set correctly all the time
             shared_ptr<Batch> batch = std::make_shared<Batch>(pipeline_->dataloader_->train_);
 
@@ -370,6 +410,8 @@ void RemoteListenForUpdatesWorker::run() {
             batch->remoteReceive(pipeline_->model_->pg_gloo_->pg, child, tag);
 
             ((PipelineGPU *)pipeline_)->update_batches_->blocking_push(batch);
+            t.stop();
+            std::cout<<"remote listen: "<<t.getDuration()<<"\n";
         }
         nanosleep(&sleep_time_, NULL);
     }