From 895ad6f1a2e8aa973a9028839c9fbbda22c2441d Mon Sep 17 00:00:00 2001
From: JoseJVS <villamar_jose@hotmail.com>
Date: Thu, 24 Oct 2024 14:53:03 +0200
Subject: [PATCH] Added NVTX ranges

---
 CMakeLists.txt             |   2 +
 cmake/ProcessOptions.cmake |   8 ++
 libnestutil/config.h.in    |   3 +
 src/CMakeLists.txt         |   1 +
 src/nestgpu.cu             | 155 +++++++++++++++++++++++++++++++------
 src/nvtx_macros.h          |  33 ++++++++
 6 files changed, 180 insertions(+), 22 deletions(-)
 create mode 100644 src/nvtx_macros.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e17c705b0..e8aebb3a3 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -64,6 +64,7 @@ set( with-includes OFF CACHE STRING "Add additional include paths [default=OFF].
 set( with-defines OFF CACHE STRING "Additional defines, e.g. '-DXYZ=1' [default=OFF]. Separate multiple defines by ';'." )
 set( with-max-rreg-count "55" CACHE STRING "Set a maximum amount of register used when compiling [default=55]. Separate multiple flags by ';'." )
 set( with-ptxas-options OFF CACHE STRING "Options for ptxas compiling [default=OFF]. Separate multiple flags by ';'." )
+set( with-nvtx OFF CACHE STRING "Enable compilation with NVTX ranges." )
 
 # generic build configuration
 set( with-version-suffix OFF CACHE STRING "Set a user defined version suffix [default='']." )
@@ -130,6 +131,7 @@ nest_process_with_optimize()
 nest_process_with_debug()
 nest_process_with_warning()
 nestgpu_post_process_compile_flags()
+nestgpu_process_with_nvtx()
 
 nest_process_version_suffix()
 
diff --git a/cmake/ProcessOptions.cmake b/cmake/ProcessOptions.cmake
index b36355cc1..720f13cfb 100644
--- a/cmake/ProcessOptions.cmake
+++ b/cmake/ProcessOptions.cmake
@@ -209,6 +209,14 @@ function( NESTGPU_POST_PROCESS_COMPILE_FLAGS )
 endfunction()
 
 
+function( NESTGPU_PROCESS_WITH_NVTX )
+  set( HAVE_NVTX OFF PARENT_SCOPE )
+  if ( with-nvtx )
+    set( HAVE_NVTX ON PARENT_SCOPE )
+  endif()
+endfunction()
+
+
 function( NEST_PROCESS_VERSION_SUFFIX )
   if ( with-version-suffix )
     foreach ( flag ${with-version-suffix} )
diff --git a/libnestutil/config.h.in b/libnestutil/config.h.in
index e16bc45bd..bba81bf6c 100644
--- a/libnestutil/config.h.in
+++ b/libnestutil/config.h.in
@@ -48,6 +48,9 @@
 /* Define to 1 if you have the <unistd.h> header file. */
 #cmakedefine HAVE_UNISTD_H 1
 
+/* Define to 1 for compiling with nvtx ranges. */
+#cmakedefine HAVE_NVTX 1
+
 /* Version number of package */
 #define VERSION @NEST_GPU_VERSION_STRING@
 
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index 65bbcf7ff..6a8918b5d 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -75,6 +75,7 @@ nestgpu_C.h
 neuron_models.h
 ngpu_exception.h
 node_group.h
+nvtx_macros.h
 parrot_neuron.h
 poiss_gen.h
 poiss_gen_variables.h
diff --git a/src/nestgpu.cu b/src/nestgpu.cu
index 37747a2d6..7db72abe6 100644
--- a/src/nestgpu.cu
+++ b/src/nestgpu.cu
@@ -52,6 +52,8 @@
 #include "input_spike_buffer.h"
 #include "remote_connect.h"
 
+#include "nvtx_macros.h"
+
 ////////////// TEMPORARY
 #include "scan.h"
 //////////////////////
@@ -134,7 +136,7 @@ NESTGPU::setNHosts( int n_hosts )
   n_remote_nodes_.assign( n_hosts_, 0 );
   external_spike_flag_ = ( n_hosts > 1 ) ? true : false;
   gpuErrchk( cudaMemcpyToSymbolAsync( ExternalSpikeFlag, &external_spike_flag_, sizeof( bool ) ) );
-      
+
   return 0;
 }
 
@@ -150,6 +152,7 @@ NESTGPU::setThisHost( int i_host )
 
 NESTGPU::NESTGPU()
 {
+  PUSH_RANGE( "init", 0 );
   start_real_time_ = getRealTime();
   n_hosts_ = 1;
   this_host_ = 0;
@@ -216,6 +219,7 @@ NESTGPU::NESTGPU()
   ExternalSpikeReset_time_ = 0;
 
   first_simulation_flag_ = true;
+  POP_RANGE
 }
 
 NESTGPU::~NESTGPU()
@@ -341,10 +345,10 @@ NESTGPU::setConnStructType( int conn_struct_type )
 
   // set time resolution in connection object
   conn_->setTimeResolution( time_resolution_ );
-  
+
   // set start real time in connection object
   conn_->setStartRealTime( start_real_time_ );
-  
+
   return 0;
 }
 
@@ -390,6 +394,8 @@ NESTGPU::CheckUncalibrated( std::string message )
 int
 NESTGPU::Calibrate()
 {
+  PUSH_RANGE( "calibrate", 0 );
+
   CheckUncalibrated( "Calibration can be made only once" );
 
   PRINT_TIME;
@@ -419,13 +425,13 @@ NESTGPU::Calibrate()
   conn_->organizeConnections( GetNTotalNodes() );
 
   PRINT_TIME;
-  
+
   conn_->calibrate();
 
   PRINT_TIME;
-    
+
   int max_delay_num = max_spike_buffer_size_;
-  
+
   max_spike_num_ = ( int ) round( max_spike_num_fact_ * GetNTotalNodes() * max_delay_num );
   max_spike_num_ = ( max_spike_num_ > 1 ) ? max_spike_num_ : 1;
 
@@ -434,15 +440,15 @@ NESTGPU::Calibrate()
 
   max_remote_spike_num_ = max_spike_per_host_ * n_hosts_ * max_remote_spike_num_fact_;
   max_remote_spike_num_ = ( max_remote_spike_num_ > 1 ) ? max_remote_spike_num_ : 1;
-  
+
   conn_->initInputSpikeBuffer( GetNLocalNodes(), GetNTotalNodes(), max_remote_spike_num_ );
 
   PRINT_TIME;
-  
+
   poiss_conn::organizeDirectConnections( conn_ );
 
   PRINT_TIME;
-  
+
   for ( unsigned int i = 0; i < node_vect_.size(); i++ )
   {
     if ( node_vect_[ i ]->has_dir_conn_ )
@@ -452,7 +458,7 @@ NESTGPU::Calibrate()
   }
 
   PRINT_TIME;
-  
+
   if ( remove_conn_key_ )
   {
     conn_->freeConnectionKey();
@@ -462,7 +468,7 @@ NESTGPU::Calibrate()
   NestedLoop::Init( n_spike_buffers );
 
   PRINT_TIME;
-  
+
   // temporary
   gpuErrchk( cudaPeekAtLastError() );
   gpuErrchk( cudaDeviceSynchronize() );
@@ -473,15 +479,15 @@ NESTGPU::Calibrate()
   NodeGroupArrayInit();
 
   PRINT_TIME;
-  
+
   SpikeInit( max_spike_num_ );
 
   PRINT_TIME;
-  
+
   spikeBufferInit( GetNTotalNodes(), max_spike_buffer_size_, conn_->getSpikeBufferAlgo() );
 
   PRINT_TIME;
-  
+
   if ( n_hosts_ > 1 )
   {
     conn_->remoteConnectionMapCalibrate( GetNLocalNodes() );
@@ -510,13 +516,15 @@ NESTGPU::Calibrate()
   }
 
   PRINT_TIME;
-  
+
   SynGroupCalibrate();
 
   PRINT_TIME;
 
   gpuErrchk( cudaMemcpyToSymbolAsync( NESTGPUTimeResolution, &time_resolution_, sizeof( float ) ) );
 
+  POP_RANGE
+
   return 0;
 }
 
@@ -530,6 +538,8 @@ NESTGPU::Simulate( float sim_time )
 int
 NESTGPU::Simulate()
 {
+  PUSH_RANGE( "simulate", 0 );
+
   StartSimulation();
 
   for ( long long it = 0; it < Nt_; it++ )
@@ -542,6 +552,8 @@ NESTGPU::Simulate()
   }
   EndSimulation();
 
+  POP_RANGE
+
   return 0;
 }
 
@@ -629,17 +641,27 @@ NESTGPU::SimulationStep()
   {
     StartSimulation();
   }
+
+  PUSH_RANGE( "simulation_step", 1 );
+
   double time_mark;
 
   if ( conn_->getSpikeBufferAlgo() != INPUT_SPIKE_BUFFER_ALGO )
   {
+
+    PUSH_RANGE( "spkie_buffer_update", 2 );
+
     time_mark = getRealTime();
     SpikeBufferUpdate<<< ( GetNTotalNodes() + 1023 ) / 1024, 1024 >>>();
     DBGCUDASYNC;
 
     SpikeBufferUpdate_time_ += ( getRealTime() - time_mark );
+
+    POP_RANGE
   }
 
+  PUSH_RANGE( "symbol_copy", 2 );
+
   time_mark = getRealTime();
   neural_time_ = neur_t0_ + ( double ) time_resolution_ * ( it_ + 1 );
   // std::cout << "neural_time_: " << neural_time_ << "\n";
@@ -647,6 +669,8 @@ NESTGPU::SimulationStep()
   long long time_idx = ( int ) round( neur_t0_ / time_resolution_ ) + it_ + 1;
   gpuErrchk( cudaMemcpyToSymbolAsync( NESTGPUTimeIdx, &time_idx, sizeof( long long ) ) );
 
+  POP_RANGE
+
   /*
   if (ConnectionSpikeTimeFlag) {
     if ( (time_idx & 0xffff) == 0x8000) {
@@ -658,41 +682,78 @@ NESTGPU::SimulationStep()
   }
   */
 
+  PUSH_RANGE( "neuron_update", 2 );
+
   for ( unsigned int i = 0; i < node_vect_.size(); i++ )
   {
     node_vect_[ i ]->Update( it_, neural_time_ );
   }
   DBGCUDASYNC;
 
+  POP_RANGE
+
+  PUSH_RANGE( "write_records", 2 );
+
   neuron_Update_time_ += ( getRealTime() - time_mark );
   multimeter_->WriteRecords( neural_time_, time_idx );
-  
+
+  POP_RANGE
+
   if ( n_hosts_ > 1 )
   {
+
+    PUSH_RANGE( "count_external_spikes", 2 );
+
     int n_ext_spikes;
     time_mark = getRealTime();
     gpuErrchk( cudaMemcpy( &n_ext_spikes, d_ExternalSpikeNum, sizeof( int ), cudaMemcpyDeviceToHost ) );
     copy_ext_spike_time_ += ( getRealTime() - time_mark );
 
+    POP_RANGE
+
     if ( n_ext_spikes != 0 )
     {
+
+      PUSH_RANGE( "organize_external_spikes", 2 );
+
       time_mark = getRealTime();
       organizeExternalSpikes( n_ext_spikes );
       organizeExternalSpike_time_ += ( getRealTime() - time_mark );
+
+      POP_RANGE
     }
+
+    PUSH_RANGE( "send_external_spikes", 2 );
+
     time_mark = getRealTime();
     SendSpikeToRemote( n_ext_spikes );
-
     SendSpikeToRemote_time_ += ( getRealTime() - time_mark );
+
+    POP_RANGE
+
+    PUSH_RANGE( "recv_external_spikes", 2 );
+
     time_mark = getRealTime();
     RecvSpikeFromRemote();
     RecvSpikeFromRemote_time_ += ( getRealTime() - time_mark );
+
+    POP_RANGE
+
+    PUSH_RANGE( "copy_external_spikes", 2 );
+
     CopySpikeFromRemote();
+
+    POP_RANGE
   }
-  
+
   if ( conn_->getSpikeBufferAlgo() == INPUT_SPIKE_BUFFER_ALGO )
   {
+
+    PUSH_RANGE( "deliver_spikes", 2 );
+
     conn_->deliverSpikes();
+
+    POP_RANGE
   }
   else
   {
@@ -700,11 +761,20 @@ NESTGPU::SimulationStep()
 
     // Call will get delayed until ClearGetSpikesArrays()
     // afterwards the value of n_spikes will be available
+
+    PUSH_RANGE( "count_local_spikes", 2 );
+
     gpuErrchk( cudaMemcpyAsync( &n_spikes, d_SpikeNum, sizeof( int ), cudaMemcpyDeviceToHost ) );
     ClearGetSpikeArrays();
     gpuErrchk( cudaDeviceSynchronize() );
+
+    POP_RANGE
+
     if ( n_spikes > 0 )
     {
+
+      PUSH_RANGE( "nested_loop_delivery", 2 );
+
       time_mark = getRealTime();
       switch ( conn_struct_type_ )
       {
@@ -718,9 +788,13 @@ NESTGPU::SimulationStep()
         throw ngpu_exception( "Unrecognized connection structure type index" );
       }
       NestedLoop_time_ += ( getRealTime() - time_mark );
+
+      POP_RANGE
     }
   }
-  
+
+  PUSH_RANGE( "send_direct_spikes", 2 );
+
   time_mark = getRealTime();
   for ( unsigned int i = 0; i < node_vect_.size(); i++ )
   {
@@ -731,6 +805,10 @@ NESTGPU::SimulationStep()
   }
   poisson_generator_time_ += ( getRealTime() - time_mark );
 
+  POP_RANGE
+
+  PUSH_RANGE( "get_input_spikes", 2 );
+
   time_mark = getRealTime();
   for ( unsigned int i = 0; i < node_vect_.size(); i++ )
   {
@@ -775,36 +853,61 @@ NESTGPU::SimulationStep()
     }
   }
 
+  POP_RANGE
+
   GetSpike_time_ += ( getRealTime() - time_mark );
 
+  PUSH_RANGE( "reset_local_spikes", 2 );
+
   time_mark = getRealTime();
   SpikeReset<<< 1, 1 >>>();
   DBGCUDASYNC;
-
   gpuErrchk( cudaPeekAtLastError() );
   SpikeReset_time_ += ( getRealTime() - time_mark );
-  
+
+  POP_RANGE
+
   if ( n_hosts_ > 1 )
   {
+
+    PUSH_RANGE( "reset_external_spikes", 2 );
+
     time_mark = getRealTime();
     ExternalSpikeReset();
     ExternalSpikeReset_time_ += ( getRealTime() - time_mark );
+
+    POP_RANGE
   }
 
   if ( conn_->getSpikeBufferAlgo() != INPUT_SPIKE_BUFFER_ALGO )
   {
     if ( conn_->getNRevConn() > 0 )
     {
+
+      PUSH_RANGE( "reset_reverse_spikes", 2 );
+
       // time_mark = getRealTime();
       revSpikeReset<<< 1, 1 >>>();
       gpuErrchk( cudaPeekAtLastError() );
+
+      POP_RANGE
+
+      PUSH_RANGE( "update_reverse_spike_buffer", 2 );
+
       revSpikeBufferUpdate<<< ( GetNLocalNodes() + 1023 ) / 1024, 1024 >>>( GetNLocalNodes() );
       gpuErrchk( cudaPeekAtLastError() );
       unsigned int n_rev_spikes;
       gpuErrchk(
         cudaMemcpy( &n_rev_spikes, conn_->getDevRevSpikeNumPt(), sizeof( unsigned int ), cudaMemcpyDeviceToHost ) );
+
+      POP_RANGE
+
+
       if ( n_rev_spikes > 0 )
       {
+
+        PUSH_RANGE( "reverse_spikes_nested_loop_deliver", 2 );
+
         switch ( conn_struct_type_ )
         {
         case i_conn12b:
@@ -816,11 +919,15 @@ NESTGPU::SimulationStep()
         default:
           throw ngpu_exception( "Unrecognized connection structure type index" );
         }
+
+        POP_RANGE
       }
       // RevSpikeBufferUpdate_time_ += (getRealTime() - time_mark);
     }
   }
 
+  PUSH_RANGE( "buffer_recorded_spike_times", 2 );
+
   for ( unsigned int i = 0; i < node_vect_.size(); i++ )
   {
     // if spike times recording is activated for node group...
@@ -836,8 +943,12 @@ NESTGPU::SimulationStep()
     }
   }
 
+  POP_RANGE
+
   it_++;
-    
+
+  POP_RANGE
+
   return 0;
 }
 
diff --git a/src/nvtx_macros.h b/src/nvtx_macros.h
new file mode 100644
index 000000000..443ea1bc5
--- /dev/null
+++ b/src/nvtx_macros.h
@@ -0,0 +1,33 @@
+#pragma once
+
+#include <config.h>
+
+#ifdef HAVE_NVTX
+#include <cuda_profiler_api.h>
+#include <nvtx3/nvToolsExt.h>
+
+const uint32_t colors[] = { 0xff00ff00, 0xff0000ff, 0xffffff00, 0xffff00ff, 0xff00ffff, 0xffff0000, 0xffffffff };
+const int num_colors = sizeof( colors ) / sizeof( uint32_t );
+
+#define PUSH_RANGE( name, cid )                        \
+  {                                                    \
+    int color_id = cid;                                \
+    color_id = color_id % num_colors;                  \
+    nvtxEventAttributes_t eventAttrib = { 0 };         \
+    eventAttrib.version = NVTX_VERSION;                \
+    eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;  \
+    eventAttrib.colorType = NVTX_COLOR_ARGB;           \
+    eventAttrib.color = colors[ color_id ];            \
+    eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII; \
+    eventAttrib.message.ascii = name;                  \
+    nvtxRangePushEx( &eventAttrib );                   \
+  }
+#define POP_RANGE nvtxRangePop();
+#define PROFILER_START cudaProfilerStart();
+#define PROFILER_STOP cudaProfilerStop();
+#else
+#define PUSH_RANGE( name, cid )
+#define POP_RANGE
+#define PROFILER_START
+#define PROFILER_STOP
+#endif