Merge pull request #77 from mach3-software/feature_CUDABenchamrk
CUDA Benchmark
KSkwarczynski authored Jul 19, 2024
2 parents f44d96d + 3401fb8 commit ef64244
Showing 15 changed files with 207 additions and 94 deletions.
2 changes: 2 additions & 0 deletions .mailmap
@@ -6,6 +6,8 @@ Edward Atkin <[email protected]> EdAtkin <[email protected]

# Kamil Skwarczynski
Kamil Skwarczynski <[email protected]> Kamil <[email protected]>
Kamil Skwarczynski <[email protected]> Kamil Skwarczynski <[email protected]>
Kamil Skwarczynski <[email protected]> Kamil Skwarczynski <[email protected]>

# Henry Wallace
Henry Wallace <[email protected]> henry-israel <[email protected]>
49 changes: 49 additions & 0 deletions cmake/Modules/CUDASamples.cmake
@@ -31,3 +31,52 @@ endif()

cmessage(STATUS "Using the following CUDA samples paths: ${CMAKE_CUDA_SAMPLES_PATH}")
target_include_directories(MaCh3CompilerOptions INTERFACE ${CMAKE_CUDA_SAMPLES_PATH})


# KS: Perform fancy CUDA Benchmarking
DefineEnabledRequiredSwitch(MaCh3_GPU_BENCHMARK FALSE)
if(MaCh3_GPU_BENCHMARK)
cmessage(STATUS "Building CUDA Benchmark")

# KS: Define directories to iterate over, might be useful to expand
set(CUDA_SAMPLES_DIRS
"deviceQuery"
"bandwidthTest"
)

# KS: Iterate over each directory
foreach(sample_dir ${CUDA_SAMPLES_DIRS})
# Define source and destination directories
set(SRC_DIR "${CMAKE_CUDA_SAMPLES_PATH}/../Samples/1_Utilities/${sample_dir}")
set(DST_DIR "${CMAKE_BINARY_DIR}/GPU_Benchmark/")

# CW: Copy over the provided nvidia utility
# CW: Often we can't write to the CUDA install directory, so let's build it here
file(COPY ${SRC_DIR} DESTINATION ${DST_DIR})

# KS: Change directory to copied sample
set(SAMPLE_DIR "${CMAKE_BINARY_DIR}/GPU_Benchmark/${sample_dir}")

# Modify Makefile path
set(MAKEFILE_PATH "${SAMPLE_DIR}/Makefile")

# CW: Patch the little hard-coded NVIDIA makefile
execute_process(
COMMAND sed -i "s,../../../Common,${CMAKE_CUDA_SAMPLES_PATH},g" ${MAKEFILE_PATH}
RESULT_VARIABLE SED_RESULT
)

# Add custom target to run make
add_custom_target(run_${sample_dir} ALL
COMMAND make
WORKING_DIRECTORY ${SAMPLE_DIR}
)

# Add custom target to run sample
add_custom_target(run_${sample_dir}_exec ALL
COMMAND ./${sample_dir}
WORKING_DIRECTORY ${SAMPLE_DIR}
DEPENDS run_${sample_dir}
)
endforeach(sample_dir)
endif()
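
The benchmark is off by default and is enabled through the new MaCh3_GPU_BENCHMARK switch (e.g. -DMaCh3_GPU_BENCHMARK=ON at configure time); it copies NVIDIA's deviceQuery and bandwidthTest samples into the build tree, patches their Makefiles, and builds and runs them. As a rough illustration of the kind of information deviceQuery reports, the minimal CUDA sketch below queries a few device properties directly; it is not part of this commit and prints only a small subset of what the real sample shows.

// Minimal sketch (not part of this commit): query the kind of device
// properties that NVIDIA's deviceQuery sample reports.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int nDevices = 0;
  if (cudaGetDeviceCount(&nDevices) != cudaSuccess || nDevices == 0) {
    printf("No CUDA device found\n");
    return 1;
  }
  for (int i = 0; i < nDevices; ++i) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    printf("Device %d: %s\n", i, prop.name);
    printf("  Compute capability : %d.%d\n", prop.major, prop.minor);
    printf("  Multiprocessors    : %d\n", prop.multiProcessorCount);
    printf("  Global memory      : %.1f GB\n", prop.totalGlobalMem / 1.0e9);
  }
  return 0;
}
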
1 change: 1 addition & 0 deletions manager/CMakeLists.txt
@@ -5,6 +5,7 @@ set(HEADERS
MaCh3Logger.h
Monitor.h
MaCh3Exception.h
gpuUtils.cuh
)

add_library(Manager SHARED
10 changes: 6 additions & 4 deletions manager/Monitor.cpp
@@ -101,10 +101,10 @@ void GetCPUInfo(){
MACH3LOG_INFO("{}", TerminalToString("cat /proc/cpuinfo | grep -m 1 MHz"));
//KS: Below code is convoluted because I mostly work on English-based Linux but sometimes on Polish-based Linux; this ensures it works on both. We can add support for other languages if needed
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i Archit"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L1d'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L1i'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L2'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L3'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L1d |L1d:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L1i |L1i:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L2 |L2:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L3 |L3:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'Thread.* per core:|Wątków na rdzeń:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E '^CPU(:|\\(s\\)):?\\s+[0-9]+'"));

@@ -127,6 +127,8 @@ void GetGPUInfo(){
MACH3LOG_INFO("Total VRAM: {} MB", TerminalToString("nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits"));
// Print Driver Version
MACH3LOG_INFO("Driver Version: {}", TerminalToString("nvidia-smi --query-gpu=driver_version --format=csv,noheader"));
// Print number of GPU threads
MACH3LOG_INFO("Currently used GPU has: {} threads", GetNumGPUThreads());
#endif
return;
}
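
GetCPUInfo() and GetGPUInfo() above build their log lines by shelling out through TerminalToString(). That helper's implementation is not part of this diff; purely for orientation, here is a minimal popen-based sketch of what such a helper might look like (hypothetical — the real MaCh3 version may differ).

// Hypothetical sketch of a TerminalToString-style helper (the actual MaCh3
// implementation is not shown in this diff): run a shell command and return
// its trimmed stdout so it can be dropped into a single log line.
#include <array>
#include <cstdio>
#include <memory>
#include <string>

std::string TerminalToStringSketch(const std::string& cmd) {
  std::array<char, 256> buffer{};
  std::string result;
  // popen runs the command and exposes its stdout as a FILE*
  std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
  if (!pipe) return result;
  while (fgets(buffer.data(), static_cast<int>(buffer.size()), pipe.get()) != nullptr) {
    result += buffer.data();
  }
  // strip trailing newlines so the result logs cleanly
  while (!result.empty() && (result.back() == '\n' || result.back() == '\r')) {
    result.pop_back();
  }
  return result;
}
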
3 changes: 3 additions & 0 deletions manager/Monitor.h
@@ -19,6 +19,9 @@
#include "samplePDF/Structs.h"
#include "manager/YamlHelper.h"

#ifdef CUDA
#include "manager/gpuUtils.cuh"
#endif

namespace MaCh3Utils {
/// @brief KS: Prints welcome message with MaCh3 logo
57 changes: 16 additions & 41 deletions manager/gpuUtils.cu
@@ -1,34 +1,9 @@
// C i/o for printf and others
#include <stdio.h>
#include <vector>

// CUDA specifics

#include <cuda_runtime.h>

#ifdef CUDA_ERROR_CHECK
#include <helper_functions.h>
#include <helper_cuda.h>
#endif

// Define the macros
#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__)

/// KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now
#define _BlockSize_ 1024

// CUDA_ERROR_CHECK is now defined in the makefile instead
//#define CUDA_ERROR_CHECK

// **************************************************
// ERROR CHECKING ROUTINES
// Also exist in helper_cuda.h
// **************************************************
// MaCh3 includes
#include "manager/gpuUtils.cuh"

// **************************************************
/// @brief Check for a safe call on GPU
inline void __cudaSafeCall( cudaError err, const char *file, const int line ) {
// Check for a safe call on GPU
void __cudaSafeCall( cudaError err, const char *file, const int line ) {
// **************************************************
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err) {
@@ -40,8 +15,8 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) {
}

// **************************************************
/// @brief Check if there's been an error
inline void __cudaCheckError( const char *file, const int line ) {
// Check if there's been an error
void __cudaCheckError( const char *file, const int line ) {
// **************************************************
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
@@ -66,8 +41,8 @@ inline void __cudaCheckError( const char *file, const int line ) {
// *******************************************

// *******************************************
/// @brief KS: Get some fancy info about VRAM usage
inline void checkGpuMem() {
// KS: Get some fancy info about VRAM usage
void checkGpuMem() {
// *******************************************

float free_m, total_m,used_m;
@@ -84,8 +59,8 @@ inline void checkGpuMem() {
}

// *******************************************
/// @brief KS: Get some fancy info about GPU
inline void PrintNdevices() {
// KS: Get some fancy info about GPU
void PrintNdevices() {
// *******************************************

int nDevices;
@@ -102,8 +77,8 @@ inline void PrintNdevices() {


// *******************************************
/// @brief KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
inline void ResetDevice() {
// KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
void ResetDevice() {
// *******************************************

cudaDeviceReset();
@@ -113,7 +88,7 @@ inline void ResetDevice() {

// *******************************************
/// @brief Only useful if using multiple GPU
inline void SetDevice(const int deviceId) {
void SetDevice(const int deviceId) {
// *******************************************

// Check if the device ID is valid
@@ -131,8 +106,8 @@ inline void SetDevice(const int deviceId) {
}

// *******************************************
/// @brief Get number of GPU threads for currently used GPU
inline void GetNumGPUThreads(const int Device = 0) {
// Get number of GPU threads for currently used GPU
int GetNumGPUThreads(const int Device) {
// *******************************************

int deviceCount;
@@ -149,5 +124,5 @@ inline void GetNumGPUThreads(const int Device = 0) {
// Define the number of threads per block
int nThreadsBlocks = (deviceProp.multiProcessorCount * deviceProp.maxThreadsPerMultiProcessor);

printf("Currently used GPU has : %i threads \n", nThreadsBlocks);
return nThreadsBlocks;
}
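
GetNumGPUThreads() now returns the thread count (multiprocessor count times max threads per multiprocessor) instead of only printing it, so callers such as GetGPUInfo() can log or reuse the value. Below is a hedged sketch, not code from this commit, of how the returned value could be combined with the fixed _BlockSize_ from gpuUtils.cuh to cap a kernel launch; the kernel itself is made up for illustration.

// Illustrative sketch (not from this commit): cap the grid size using the
// value returned by GetNumGPUThreads() and the fixed _BlockSize_.
#include "manager/gpuUtils.cuh"

__global__ void ScaleKernel(float* data, int n) {
  // Grid-stride loop so the kernel stays correct even with a capped grid
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
    data[i] *= 2.0f;
  }
}

void ScaleOnGPU(float* d_data, int n) {
  if (n <= 0) return;
  const int hardwareThreads = GetNumGPUThreads();  // device 0 by default
  const int neededBlocks    = (n + _BlockSize_ - 1) / _BlockSize_;
  const int maxBlocks       = (hardwareThreads > _BlockSize_) ? hardwareThreads / _BlockSize_ : 1;
  const int nBlocks         = neededBlocks < maxBlocks ? neededBlocks : maxBlocks;
  ScaleKernel<<<nBlocks, _BlockSize_>>>(d_data, n);
  CudaCheckError();  // macro from gpuUtils.cuh, active when CUDA_ERROR_CHECK is defined
}
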
59 changes: 59 additions & 0 deletions manager/gpuUtils.cuh
@@ -0,0 +1,59 @@
#pragma once

// C i/o for printf and others
#include <stdio.h>
#include <vector>

// CUDA specifics

#include <cuda_runtime.h>

#ifdef CUDA_ERROR_CHECK
#include <helper_functions.h>
#include <helper_cuda.h>
#endif

// Define the macros
#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__)

/// KS: Need it for shared memory; there is a way to use dynamic shared memory but I am lazy right now
#define _BlockSize_ 1024

//KS: TODO
// There is plenty of useful stuff here https://github.com/NVIDIA/cuda-samples/blob/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp
// We might want to port some of these utilities, for example a bool for whether unified memory is available, etc.

// CUDA_ERROR_CHECK is now defined in the makefile instead
//#define CUDA_ERROR_CHECK

// **************************************************
// ERROR CHECKING ROUTINES
// Also exist in helper_cuda.h
// **************************************************

/// @brief Check for a safe call on GPU
void __cudaSafeCall( cudaError err, const char *file, const int line );

/// @brief Check if there's been an error
void __cudaCheckError( const char *file, const int line );

// *******************************************
// Utils
// *******************************************

// *******************************************
/// @brief KS: Get some fancy info about VRAM usage
void checkGpuMem();

/// @brief KS: Get some fancy info about GPU
void PrintNdevices();

/// @brief KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
void ResetDevice();

/// @brief KS: Only useful if using multiple GPU
void SetDevice(const int deviceId);

/// @brief KS: Get number of GPU threads for currently used GPU
int GetNumGPUThreads(const int Device = 0);
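
The new header makes these helpers and the CudaSafeCall/CudaCheckError macros available to any CUDA-aware translation unit that includes manager/gpuUtils.cuh. A short, hedged usage sketch follows; it is illustrative only and not code from this commit.

// Illustration only: typical use of the helpers and macros declared in
// manager/gpuUtils.cuh at the start of a GPU job.
#include "manager/gpuUtils.cuh"

void InspectAndWarmUpGPU() {
  PrintNdevices();   // how many CUDA devices are visible
  SetDevice(0);      // only matters on multi-GPU nodes
  checkGpuMem();     // VRAM usage summary

  // CudaSafeCall wraps runtime calls that return a cudaError_t
  float* d_scratch = nullptr;
  CudaSafeCall(cudaMalloc(&d_scratch, _BlockSize_ * sizeof(float)));
  CudaSafeCall(cudaFree(d_scratch));

  printf("Currently used GPU has: %i threads\n", GetNumGPUThreads());
}
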
1 change: 1 addition & 0 deletions mcmc/CMakeLists.txt
@@ -8,6 +8,7 @@ set(HEADERS
SampleSummary.h
MaCh3Factory.h
StatisticalUtils.h
gpuMCMCProcessorUtils.cuh
)

add_library(MCMC SHARED
37 changes: 0 additions & 37 deletions mcmc/MCMCProcessor.cpp
@@ -2,43 +2,6 @@

#include "TChain.h"

//Only if GPU is enabled
#ifdef CUDA
extern void InitGPU_AutoCorr(
float **ParStep_gpu,
float **NumeratorSum_gpu,
float **ParamSums_gpu,
float **DenomSum_gpu,
int n_Entries,
int n_Pars,
const int n_Lags);

extern void CopyToGPU_AutoCorr(
float *ParStep_cpu,
float *NumeratorSum_cpu,
float *ParamSums_cpu,
float *DenomSum_cpu,

float *ParStep_gpu,
float *NumeratorSum_gpu,
float *ParamSums_gpu,
float *DenomSum_gpu);

extern void RunGPU_AutoCorr(
float *ParStep_gpu,
float *ParamSums_gpu,
float *NumeratorSum_gpu,
float *DenomSum_gpu,
float *NumeratorSum_cpu,
float *DenomSum_cpu);

extern void CleanupGPU_AutoCorr(
float *ParStep_gpu,
float *NumeratorSum_gpu,
float *ParamSums_gpu,
float *DenomSum_gpu);
#endif

// ****************************
MCMCProcessor::MCMCProcessor(const std::string &InputFile, bool MakePostfitCorr) :
Chain(nullptr), StepCut(""), MakeCorr(MakePostfitCorr), MadePostfit(false) {
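
The block of extern declarations for the GPU autocorrelation helpers is removed from MCMCProcessor.cpp; the same functions are now declared in mcmc/gpuMCMCProcessorUtils.cuh and pulled in through MCMCProcessor.h (next file). For orientation, here is a schematic sketch of the init/copy/run/cleanup sequence those declarations imply; the actual MCMCProcessor driver code is not shown in this diff, and the wrapper function and its buffer handling below are illustrative only.

// Schematic only: the init -> copy -> run -> cleanup sequence implied by the
// declarations that moved to mcmc/gpuMCMCProcessorUtils.cuh.
#ifdef CUDA
#include "mcmc/gpuMCMCProcessorUtils.cuh"

void AutoCorrOnGPU_Sketch(float* ParStep_cpu, float* ParamSums_cpu,
                          float* NumeratorSum_cpu, float* DenomSum_cpu,
                          int nEntries, int nPars, int nLags) {
  float *ParStep_gpu = nullptr, *NumeratorSum_gpu = nullptr;
  float *ParamSums_gpu = nullptr, *DenomSum_gpu = nullptr;

  // Allocate device buffers sized for the chain
  InitGPU_AutoCorr(&ParStep_gpu, &NumeratorSum_gpu, &ParamSums_gpu, &DenomSum_gpu,
                   nEntries, nPars, nLags);

  // Move the chain and partial sums to the device
  CopyToGPU_AutoCorr(ParStep_cpu, NumeratorSum_cpu, ParamSums_cpu, DenomSum_cpu,
                     ParStep_gpu, NumeratorSum_gpu, ParamSums_gpu, DenomSum_gpu);

  // Evaluate the autocorrelation sums and copy the results back to the host
  RunGPU_AutoCorr(ParStep_gpu, ParamSums_gpu, NumeratorSum_gpu, DenomSum_gpu,
                  NumeratorSum_cpu, DenomSum_cpu);

  // Release the device buffers
  CleanupGPU_AutoCorr(ParStep_gpu, NumeratorSum_gpu, ParamSums_gpu, DenomSum_gpu);
}
#endif
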
5 changes: 5 additions & 0 deletions mcmc/MCMCProcessor.h
@@ -37,6 +37,11 @@
// MaCh3 includes
#include "mcmc/StatisticalUtils.h"

//Only if GPU is enabled
#ifdef CUDA
#include "mcmc/gpuMCMCProcessorUtils.cuh"
#endif

//KS: Joy of forward declaration https://gieseanw.wordpress.com/2018/02/25/the-joys-of-forward-declarations-results-from-the-real-world/
class TChain;

10 changes: 1 addition & 9 deletions mcmc/gpuMCMCProcessorUtils.cu
@@ -1,12 +1,4 @@
// MaCh3 utils for processing/diagnostic MCMC
// Written by Kamil Skwarczynski
//
// Contains code to run on CUDA GPUs. Right now only can calculate autocorrelations
// Potential extensions:
// -Covariance matrix calculations and other matrix operations
// -Effective Sample Size evaluation

#include "manager/gpuUtils.cu"
#include "mcmc/gpuMCMCProcessorUtils.cuh"

// ******************************************
// CONSTANTS
Expand Down