Merge pull request #77 from mach3-software/feature_CUDABenchamrk
CUDA Benchmark
KSkwarczynski authored Jul 19, 2024
2 parents f44d96d + 3401fb8 commit ef64244
Showing 15 changed files with 207 additions and 94 deletions.
2 changes: 2 additions & 0 deletions .mailmap
@@ -6,6 +6,8 @@ Edward Atkin <[email protected]> EdAtkin <[email protected]

# Kamil Skwarczynski
Kamil Skwarczynski <[email protected]> Kamil <[email protected]>
Kamil Skwarczynski <[email protected]> Kamil Skwarczynski <[email protected]>
Kamil Skwarczynski <[email protected]> Kamil Skwarczynski <[email protected]>

# Henry Wallace
Henry Wallace <[email protected]> henry-israel <[email protected]>
49 changes: 49 additions & 0 deletions cmake/Modules/CUDASamples.cmake
@@ -31,3 +31,52 @@ endif()

cmessage(STATUS "Using the following CUDA samples paths: ${CMAKE_CUDA_SAMPLES_PATH}")
target_include_directories(MaCh3CompilerOptions INTERFACE ${CMAKE_CUDA_SAMPLES_PATH})


# KS: Perform fancy CUDA Benchmarking
DefineEnabledRequiredSwitch(MaCh3_GPU_BENCHMARK FALSE)
if(MaCh3_GPU_BENCHMARK)
cmessage(STATUS "Building CUDA Benchmark")

# KS: Define directories to iterate over, might be useful to expand
set(CUDA_SAMPLES_DIRS
"deviceQuery"
"bandwidthTest"
)

# KS: Iterate over each directory
foreach(sample_dir ${CUDA_SAMPLES_DIRS})
# Define source and destination directories
set(SRC_DIR "${CMAKE_CUDA_SAMPLES_PATH}/../Samples/1_Utilities/${sample_dir}")
set(DST_DIR "${CMAKE_BINARY_DIR}/GPU_Benchmark/")

# CW: Copy over the provided nvidia utility
# CW: Often we can't write to the CUDA install directory, so let's build it here
file(COPY ${SRC_DIR} DESTINATION ${DST_DIR})

# KS: Change directory to copied sample
set(SAMPLE_DIR "${CMAKE_BINARY_DIR}/GPU_Benchmark/${sample_dir}")

# Modify Makefile path
set(MAKEFILE_PATH "${SAMPLE_DIR}/Makefile")

# CW: Patch the little hard-coded NVIDIA makefile
execute_process(
COMMAND sed -i "s,../../../Common,${CMAKE_CUDA_SAMPLES_PATH},g" ${MAKEFILE_PATH}
RESULT_VARIABLE SED_RESULT
)

# Add custom target to run make
add_custom_target(run_${sample_dir} ALL
COMMAND make
WORKING_DIRECTORY ${SAMPLE_DIR}
)

# Add custom target to run sample
add_custom_target(run_${sample_dir}_exec ALL
COMMAND ./${sample_dir}
WORKING_DIRECTORY ${SAMPLE_DIR}
DEPENDS run_${sample_dir}
)
endforeach(sample_dir)
endif()
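
The benchmark is off by default and is enabled through the new MaCh3_GPU_BENCHMARK switch (e.g. -DMaCh3_GPU_BENCHMARK=ON at configure time); it copies NVIDIA's deviceQuery and bandwidthTest samples into the build tree, patches their Makefiles, and builds and runs them. As a rough illustration of the kind of information deviceQuery reports, the minimal CUDA sketch below queries a few device properties directly; it is not part of this commit and prints only a small subset of what the real sample shows.

// Minimal sketch (not part of this commit): query the kind of device
// properties that NVIDIA's deviceQuery sample reports.
#include <cstdio>
#include <cuda_runtime.h>

int main() {
  int nDevices = 0;
  if (cudaGetDeviceCount(&nDevices) != cudaSuccess || nDevices == 0) {
    printf("No CUDA device found\n");
    return 1;
  }
  for (int i = 0; i < nDevices; ++i) {
    cudaDeviceProp prop;
    cudaGetDeviceProperties(&prop, i);
    printf("Device %d: %s\n", i, prop.name);
    printf("  Compute capability : %d.%d\n", prop.major, prop.minor);
    printf("  Multiprocessors    : %d\n", prop.multiProcessorCount);
    printf("  Global memory      : %.1f GB\n", prop.totalGlobalMem / 1.0e9);
  }
  return 0;
}
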
1 change: 1 addition & 0 deletions manager/CMakeLists.txt
@@ -5,6 +5,7 @@ set(HEADERS
MaCh3Logger.h
Monitor.h
MaCh3Exception.h
gpuUtils.cuh
)

add_library(Manager SHARED
10 changes: 6 additions & 4 deletions manager/Monitor.cpp
@@ -101,10 +101,10 @@ void GetCPUInfo(){
MACH3LOG_INFO("{}", TerminalToString("cat /proc/cpuinfo | grep -m 1 MHz"));
//KS: Below code is convoluted because I mostly work on English-based Linux but sometimes on Polish-based Linux; this ensures it works on both. We can add support for other languages if needed
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i Archit"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L1d'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L1i'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L2'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L3'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L1d |L1d:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L1i |L1i:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L2 |L2:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L3 |L3:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'Thread.* per core:|Wątków na rdzeń:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E '^CPU(:|\\(s\\)):?\\s+[0-9]+'"));

@@ -127,6 +127,8 @@ void GetGPUInfo(){
MACH3LOG_INFO("Total VRAM: {} MB", TerminalToString("nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits"));
// Print Driver Version
MACH3LOG_INFO("Driver Version: {}", TerminalToString("nvidia-smi --query-gpu=driver_version --format=csv,noheader"));
// Print number of GPU threads
MACH3LOG_INFO("Currently used GPU has: {} threads", GetNumGPUThreads());
#endif
return;
}
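
GetCPUInfo() and GetGPUInfo() above build their log lines by shelling out through TerminalToString(). That helper's implementation is not part of this diff; purely for orientation, here is a minimal popen-based sketch of what such a helper might look like (hypothetical — the real MaCh3 version may differ).

// Hypothetical sketch of a TerminalToString-style helper (the actual MaCh3
// implementation is not shown in this diff): run a shell command and return
// its trimmed stdout so it can be dropped into a single log line.
#include <array>
#include <cstdio>
#include <memory>
#include <string>

std::string TerminalToStringSketch(const std::string& cmd) {
  std::array<char, 256> buffer{};
  std::string result;
  // popen runs the command and exposes its stdout as a FILE*
  std::unique_ptr<FILE, decltype(&pclose)> pipe(popen(cmd.c_str(), "r"), pclose);
  if (!pipe) return result;
  while (fgets(buffer.data(), static_cast<int>(buffer.size()), pipe.get()) != nullptr) {
    result += buffer.data();
  }
  // strip trailing newlines so the result logs cleanly
  while (!result.empty() && (result.back() == '\n' || result.back() == '\r')) {
    result.pop_back();
  }
  return result;
}
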
3 changes: 3 additions & 0 deletions manager/Monitor.h
@@ -19,6 +19,9 @@
#include "samplePDF/Structs.h"
#include "manager/YamlHelper.h"

#ifdef CUDA
#include "manager/gpuUtils.cuh"
#endif

namespace MaCh3Utils {
/// @brief KS: Prints welcome message with MaCh3 logo
57 changes: 16 additions & 41 deletions manager/gpuUtils.cu
@@ -1,34 +1,9 @@
// C i/o for printf and others
#include <stdio.h>
#include <vector>

// CUDA specifics

#include <cuda_runtime.h>

#ifdef CUDA_ERROR_CHECK
#include <helper_functions.h>
#include <helper_cuda.h>
#endif

// Define the macros
#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__)

/// KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now
#define _BlockSize_ 1024

// CUDA_ERROR_CHECK is now defined in the makefile instead
//#define CUDA_ERROR_CHECK

// **************************************************
// ERROR CHECKING ROUTINES
// Also exist in helper_cuda.h
// **************************************************
// MaCh3 includes
#include "manager/gpuUtils.cuh"

// **************************************************
/// @brief Check for a safe call on GPU
inline void __cudaSafeCall( cudaError err, const char *file, const int line ) {
// Check for a safe call on GPU
void __cudaSafeCall( cudaError err, const char *file, const int line ) {
// **************************************************
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err) {
@@ -40,8 +15,8 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) {
}

// **************************************************
/// @brief Check if there's been an error
inline void __cudaCheckError( const char *file, const int line ) {
// Check if there's been an error
void __cudaCheckError( const char *file, const int line ) {
// **************************************************
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
@@ -66,8 +41,8 @@ inline void __cudaCheckError( const char *file, const int line ) {
// *******************************************

// *******************************************
/// @brief KS: Get some fancy info about VRAM usage
inline void checkGpuMem() {
// KS: Get some fancy info about VRAM usage
void checkGpuMem() {
// *******************************************

float free_m, total_m,used_m;
@@ -84,8 +59,8 @@ inline void checkGpuMem() {
}

// *******************************************
/// @brief KS: Get some fancy info about GPU
inline void PrintNdevices() {
// KS: Get some fancy info about GPU
void PrintNdevices() {
// *******************************************

int nDevices;
@@ -102,8 +77,8 @@ inline void PrintNdevices() {


// *******************************************
/// @brief KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
inline void ResetDevice() {
// KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
void ResetDevice() {
// *******************************************

cudaDeviceReset();
@@ -113,7 +88,7 @@ inline void ResetDevice() {

// *******************************************
/// @brief Only useful if using multiple GPU
inline void SetDevice(const int deviceId) {
void SetDevice(const int deviceId) {
// *******************************************

// Check if the device ID is valid
@@ -131,8 +106,8 @@ inline void SetDevice(const int deviceId) {
}

// *******************************************
/// @brief Get number of GPU threads for currently used GPU
inline void GetNumGPUThreads(const int Device = 0) {
// Get number of GPU threads for currently used GPU
int GetNumGPUThreads(const int Device) {
// *******************************************

int deviceCount;
@@ -149,5 +124,5 @@ inline void GetNumGPUThreads(const int Device = 0) {
// Define the number of threads per block
int nThreadsBlocks = (deviceProp.multiProcessorCount * deviceProp.maxThreadsPerMultiProcessor);

printf("Currently used GPU has : %i threads \n", nThreadsBlocks);
return nThreadsBlocks;
}
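
GetNumGPUThreads() now returns the thread count (multiprocessor count times max threads per multiprocessor) instead of only printing it, so callers such as GetGPUInfo() can log or reuse the value. Below is a hedged sketch, not code from this commit, of how the returned value could be combined with the fixed _BlockSize_ from gpuUtils.cuh to cap a kernel launch; the kernel itself is made up for illustration.

// Illustrative sketch (not from this commit): cap the grid size using the
// value returned by GetNumGPUThreads() and the fixed _BlockSize_.
#include "manager/gpuUtils.cuh"

__global__ void ScaleKernel(float* data, int n) {
  // Grid-stride loop so the kernel stays correct even with a capped grid
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += blockDim.x * gridDim.x) {
    data[i] *= 2.0f;
  }
}

void ScaleOnGPU(float* d_data, int n) {
  if (n <= 0) return;
  const int hardwareThreads = GetNumGPUThreads();  // device 0 by default
  const int neededBlocks    = (n + _BlockSize_ - 1) / _BlockSize_;
  const int maxBlocks       = (hardwareThreads > _BlockSize_) ? hardwareThreads / _BlockSize_ : 1;
  const int nBlocks         = neededBlocks < maxBlocks ? neededBlocks : maxBlocks;
  ScaleKernel<<<nBlocks, _BlockSize_>>>(d_data, n);
  CudaCheckError();  // macro from gpuUtils.cuh, active when CUDA_ERROR_CHECK is defined
}
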
59 changes: 59 additions & 0 deletions manager/gpuUtils.cuh
@@ -0,0 +1,59 @@
#pragma once

// C i/o for printf and others
#include <stdio.h>
#include <vector>

// CUDA specifics

#include <cuda_runtime.h>

#ifdef CUDA_ERROR_CHECK
#include <helper_functions.h>
#include <helper_cuda.h>
#endif

// Define the macros
#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__)

/// KS: Need it for shared memory; there is a way to use dynamic shared memory but I am lazy right now
#define _BlockSize_ 1024

//KS: TODO
// There is plenty of useful stuff here https://github.com/NVIDIA/cuda-samples/blob/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp
// We might want to port some of these utilities, for example a bool for whether unified memory is available, etc.

// CUDA_ERROR_CHECK is now defined in the makefile instead
//#define CUDA_ERROR_CHECK

// **************************************************
// ERROR CHECKING ROUTINES
// Also exist in helper_cuda.h
// **************************************************

/// @brief Check for a safe call on GPU
void __cudaSafeCall( cudaError err, const char *file, const int line );

/// @brief Check if there's been an error
void __cudaCheckError( const char *file, const int line );

// *******************************************
// Utils
// *******************************************

// *******************************************
/// @brief KS: Get some fancy info about VRAM usage
void checkGpuMem();

/// @brief KS: Get some fancy info about GPU
void PrintNdevices();

/// @brief KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
void ResetDevice();

/// @brief KS: Only useful if using multiple GPU
void SetDevice(const int deviceId);

/// @brief KS: Get number of GPU threads for currently used GPU
int GetNumGPUThreads(const int Device = 0);
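
The new header makes these helpers and the CudaSafeCall/CudaCheckError macros available to any CUDA-aware translation unit that includes manager/gpuUtils.cuh. A short, hedged usage sketch follows; it is illustrative only and not code from this commit.

// Illustration only: typical use of the helpers and macros declared in
// manager/gpuUtils.cuh at the start of a GPU job.
#include "manager/gpuUtils.cuh"

void InspectAndWarmUpGPU() {
  PrintNdevices();   // how many CUDA devices are visible
  SetDevice(0);      // only matters on multi-GPU nodes
  checkGpuMem();     // VRAM usage summary

  // CudaSafeCall wraps runtime calls that return a cudaError_t
  float* d_scratch = nullptr;
  CudaSafeCall(cudaMalloc(&d_scratch, _BlockSize_ * sizeof(float)));
  CudaSafeCall(cudaFree(d_scratch));

  printf("Currently used GPU has: %i threads\n", GetNumGPUThreads());
}
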
1 change: 1 addition & 0 deletions mcmc/CMakeLists.txt
@@ -8,6 +8,7 @@ set(HEADERS
SampleSummary.h
MaCh3Factory.h
StatisticalUtils.h
gpuMCMCProcessorUtils.cuh
)

add_library(MCMC SHARED
37 changes: 0 additions & 37 deletions mcmc/MCMCProcessor.cpp
@@ -2,43 +2,6 @@

#include "TChain.h"

//Only if GPU is enabled
#ifdef CUDA
extern void InitGPU_AutoCorr(
float **ParStep_gpu,
float **NumeratorSum_gpu,
float **ParamSums_gpu,
float **DenomSum_gpu,
int n_Entries,
int n_Pars,
const int n_Lags);

extern void CopyToGPU_AutoCorr(
float *ParStep_cpu,
float *NumeratorSum_cpu,
float *ParamSums_cpu,
float *DenomSum_cpu,

float *ParStep_gpu,
float *NumeratorSum_gpu,
float *ParamSums_gpu,
float *DenomSum_gpu);

extern void RunGPU_AutoCorr(
float *ParStep_gpu,
float *ParamSums_gpu,
float *NumeratorSum_gpu,
float *DenomSum_gpu,
float *NumeratorSum_cpu,
float *DenomSum_cpu);

extern void CleanupGPU_AutoCorr(
float *ParStep_gpu,
float *NumeratorSum_gpu,
float *ParamSums_gpu,
float *DenomSum_gpu);
#endif

// ****************************
MCMCProcessor::MCMCProcessor(const std::string &InputFile, bool MakePostfitCorr) :
Chain(nullptr), StepCut(""), MakeCorr(MakePostfitCorr), MadePostfit(false) {
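
The block of extern declarations for the GPU autocorrelation helpers is removed from MCMCProcessor.cpp; the same functions are now declared in mcmc/gpuMCMCProcessorUtils.cuh and pulled in through MCMCProcessor.h (next file). For orientation, here is a schematic sketch of the init/copy/run/cleanup sequence those declarations imply; the actual MCMCProcessor driver code is not shown in this diff, and the wrapper function and its buffer handling below are illustrative only.

// Schematic only: the init -> copy -> run -> cleanup sequence implied by the
// declarations that moved to mcmc/gpuMCMCProcessorUtils.cuh.
#ifdef CUDA
#include "mcmc/gpuMCMCProcessorUtils.cuh"

void AutoCorrOnGPU_Sketch(float* ParStep_cpu, float* ParamSums_cpu,
                          float* NumeratorSum_cpu, float* DenomSum_cpu,
                          int nEntries, int nPars, int nLags) {
  float *ParStep_gpu = nullptr, *NumeratorSum_gpu = nullptr;
  float *ParamSums_gpu = nullptr, *DenomSum_gpu = nullptr;

  // Allocate device buffers sized for the chain
  InitGPU_AutoCorr(&ParStep_gpu, &NumeratorSum_gpu, &ParamSums_gpu, &DenomSum_gpu,
                   nEntries, nPars, nLags);

  // Move the chain and partial sums to the device
  CopyToGPU_AutoCorr(ParStep_cpu, NumeratorSum_cpu, ParamSums_cpu, DenomSum_cpu,
                     ParStep_gpu, NumeratorSum_gpu, ParamSums_gpu, DenomSum_gpu);

  // Evaluate the autocorrelation sums and copy the results back to the host
  RunGPU_AutoCorr(ParStep_gpu, ParamSums_gpu, NumeratorSum_gpu, DenomSum_gpu,
                  NumeratorSum_cpu, DenomSum_cpu);

  // Release the device buffers
  CleanupGPU_AutoCorr(ParStep_gpu, NumeratorSum_gpu, ParamSums_gpu, DenomSum_gpu);
}
#endif
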
5 changes: 5 additions & 0 deletions mcmc/MCMCProcessor.h
@@ -37,6 +37,11 @@
// MaCh3 includes
#include "mcmc/StatisticalUtils.h"

//Only if GPU is enabled
#ifdef CUDA
#include "mcmc/gpuMCMCProcessorUtils.cuh"
#endif

//KS: Joy of forward declaration https://gieseanw.wordpress.com/2018/02/25/the-joys-of-forward-declarations-results-from-the-real-world/
class TChain;

10 changes: 1 addition & 9 deletions mcmc/gpuMCMCProcessorUtils.cu
@@ -1,12 +1,4 @@
// MaCh3 utils for processing/diagnostic MCMC
// Written by Kamil Skwarczynski
//
// Contains code to run on CUDA GPUs. Right now only can calculate autocorrelations
// Potential extensions:
// -Covariance matrix calculations and other matrix operations
// -Effective Sample Size evaluation

#include "manager/gpuUtils.cu"
#include "mcmc/gpuMCMCProcessorUtils.cuh"

// ******************************************
// CONSTANTS
Expand Down