diff --git a/.mailmap b/.mailmap
index c6295118..e1fe3d4c 100644
--- a/.mailmap
+++ b/.mailmap
@@ -6,6 +6,8 @@ Edward Atkin <edhuthwaite@aol.com> EdAtkin <35494466+EdAtkin@users.noreply.githu
 
 # Kamil Skwarczynski
 Kamil Skwarczynski <skwarczynskikamil@gmail.com> Kamil <45295406+KSkwarczynski@users.noreply.github.com>
+Kamil Skwarczynski <skwarczynskikamil@gmail.com> Kamil Skwarczynski <kamilskw@bg12105.int.ets1.calculquebec.ca>
+Kamil Skwarczynski <skwarczynskikamil@gmail.com> Kamil Skwarczynski <kamilskw@cdr293.int.cedar.computecanada.ca>
 
 # Henry Wallace
 Henry Wallace <henryisrael08@gmail.com> henry-israel <henryisrael08@gmail.com>
diff --git a/cmake/Modules/CUDASamples.cmake b/cmake/Modules/CUDASamples.cmake
index 38923618..6729d546 100644
--- a/cmake/Modules/CUDASamples.cmake
+++ b/cmake/Modules/CUDASamples.cmake
@@ -31,3 +31,52 @@ endif()
 
 cmessage(STATUS "Using the following CUDA samples paths: ${CMAKE_CUDA_SAMPLES_PATH}")
 target_include_directories(MaCh3CompilerOptions INTERFACE ${CMAKE_CUDA_SAMPLES_PATH})
+
+
+# KS: Perform fancy CUDA Benchmarking
+DefineEnabledRequiredSwitch(MaCh3_GPU_BENCHMARK FALSE)
+if(MaCh3_GPU_BENCHMARK)
+    cmessage(STATUS "Building CUDA Benchmark")
+
+    # KS: Define directories to iterate over, might be useful to expand
+    set(CUDA_SAMPLES_DIRS
+        "deviceQuery"
+        "bandwidthTest"
+    )
+
+    # KS: Iterate over each directory
+    foreach(sample_dir ${CUDA_SAMPLES_DIRS})
+        # Define source and destination directories
+        set(SRC_DIR "${CMAKE_CUDA_SAMPLES_PATH}/../Samples/1_Utilities/${sample_dir}")
+        set(DST_DIR "${CMAKE_BINARY_DIR}/GPU_Benchmark/")
+
+        # CW: Copy over the provided nvidia utility
+        # CW: Often we can't write to the CUDA install directory, so let's build it here
+        file(COPY ${SRC_DIR} DESTINATION ${DST_DIR})
+
+        # KS: Change directory to copied sample
+        set(SAMPLE_DIR "${CMAKE_BINARY_DIR}/GPU_Benchmark/${sample_dir}")
+
+        # Modify Makefile path
+        set(MAKEFILE_PATH "${SAMPLE_DIR}/Makefile")
+
+        # CW: Patch the litle hard-coded NVIDIA makefile
+        execute_process(
+            COMMAND sed -i "s,../../../Common,${CMAKE_CUDA_SAMPLES_PATH},g" ${MAKEFILE_PATH}
+            RESULT_VARIABLE SED_RESULT
+        )
+
+        # Add custom target to run make
+        add_custom_target(run_${sample_dir} ALL
+            COMMAND make
+            WORKING_DIRECTORY ${SAMPLE_DIR}
+        )
+
+        # Add custom target to run sample
+        add_custom_target(run_${sample_dir}_exec ALL
+            COMMAND ./${sample_dir}
+            WORKING_DIRECTORY ${SAMPLE_DIR}
+            DEPENDS run_${sample_dir}
+        )
+    endforeach(sample_dir)
+endif()
diff --git a/manager/CMakeLists.txt b/manager/CMakeLists.txt
index 31e30823..b612c3de 100644
--- a/manager/CMakeLists.txt
+++ b/manager/CMakeLists.txt
@@ -5,6 +5,7 @@ set(HEADERS
     MaCh3Logger.h
     Monitor.h
     MaCh3Exception.h
+    gpuUtils.cuh
 )
 
 add_library(Manager SHARED
diff --git a/manager/Monitor.cpp b/manager/Monitor.cpp
index 282d84a4..6fc17574 100644
--- a/manager/Monitor.cpp
+++ b/manager/Monitor.cpp
@@ -101,10 +101,10 @@ void GetCPUInfo(){
   MACH3LOG_INFO("{}", TerminalToString("cat /proc/cpuinfo | grep -m 1 MHz"));
   //KS: Below code is convoluted because I mostly work on English based Linux but sometimes on Polish based Linux, this ensures it works on both. We can add support for other languages if needed
   MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i Archit"));
-  MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L1d'"));
-  MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L1i'"));
-  MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L2'"));
-  MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L3'"));
+  MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L1d |L1d:'"));
+  MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L1i |L1i:'"));
+  MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L2 |L2:'"));
+  MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L3 |L3:'"));
   MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'Thread.* per core:|Wątków na rdzeń:'"));
   MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E '^CPU(:|\\(s\\)):?\\s+[0-9]+'"));
 
@@ -127,6 +127,8 @@ void GetGPUInfo(){
   MACH3LOG_INFO("Total VRAM: {} MB", TerminalToString("nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits"));
   // Print Driver Version
   MACH3LOG_INFO("Driver Version: {}", TerminalToString("nvidia-smi --query-gpu=driver_version --format=csv,noheader"));
+  // Print N GPU thread
+  MACH3LOG_INFO("Currently used GPU has: {} threads", GetNumGPUThreads());
 #endif
   return;
 }
diff --git a/manager/Monitor.h b/manager/Monitor.h
index 1ef82e07..06def70b 100644
--- a/manager/Monitor.h
+++ b/manager/Monitor.h
@@ -19,6 +19,9 @@
 #include "samplePDF/Structs.h"
 #include "manager/YamlHelper.h"
 
+#ifdef CUDA
+#include "manager/gpuUtils.cuh"
+#endif
 
 namespace MaCh3Utils {
   /// @brief KS: Prints welcome message with MaCh3 logo
diff --git a/manager/gpuUtils.cu b/manager/gpuUtils.cu
index 6fc7d2af..f6e34d60 100644
--- a/manager/gpuUtils.cu
+++ b/manager/gpuUtils.cu
@@ -1,34 +1,9 @@
-// C i/o  for printf and others
-#include <stdio.h>
-#include <vector>
-
-// CUDA specifics
-
-#include <cuda_runtime.h>
-
-#ifdef CUDA_ERROR_CHECK
-#include <helper_functions.h>
-#include <helper_cuda.h>
-#endif
-
-// Define the macros
-#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
-#define CudaCheckError()  __cudaCheckError(__FILE__, __LINE__)
-
-/// KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now
-#define _BlockSize_ 1024
-
-// CUDA_ERROR_CHECK is now defined in the makefile instead
-//#define CUDA_ERROR_CHECK
-
-// **************************************************
-//             ERROR CHECKING ROUTINES
-// Also exist in helper_cuda.h
-// **************************************************
+// MaCh3 includes
+#include "manager/gpuUtils.cuh"
 
 // **************************************************
-/// @brief Check for a safe call on GPU
-inline void __cudaSafeCall( cudaError err, const char *file, const int line ) {
+// Check for a safe call on GPU
+void __cudaSafeCall( cudaError err, const char *file, const int line ) {
 // **************************************************
 #ifdef CUDA_ERROR_CHECK
   if (cudaSuccess != err) {
@@ -40,8 +15,8 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) {
 }
 
 // **************************************************
-/// @brief Check if there's been an error
-inline void __cudaCheckError( const char *file, const int line ) {
+// Check if there's been an error
+void __cudaCheckError( const char *file, const int line ) {
 // **************************************************
 #ifdef CUDA_ERROR_CHECK
   cudaError err = cudaGetLastError();
@@ -66,8 +41,8 @@ inline void __cudaCheckError( const char *file, const int line ) {
 // *******************************************
 
 // *******************************************
-/// @brief KS: Get some fancy info about VRAM usage
-inline void checkGpuMem() {
+// KS: Get some fancy info about VRAM usage
+void checkGpuMem() {
 // *******************************************
 
   float free_m, total_m,used_m;
@@ -84,8 +59,8 @@ inline void checkGpuMem() {
 }
 
 // *******************************************
-/// @brief KS: Get some fancy info about GPU
-inline void PrintNdevices() {
+// KS: Get some fancy info about GPU
+void PrintNdevices() {
 // *******************************************
 
   int nDevices;
@@ -102,8 +77,8 @@ inline void PrintNdevices() {
 
 
 // *******************************************
-/// @brief KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
-inline void ResetDevice() {
+// KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
+void ResetDevice() {
 // *******************************************
 
   cudaDeviceReset();
@@ -113,7 +88,7 @@ inline void ResetDevice() {
 
 // *******************************************
 /// @brief Only useful if using multiple GPU
-inline void SetDevice(const int deviceId) {
+void SetDevice(const int deviceId) {
 // *******************************************
 
   // Check if the device ID is valid
@@ -131,8 +106,8 @@ inline void SetDevice(const int deviceId) {
 }
 
 // *******************************************
-/// @brief Get number of GPU threads for currently used GPU
-inline void GetNumGPUThreads(const int Device = 0) {
+// Get number of GPU threads for currently used GPU
+int GetNumGPUThreads(const int Device) {
 // *******************************************
 
   int deviceCount;
@@ -149,5 +124,5 @@ inline void GetNumGPUThreads(const int Device = 0) {
   // Define the number of threads per block
   int nThreadsBlocks = (deviceProp.multiProcessorCount * deviceProp.maxThreadsPerMultiProcessor);
 
-  printf("Currently used GPU has : %i threads \n", nThreadsBlocks);
+  return nThreadsBlocks;
 }
diff --git a/manager/gpuUtils.cuh b/manager/gpuUtils.cuh
new file mode 100644
index 00000000..c9d7a341
--- /dev/null
+++ b/manager/gpuUtils.cuh
@@ -0,0 +1,59 @@
+#pragma once
+
+// C i/o  for printf and others
+#include <stdio.h>
+#include <vector>
+
+// CUDA specifics
+
+#include <cuda_runtime.h>
+
+#ifdef CUDA_ERROR_CHECK
+#include <helper_functions.h>
+#include <helper_cuda.h>
+#endif
+
+// Define the macros
+#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
+#define CudaCheckError()  __cudaCheckError(__FILE__, __LINE__)
+
+/// KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now
+#define _BlockSize_ 1024
+
+//KS: TODO
+// There is plenty of useful stuff here https://github.com/NVIDIA/cuda-samples/blob/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp
+// We might want to port some of these utilities, for example having bool if there is unified memory etc.
+
+// CUDA_ERROR_CHECK is now defined in the makefile instead
+//#define CUDA_ERROR_CHECK
+
+// **************************************************
+//             ERROR CHECKING ROUTINES
+// Also exist in helper_cuda.h
+// **************************************************
+
+/// @brief Check for a safe call on GPU
+void __cudaSafeCall( cudaError err, const char *file, const int line );
+
+/// @brief Check if there's been an error
+void __cudaCheckError( const char *file, const int line );
+
+// *******************************************
+//              Utils
+// *******************************************
+
+// *******************************************
+/// @brief KS: Get some fancy info about VRAM usage
+void checkGpuMem();
+
+/// @brief KS: Get some fancy info about GPU
+void PrintNdevices();
+
+/// @brief KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
+void ResetDevice();
+
+/// @brief KS: Only useful if using multiple GPU
+void SetDevice(const int deviceId);
+
+/// @brief KS: Get number of GPU threads for currently used GPU
+int GetNumGPUThreads(const int Device = 0);
diff --git a/mcmc/CMakeLists.txt b/mcmc/CMakeLists.txt
index a981119a..493b9042 100644
--- a/mcmc/CMakeLists.txt
+++ b/mcmc/CMakeLists.txt
@@ -8,6 +8,7 @@ set(HEADERS
     SampleSummary.h
     MaCh3Factory.h
     StatisticalUtils.h
+    gpuMCMCProcessorUtils.cuh
 )
 
 add_library(MCMC SHARED
diff --git a/mcmc/MCMCProcessor.cpp b/mcmc/MCMCProcessor.cpp
index 81e0ad5b..b7f270c2 100644
--- a/mcmc/MCMCProcessor.cpp
+++ b/mcmc/MCMCProcessor.cpp
@@ -2,43 +2,6 @@
 
 #include "TChain.h"
 
-//Only if GPU is enabled
-#ifdef CUDA
-extern void InitGPU_AutoCorr(
-    float **ParStep_gpu,
-    float **NumeratorSum_gpu,
-    float **ParamSums_gpu,
-    float **DenomSum_gpu,
-    int n_Entries,
-    int n_Pars,
-    const int n_Lags);
-
-extern void CopyToGPU_AutoCorr(
-    float *ParStep_cpu,
-    float *NumeratorSum_cpu,
-    float *ParamSums_cpu,
-    float *DenomSum_cpu,
-
-    float *ParStep_gpu,
-    float *NumeratorSum_gpu,
-    float *ParamSums_gpu,
-    float *DenomSum_gpu);
-
-extern void RunGPU_AutoCorr(
-    float *ParStep_gpu,
-    float *ParamSums_gpu,
-    float *NumeratorSum_gpu,
-    float *DenomSum_gpu,
-    float *NumeratorSum_cpu,
-    float *DenomSum_cpu);
-
-extern void CleanupGPU_AutoCorr(
-    float *ParStep_gpu,
-    float *NumeratorSum_gpu,
-    float *ParamSums_gpu,
-    float *DenomSum_gpu);
-#endif
-
 // ****************************
 MCMCProcessor::MCMCProcessor(const std::string &InputFile, bool MakePostfitCorr) : 
   Chain(nullptr), StepCut(""), MakeCorr(MakePostfitCorr), MadePostfit(false) {
diff --git a/mcmc/MCMCProcessor.h b/mcmc/MCMCProcessor.h
index 1aeffb20..852a4c7a 100644
--- a/mcmc/MCMCProcessor.h
+++ b/mcmc/MCMCProcessor.h
@@ -37,6 +37,11 @@
 // MaCh3 includes
 #include "mcmc/StatisticalUtils.h"
 
+//Only if GPU is enabled
+#ifdef CUDA
+#include "mcmc/gpuMCMCProcessorUtils.cuh"
+#endif
+
 //KS: Joy of forward declaration https://gieseanw.wordpress.com/2018/02/25/the-joys-of-forward-declarations-results-from-the-real-world/
 class TChain;
 
diff --git a/mcmc/gpuMCMCProcessorUtils.cu b/mcmc/gpuMCMCProcessorUtils.cu
index d7023d10..53e7c39b 100644
--- a/mcmc/gpuMCMCProcessorUtils.cu
+++ b/mcmc/gpuMCMCProcessorUtils.cu
@@ -1,12 +1,4 @@
-// MaCh3 utils for processing/diagnostic MCMC
-// Written by Kamil Skwarczynski
-//
-// Contains code to run on CUDA GPUs. Right now only can calculate autocorrelations
-// Potential extensions:
-// -Covariance matrix calculations and other matrix operations
-// -Effective Sample Size evaluation
-
-#include "manager/gpuUtils.cu"
+#include "mcmc/gpuMCMCProcessorUtils.cuh"
 
 // ******************************************
 // CONSTANTS
diff --git a/mcmc/gpuMCMCProcessorUtils.cuh b/mcmc/gpuMCMCProcessorUtils.cuh
new file mode 100644
index 00000000..5c75f894
--- /dev/null
+++ b/mcmc/gpuMCMCProcessorUtils.cuh
@@ -0,0 +1,61 @@
+#pragma once
+// MaCh3 utils for processing/diagnostic MCMC
+// Written by Kamil Skwarczynski
+//
+// Contains code to run on CUDA GPUs. Right now only can calculate autocorrelations
+// Potential extensions:
+// -Covariance matrix calculations and other matrix operations
+// -Effective Sample Size evaluation
+
+#include "manager/gpuUtils.cuh"
+
+// *******************************************
+//              INITIALISE GPU
+// *******************************************
+
+/// @brief KS: Initialiser, here we allocate memory for variables and copy constants
+__host__ void InitGPU_AutoCorr(
+                          float **ParStep_gpu,
+                          float **NumeratorSum_gpu,
+                          float **ParamSums_gpu,
+                          float **DenomSum_gpu,
+
+                          int n_Entries,
+                          int n_Pars,
+                          const int n_Lags);
+
+/// @brief KS: Copy necessary variables from CPU to GPU
+__host__ void CopyToGPU_AutoCorr(
+                            float *ParStep_cpu,
+                            float *NumeratorSum_cpu,
+                            float *ParamSums_cpu,
+                            float *DenomSum_cpu,
+
+                            float *ParStep_gpu,
+                            float *NumeratorSum_gpu,
+                            float *ParamSums_gpu,
+                            float *DenomSum_gpu);
+
+
+/// @brief Eval autocorrelations based on Box and Jenkins
+__global__ void EvalOnGPU_AutoCorr(
+    const float* __restrict__ ParStep_gpu,
+    const float* __restrict__ ParamSums_gpu,
+    float*  NumeratorSum_gpu,
+    float*  DenomSum_gpu);
+
+/// @brief KS: This call the main kernel responsible for calculating LagL and later copy results back to CPU
+__host__ void RunGPU_AutoCorr(
+    float*  ParStep_gpu,
+    float*  ParamSums_gpu,
+    float*  NumeratorSum_gpu,
+    float*  DenomSum_gpu,
+    float*  NumeratorSum_cpu,
+    float*  DenomSum_cpu);
+
+/// @brief KS: free memory on gpu
+__host__ void CleanupGPU_AutoCorr(
+    float *ParStep_gpu,
+    float *NumeratorSum_gpu,
+    float *ParamSums_gpu,
+    float *DenomSum_gpu);
diff --git a/samplePDF/kdeGpu.cu b/samplePDF/kdeGpu.cu
index d76b79ce..81ed665d 100644
--- a/samplePDF/kdeGpu.cu
+++ b/samplePDF/kdeGpu.cu
@@ -1,5 +1,5 @@
 //MaCh3 included
-#include "manager/gpuUtils.cu"
+#include "manager/gpuUtils.cuh"
 
 // -*- c++ -*-
 #include "stdio.h"
diff --git a/samplePDF/kdeGpu2.cu b/samplePDF/kdeGpu2.cu
index e904f52d..55bf3e18 100644
--- a/samplePDF/kdeGpu2.cu
+++ b/samplePDF/kdeGpu2.cu
@@ -4,7 +4,7 @@
 #define MULTI_DATA 10
 
 //MaCh3 included
-#include "manager/gpuUtils.cu"
+#include "manager/gpuUtils.cuh"
 
 
 #include "stdio.h"
diff --git a/splines/gpuSplineUtils.cu b/splines/gpuSplineUtils.cu
index 7ab0043c..e34e3812 100644
--- a/splines/gpuSplineUtils.cu
+++ b/splines/gpuSplineUtils.cu
@@ -6,7 +6,7 @@
 // Called from samplePDF/samplePDFND.cpp -> splines/SplineMonolith.cpp -> splines/gpuSplineUtils.cu
 
 //MaCh3 included
-#include "manager/gpuUtils.cu"
+#include "manager/gpuUtils.cuh"
 #include "splines/SplineCommon.h"
 
 // Hard code the number of splines