diff --git a/HeterogeneousCore/CUDACore/src/GPUCuda.cc b/HeterogeneousCore/CUDACore/src/GPUCuda.cc
index f40a007dd4bc6..d712dcba75a8b 100644
--- a/HeterogeneousCore/CUDACore/src/GPUCuda.cc
+++ b/HeterogeneousCore/CUDACore/src/GPUCuda.cc
@@ -34,14 +34,20 @@ namespace heterogeneous {
return;
}
- // TODO: possible ideas to improve the "assignment" logic include
+ // For starters we "statically" assign the device based on
+ // edm::Stream number. This is suboptimal if the number of
+ // edm::Streams is not a multiple of the number of CUDA devices
+ // (and even then there is no load balancing).
+ //
+ // TODO: improve. Possible ideas include
// - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use
// * the first module of a chain dictates the device for the rest of the chain
// - our own CUDA memory allocator
// * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
// * would probably still need some buffer space/device to hold e.g. conditions data
// - for conditions, how to handle multiple lumis per job?
- deviceId_ = cudacore::chooseCUDADevice(id);
+ deviceId_ = id % cudaService->numberOfDevices();
+
cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);
// Create the CUDA stream for this module-edm::Stream pair
diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
index ce487507500cc..a582ed2f72866 100644
--- a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
+++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
@@ -13,6 +13,6 @@ namespace cudacore {
// (and even then there is no load balancing).
//
// TODO: improve the "assignment" logic
- return cudaService->devices()[id % cudaService->numberOfDevices()];
+ return id % cudaService->numberOfDevices();
}
}
diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
index 58ce8cc807515..041ed25ba134a 100644
--- a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
+++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml
@@ -4,5 +4,4 @@
-
diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
index db6e7dd141c19..b24f05adb2213 100644
--- a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
+++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
@@ -1,7 +1,31 @@
+#include
+#include
#include
+#include
-#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
+#include
int main() {
- return supportedCUDADevices().empty() ? EXIT_FAILURE : EXIT_SUCCESS;
+ int devices = 0;
+ auto status = cudaGetDeviceCount(& devices);
+ if (status != cudaSuccess) {
+ return EXIT_FAILURE;
+ }
+
+ int minimumMajor = 6; // min minor is implicitly 0
+
+ // This approach (requiring all devices are supported) is rather
+ // conservative. In principle we could consider just dropping the
+ // unsupported devices. Currently that would be easiest to achieve
+ // in CUDAService though.
+ for (int i = 0; i < devices; ++i) {
+ cudaDeviceProp properties;
+ cudaGetDeviceProperties(&properties, i);
+
+ if(properties.major < minimumMajor) {
+ return EXIT_FAILURE;
+ }
+ }
+
+ return EXIT_SUCCESS;
}
diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h
index 7125b2c0dcf6e..e54ec1be8ad20 100644
--- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h
+++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h
@@ -52,9 +52,6 @@ class CUDAService {
int numberOfDevices() const { return numberOfDevices_; }
- // devices supported by the CUDA configuration and compilation flags
- std::vector const& devices() const { return supportedDevices_; }
-
// major, minor
std::pair computeCapability(int device) { return computeCapabilities_.at(device); }
@@ -155,7 +152,6 @@ class CUDAService {
std::unique_ptr cudaEventCache_;
int numberOfDevices_ = 0;
- std::vector supportedDevices_;
std::vector> computeCapabilities_;
bool enabled_ = false;
};
diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
index 5c1d042a6420b..7b7711c63c502 100644
--- a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
+++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
@@ -30,7 +30,7 @@ class CUDAMonitoringService {
void postEvent(edm::StreamContext const& sc);
private:
- std::vector devices_;
+ int numberOfDevices_ = 0;
};
CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
@@ -38,7 +38,7 @@ CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, ed
edm::Service cudaService;
if(!cudaService->enabled())
return;
- devices_ = cudaService->devices();
+ numberOfDevices_ = cudaService->numberOfDevices();
if(config.getUntrackedParameter("memoryConstruction")) {
registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
@@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
// activity handlers
namespace {
template
- void dumpUsedMemory(T& log, std::vector const& devices) {
+ void dumpUsedMemory(T& log, int num) {
int old = 0;
cudaCheck(cudaGetDevice(&old));
- for(int i: devices) {
+ for(int i = 0; i < num; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -82,19 +82,19 @@ namespace {
void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
- dumpUsedMemory(log, devices_);
+ dumpUsedMemory(log, numberOfDevices_);
}
void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
- dumpUsedMemory(log, devices_);
+ dumpUsedMemory(log, numberOfDevices_);
}
void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
- dumpUsedMemory(log, devices_);
+ dumpUsedMemory(log, numberOfDevices_);
}
DEFINE_FWK_SERVICE(CUDAMonitoringService);
diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
index ee63f1c7219de..9db5d89de1f83 100644
--- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -12,7 +12,6 @@
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
#include "CachingDeviceAllocator.h"
#include "CachingHostAllocator.h"
@@ -95,10 +94,10 @@ namespace {
}
}
- void devicePreallocate(CUDAService& cs, const std::vector& bufferSizes) {
+ void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector& bufferSizes) {
int device;
cudaCheck(cudaGetDevice(&device));
- for (int i : cs.devices()) {
+ for(int i=0; i([&](size_t size, cuda::stream_t<>& stream) {
return cs.make_device_unique(size, stream);
@@ -122,14 +121,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
return;
}
- supportedDevices_ = supportedCUDADevices();
- numberOfDevices_ = supportedDevices_.size();
- if (numberOfDevices_ == 0) {
+ auto status = cudaGetDeviceCount(&numberOfDevices_);
+ if (cudaSuccess != status) {
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
return;
}
edm::LogInfo log("CUDAService");
- log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";
+ computeCapabilities_.reserve(numberOfDevices_);
+ log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
auto const& limits = config.getUntrackedParameter("limits");
auto printfFifoSize = limits.getUntrackedParameter("cudaLimitPrintfFifoSize");
@@ -138,9 +137,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
auto devRuntimeSyncDepth = limits.getUntrackedParameter("cudaLimitDevRuntimeSyncDepth");
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter("cudaLimitDevRuntimePendingLaunchCount");
- int lastDevice = supportedDevices_.back();
- computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0));
- for (int i: supportedDevices_) {
+ for (int i = 0; i < numberOfDevices_; ++i) {
// read information about the compute device.
// see the documentation of cudaGetDeviceProperties() for more information.
cudaDeviceProp properties;
@@ -148,10 +145,10 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
log << "CUDA device " << i << ": " << properties.name << '\n';
// compute capabilities
- computeCapabilities_[i] = std::make_pair(properties.major, properties.minor);
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
+ computeCapabilities_.emplace_back(properties.major, properties.minor);
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
- log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
+ log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";
// compute mode
@@ -294,7 +291,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t minCachedBytes = std::numeric_limits::max();
int currentDevice;
cudaCheck(cudaGetDevice(¤tDevice));
- for (int i: supportedDevices_) {
+ for (int i = 0; i < numberOfDevices_; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -343,7 +340,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
enabled_ = true;
// Preallocate buffers if asked to
- devicePreallocate(*this, allocator.getUntrackedParameter >("devicePreallocate"));
+ devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter >("devicePreallocate"));
hostPreallocate(*this, allocator.getUntrackedParameter >("hostPreallocate"));
}
@@ -356,7 +353,7 @@ CUDAService::~CUDAService() {
cudaEventCache_.reset();
cudaStreamCache_.reset();
- for (int i: supportedDevices_) {
+ for (int i = 0; i < numberOfDevices_; ++i) {
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaDeviceSynchronize());
// Explicitly destroys and cleans up all resources associated with the current device in the
@@ -401,7 +398,7 @@ int CUDAService::deviceWithMostFreeMemory() const {
size_t maxFreeMemory = 0;
int device = -1;
- for (int i: supportedDevices_) {
+ for(int i = 0; i < numberOfDevices_; ++i) {
/*
// TODO: understand why the api-wrappers version gives same value for all devices
auto device = cuda::device::get(i);
@@ -435,6 +432,9 @@ struct CUDAService::Allocator {
template
Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward(args)...) {}
+ void devicePreallocate(int numberOfDevices, const std::vector& bytes);
+ void hostPreallocate(int numberOfDevices, const std::vector& bytes);
+
size_t maxAllocation;
notcub::CachingDeviceAllocator deviceAllocator;
notcub::CachingHostAllocator hostAllocator;
diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
index 5e1bc65645841..95768bdbd4b58 100644
--- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
+++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
@@ -14,7 +14,6 @@
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
namespace {
CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
@@ -30,10 +29,13 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
// Test setup: check if a simple CUDA runtime API call fails:
// if so, skip the test with the CUDAService enabled
- int deviceCount = supportedCUDADevices().size();
+ int deviceCount = 0;
+ auto ret = cudaGetDeviceCount( &deviceCount );
- if (deviceCount == 0) {
- WARN("No supported CUDA devices available. Running only tests not requiring devices.");
+ if( ret != cudaSuccess ) {
+ WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
+ << ret << ") " << cudaGetErrorString( ret )
+ << ". Running only tests not requiring devices.");
}
SECTION("CUDAService enabled") {
@@ -56,7 +58,6 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
}
auto cs = makeCUDAService(ps, ar);
- cudaError_t ret;
SECTION("CUDA Queries") {
int driverVersion = 0, runtimeVersion = 0;
diff --git a/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h b/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h
deleted file mode 100644
index 53d984dd2beaa..0000000000000
--- a/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h
+++ /dev/null
@@ -1,8 +0,0 @@
-#ifndef HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
-#define HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
-
-#include
-
-std::vector supportedCUDADevices();
-
-#endif // HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
diff --git a/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc b/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
index c20f43c5ec794..2d166e5c62840 100644
--- a/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
+++ b/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
@@ -4,7 +4,6 @@
#include
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
void exitSansCUDADevices() {
int devices = 0;
@@ -17,9 +16,4 @@ void exitSansCUDADevices() {
std::cerr << "No CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
- int supported = supportedCUDADevices().size();
- if (supported == 0) {
- std::cerr << "No supported CUDA devices available, the test will be skipped." << "\n";
- exit(EXIT_SUCCESS);
- }
}
diff --git a/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu b/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu
deleted file mode 100644
index 9d629d2fc7554..0000000000000
--- a/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu
+++ /dev/null
@@ -1,42 +0,0 @@
-#include
-
-#include
-
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
-
-__global__
-void isSupported(bool * result) {
- * result = true;
-}
-
-std::vector supportedCUDADevices() {
- int devices = 0;
- auto status = cudaGetDeviceCount(&devices);
- if (status != cudaSuccess or devices == 0) {
- return {};
- }
-
- std::vector supportedDevices;
- supportedDevices.reserve(devices);
-
- for (int i = 0; i < devices; ++i) {
- cudaCheck(cudaSetDevice(i));
- bool supported = false;
- bool * supported_d;
- cudaCheck(cudaMalloc(&supported_d, sizeof(bool)));
- cudaCheck(cudaMemset(supported_d, 0x00, sizeof(bool)));
- isSupported<<<1,1>>>(supported_d);
- // swallow any eventual error from launching the kernel on an unsupported device
- cudaGetLastError();
- cudaCheck(cudaDeviceSynchronize());
- cudaCheck(cudaMemcpy(& supported, supported_d, sizeof(bool), cudaMemcpyDeviceToHost));
- cudaCheck(cudaFree(supported_d));
- if (supported) {
- supportedDevices.push_back(i);
- }
- cudaCheck(cudaDeviceReset());
- }
-
- return supportedDevices;
-}