Use only supported devices
Restrict the CUDAService and its clients to use only the devices
supported by the CUDA architectures enabled in SCRAM's cuda.xml
toolfile.
fwyzard committed Mar 14, 2019
1 parent 177e16b commit f42f3f3
Showing 6 changed files with 47 additions and 33 deletions.
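
Note: the supportedCudaDevices() helper referenced below (declared in HeterogeneousCore/CUDAServices/interface/supportedCudaDevices.h) is introduced by this commit, but its implementation is not among the hunks shown here. Judging from how it is used (it fills computeCapabilities_, a std::map<int, std::pair<int, int>> keyed by device id, and an empty result disables the service), a minimal sketch could look like the following. The probe-kernel approach and all names in it are illustrative assumptions, not necessarily how the commit actually detects supported devices.

    // Hypothetical sketch of supportedCudaDevices(), for illustration only.
    // Returns a map from CUDA device id to compute capability (major, minor),
    // keeping only the devices this binary can actually run kernels on, i.e.
    // those matching one of the CUDA architectures enabled in SCRAM's cuda.xml.
    #include <map>
    #include <utility>
    #include <cuda_runtime.h>

    __global__ void probeKernel() {}  // empty kernel, used only to test whether a launch succeeds

    std::map<int, std::pair<int, int>> supportedCudaDevices() {
      std::map<int, std::pair<int, int>> devices;
      int count = 0;
      if (cudaGetDeviceCount(&count) != cudaSuccess)
        return devices;  // empty map: the CUDAService will disable itself
      for (int i = 0; i < count; ++i) {
        cudaDeviceProp properties;
        if (cudaGetDeviceProperties(&properties, i) != cudaSuccess)
          continue;
        // If the binary contains no device code for this architecture, the launch
        // fails (e.g. with cudaErrorNoKernelImageForDevice) and the device is skipped.
        cudaSetDevice(i);
        probeKernel<<<1, 1>>>();
        if (cudaGetLastError() == cudaSuccess && cudaDeviceSynchronize() == cudaSuccess)
          devices[i] = std::make_pair(properties.major, properties.minor);
      }
      return devices;
    }

Whatever the actual detection logic, the contract relied on by the diff below is only the returned device-id-to-capability map.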
HeterogeneousCore/CUDACore/src/GPUCuda.cc (2 changes: 1 addition & 1 deletion)
@@ -46,7 +46,7 @@ namespace heterogeneous {
// * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
// * would probably still need some buffer space/device to hold e.g. conditions data
// - for conditions, how to handle multiple lumis per job?
deviceId_ = id % cudaService->numberOfDevices();
deviceId_ = cudaService->devices()[id % cudaService->numberOfDevices()];

cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);

HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc (2 changes: 1 addition & 1 deletion)
@@ -13,6 +13,6 @@ namespace cudacore {
// (and even then there is no load balancing).
//
// TODO: improve the "assignment" logic
return id % cudaService->numberOfDevices();
return cudaService->devices()[id % cudaService->numberOfDevices()];
}
}
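
A small aside on why clients now index into devices() instead of using the remainder directly: once unsupported devices are filtered out, the supported device ids are not necessarily contiguous. The toy example below (a host where only devices 0 and 2 are usable) is made up to illustrate the mapping; it is not part of the commit.

    // Illustration only: round-robin assignment over a non-contiguous set of device ids.
    #include <iostream>
    #include <vector>

    int main() {
      const std::vector<int> devices{0, 2};                       // what CUDAService::devices() might return
      const int numberOfDevices = static_cast<int>(devices.size());  // CUDAService::numberOfDevices()
      for (int id = 0; id < 4; ++id) {
        // Taking the stream id modulo the device count directly could hand out id 1,
        // which this build cannot use; indexing into devices() yields 0, 2, 0, 2 instead.
        std::cout << "stream " << id << " -> device " << devices[id % numberOfDevices] << '\n';
      }
      return 0;
    }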
HeterogeneousCore/CUDAServices/interface/CUDAService.h (5 changes: 3 additions & 2 deletions)
@@ -1,8 +1,8 @@
#ifndef HeterogeneousCore_CUDAServices_CUDAService_h
#define HeterogeneousCore_CUDAServices_CUDAService_h

#include <map>
#include <utility>
#include <vector>

#include <cuda/api_wrappers.h>

@@ -55,6 +55,7 @@ class CUDAService {
bool enabled(unsigned int streamId) const { return enabled_ && (numberOfStreamsTotal_ == 0 || streamId < numberOfStreamsTotal_); } // to make testing easier

int numberOfDevices() const { return numberOfDevices_; }
std::vector<int> devices() const;

// major, minor
std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); }
@@ -157,7 +158,7 @@ class CUDAService {

int numberOfDevices_ = 0;
unsigned int numberOfStreamsTotal_ = 0;
std::vector<std::pair<int, int>> computeCapabilities_;
std::map<int, std::pair<int, int>> computeCapabilities_;
bool enabled_ = false;
};

HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc (14 changes: 7 additions & 7 deletions)
@@ -30,15 +30,15 @@ class CUDAMonitoringService {
void postEvent(edm::StreamContext const& sc);

private:
int numberOfDevices_ = 0;
std::vector<int> devices_;
};

CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
// make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
edm::Service<CUDAService> cudaService;
if(!cudaService->enabled())
return;
numberOfDevices_ = cudaService->numberOfDevices();
devices_ = cudaService->devices();

if(config.getUntrackedParameter<bool>("memoryConstruction")) {
registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
@@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
// activity handlers
namespace {
template <typename T>
void dumpUsedMemory(T& log, int num) {
void dumpUsedMemory(T& log, std::vector<int> const& devices) {
int old = 0;
cudaCheck(cudaGetDevice(&old));
for(int i = 0; i < num; ++i) {
for(int i: devices) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -82,19 +82,19 @@ namespace {
void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

DEFINE_FWK_SERVICE(CUDAMonitoringService);
HeterogeneousCore/CUDAServices/src/CUDAService.cc (46 changes: 30 additions & 16 deletions)
@@ -11,6 +11,7 @@
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAServices/interface/supportedCudaDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include "CachingDeviceAllocator.h"
@@ -94,10 +95,10 @@ namespace {
}
}

void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
void devicePreallocate(CUDAService& cs, const std::vector<unsigned int>& bufferSizes) {
int device;
cudaCheck(cudaGetDevice(&device));
for(int i=0; i<numberOfDevices; ++i) {
for (int i : cs.devices()) {
cudaCheck(cudaSetDevice(i));
preallocate<cudautils::device::unique_ptr>([&](size_t size, cuda::stream_t<>& stream) {
return cs.make_device_unique<char[]>(size, stream);
@@ -121,14 +122,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
return;
}

auto status = cudaGetDeviceCount(&numberOfDevices_);
if (cudaSuccess != status) {
computeCapabilities_ = supportedCudaDevices();
numberOfDevices_ = computeCapabilities_.size();
if (numberOfDevices_ == 0) {
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
return;
}
edm::LogInfo log("CUDAService");
computeCapabilities_.reserve(numberOfDevices_);
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";

auto numberOfStreamsPerDevice = config.getUntrackedParameter<unsigned int>("numberOfStreamsPerDevice");
if (numberOfStreamsPerDevice > 0) {
@@ -143,7 +144,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

for (int i = 0; i < numberOfDevices_; ++i) {
// iterate over the supported devices
for (auto const& keyval: computeCapabilities_) {
int i = keyval.first;
// read information about the compute device.
// see the documentation of cudaGetDeviceProperties() for more information.
cudaDeviceProp properties;
@@ -152,9 +155,8 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&

// compute capabilities
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
computeCapabilities_.emplace_back(properties.major, properties.minor);
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";

// compute mode
@@ -297,7 +299,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t minCachedBytes = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
for (int i = 0; i < numberOfDevices_; ++i) {
// iterate over the supported devices
for (auto const& keyval: computeCapabilities_) {
int i = keyval.first;
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -346,7 +350,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
enabled_ = true;

// Preallocate buffers if asked to
devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
devicePreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
hostPreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

@@ -359,7 +363,9 @@ CUDAService::~CUDAService() {
cudaEventCache_.reset();
cudaStreamCache_.reset();

for (int i = 0; i < numberOfDevices_; ++i) {
// iterate over the supported devices
for (auto const& keyval: computeCapabilities_) {
int i = keyval.first;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaDeviceSynchronize());
// Explicitly destroys and cleans up all resources associated with the current device in the
@@ -398,14 +404,25 @@ void CUDAService::fillDescriptions(edm::ConfigurationDescriptions & descriptions
descriptions.add("CUDAService", desc);
}

std::vector<int> CUDAService::devices() const {
std::vector<int> devices;
devices.reserve(numberOfDevices_);
for (auto const& keyval: computeCapabilities_) {
devices.push_back(keyval.first);
}
return devices;
}

int CUDAService::deviceWithMostFreeMemory() const {
// save the current device
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));

size_t maxFreeMemory = 0;
int device = -1;
for(int i = 0; i < numberOfDevices_; ++i) {
// iterate over the supported devices
for (auto const& keyval: computeCapabilities_) {
int i = keyval.first;
/*
// TODO: understand why the api-wrappers version gives same value for all devices
auto device = cuda::device::get(i);
@@ -439,9 +456,6 @@ struct CUDAService::Allocator {
template <typename ...Args>
Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}

void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
void hostPreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);

size_t maxAllocation;
notcub::CachingDeviceAllocator deviceAllocator;
notcub::CachingHostAllocator hostAllocator;
HeterogeneousCore/CUDAServices/test/testCUDAService.cpp (11 changes: 5 additions & 6 deletions)
@@ -14,6 +14,7 @@
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAServices/interface/supportedCudaDevices.h"

namespace {
CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
@@ -29,13 +30,10 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {

// Test setup: check if a simple CUDA runtime API call fails:
// if so, skip the test with the CUDAService enabled
int deviceCount = 0;
auto ret = cudaGetDeviceCount( &deviceCount );
int deviceCount = supportedCudaDevices().size();

if( ret != cudaSuccess ) {
WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
<< ret << ") " << cudaGetErrorString( ret )
<< ". Running only tests not requiring devices.");
if (deviceCount == 0) {
WARN("No supported CUDA devices available. Running only tests not requiring devices.");
}

SECTION("CUDAService enabled") {
@@ -59,6 +57,7 @@
}

auto cs = makeCUDAService(ps, ar);
cudaError_t ret;

SECTION("CUDA Queries") {
int driverVersion = 0, runtimeVersion = 0;