diff --git a/HeterogeneousCore/CUDACore/src/GPUCuda.cc b/HeterogeneousCore/CUDACore/src/GPUCuda.cc index f40a007dd4bc6..d712dcba75a8b 100644 --- a/HeterogeneousCore/CUDACore/src/GPUCuda.cc +++ b/HeterogeneousCore/CUDACore/src/GPUCuda.cc @@ -34,14 +34,20 @@ namespace heterogeneous { return; } - // TODO: possible ideas to improve the "assignment" logic include + // For starters we "statically" assign the device based on + // edm::Stream number. This is suboptimal if the number of + // edm::Streams is not a multiple of the number of CUDA devices + // (and even then there is no load balancing). + // + // TODO: improve. Possible ideas include // - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use // * the first module of a chain dictates the device for the rest of the chain // - our own CUDA memory allocator // * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load // * would probably still need some buffer space/device to hold e.g. conditions data // - for conditions, how to handle multiple lumis per job? - deviceId_ = cudacore::chooseCUDADevice(id); + deviceId_ = id % cudaService->numberOfDevices(); + cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_); // Create the CUDA stream for this module-edm::Stream pair diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc index ce487507500cc..a582ed2f72866 100644 --- a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc +++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc @@ -13,6 +13,6 @@ namespace cudacore { // (and even then there is no load balancing). 
// // TODO: improve the "assignment" logic - return cudaService->devices()[id % cudaService->numberOfDevices()]; + return id % cudaService->numberOfDevices(); } } diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml index 58ce8cc807515..041ed25ba134a 100644 --- a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml @@ -4,5 +4,4 @@ - diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp index db6e7dd141c19..b24f05adb2213 100644 --- a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp +++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp @@ -1,7 +1,31 @@ +#include +#include #include +#include -#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h" +#include int main() { - return supportedCUDADevices().empty() ? EXIT_FAILURE : EXIT_SUCCESS; + int devices = 0; + auto status = cudaGetDeviceCount(& devices); + if (status != cudaSuccess) { + return EXIT_FAILURE; + } + + int minimumMajor = 6; // min minor is implicitly 0 + + // This approach (requiring all devices are supported) is rather + // conservative. In principle we could consider just dropping the + // unsupported devices. Currently that would be easiest to achieve + // in CUDAService though. 
+ for (int i = 0; i < devices; ++i) { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, i); + + if(properties.major < minimumMajor) { + return EXIT_FAILURE; + } + } + + return EXIT_SUCCESS; } diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index 7125b2c0dcf6e..e54ec1be8ad20 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -52,9 +52,6 @@ class CUDAService { int numberOfDevices() const { return numberOfDevices_; } - // devices supported by the CUDA configuration and compilation flags - std::vector const& devices() const { return supportedDevices_; } - // major, minor std::pair computeCapability(int device) { return computeCapabilities_.at(device); } @@ -155,7 +152,6 @@ class CUDAService { std::unique_ptr cudaEventCache_; int numberOfDevices_ = 0; - std::vector supportedDevices_; std::vector> computeCapabilities_; bool enabled_ = false; }; diff --git a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc index 5c1d042a6420b..7b7711c63c502 100644 --- a/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc +++ b/HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc @@ -30,7 +30,7 @@ class CUDAMonitoringService { void postEvent(edm::StreamContext const& sc); private: - std::vector devices_; + int numberOfDevices_ = 0; }; CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) { @@ -38,7 +38,7 @@ CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, ed edm::Service cudaService; if(!cudaService->enabled()) return; - devices_ = cudaService->devices(); + numberOfDevices_ = cudaService->numberOfDevices(); if(config.getUntrackedParameter("memoryConstruction")) { registry.watchPostModuleConstruction(this, 
&CUDAMonitoringService::postModuleConstruction); @@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de // activity handlers namespace { template - void dumpUsedMemory(T& log, std::vector const& devices) { + void dumpUsedMemory(T& log, int num) { int old = 0; cudaCheck(cudaGetDevice(&old)); - for(int i: devices) { + for(int i = 0; i < num; ++i) { size_t freeMemory, totalMemory; cudaCheck(cudaSetDevice(i)); cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); @@ -82,19 +82,19 @@ namespace { void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) { auto log = edm::LogPrint("CUDAMonitoringService"); log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")"; - dumpUsedMemory(log, devices_); + dumpUsedMemory(log, numberOfDevices_); } void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) { auto log = edm::LogPrint("CUDAMonitoringService"); log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")"; - dumpUsedMemory(log, devices_); + dumpUsedMemory(log, numberOfDevices_); } void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) { auto log = edm::LogPrint("CUDAMonitoringService"); log << "CUDA device memory after event"; - dumpUsedMemory(log, devices_); + dumpUsedMemory(log, numberOfDevices_); } DEFINE_FWK_SERVICE(CUDAMonitoringService); diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index ee63f1c7219de..9db5d89de1f83 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -12,7 +12,6 @@ #include "FWCore/Utilities/interface/ReusableObjectHolder.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h" #include "CachingDeviceAllocator.h" #include "CachingHostAllocator.h" @@ -95,10 +94,10 @@ namespace { } } - void devicePreallocate(CUDAService& cs, const std::vector& bufferSizes) { + void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector& bufferSizes) { int device; cudaCheck(cudaGetDevice(&device)); - for (int i : cs.devices()) { + for(int i=0; i([&](size_t size, cuda::stream_t<>& stream) { return cs.make_device_unique(size, stream); @@ -122,14 +121,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& return; } - supportedDevices_ = supportedCUDADevices(); - numberOfDevices_ = supportedDevices_.size(); - if (numberOfDevices_ == 0) { + auto status = cudaGetDeviceCount(&numberOfDevices_); + if (cudaSuccess != status) { edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService."; return; } edm::LogInfo log("CUDAService"); - log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n"; + computeCapabilities_.reserve(numberOfDevices_); + log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n"; auto const& limits = config.getUntrackedParameter("limits"); auto printfFifoSize = limits.getUntrackedParameter("cudaLimitPrintfFifoSize"); @@ -138,9 +137,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& auto devRuntimeSyncDepth = limits.getUntrackedParameter("cudaLimitDevRuntimeSyncDepth"); auto devRuntimePendingLaunchCount = limits.getUntrackedParameter("cudaLimitDevRuntimePendingLaunchCount"); - int lastDevice = supportedDevices_.back(); - computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0)); - for (int i: supportedDevices_) { + for (int i = 0; i < numberOfDevices_; ++i) { // read information 
about the compute device. // see the documentation of cudaGetDeviceProperties() for more information. cudaDeviceProp properties; @@ -148,10 +145,10 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& log << "CUDA device " << i << ": " << properties.name << '\n'; // compute capabilities - computeCapabilities_[i] = std::make_pair(properties.major, properties.minor); log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n"; + computeCapabilities_.emplace_back(properties.major, properties.minor); log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n'; - log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n'; + log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n'; log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n"; // compute mode @@ -294,7 +291,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& size_t minCachedBytes = std::numeric_limits::max(); int currentDevice; cudaCheck(cudaGetDevice(¤tDevice)); - for (int i: supportedDevices_) { + for (int i = 0; i < numberOfDevices_; ++i) { size_t freeMemory, totalMemory; cudaCheck(cudaSetDevice(i)); cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory)); @@ -343,7 +340,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& enabled_ = true; // Preallocate buffers if asked to - devicePreallocate(*this, allocator.getUntrackedParameter >("devicePreallocate")); + devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter >("devicePreallocate")); hostPreallocate(*this, allocator.getUntrackedParameter >("hostPreallocate")); } @@ -356,7 +353,7 @@ CUDAService::~CUDAService() { 
cudaEventCache_.reset(); cudaStreamCache_.reset(); - for (int i: supportedDevices_) { + for (int i = 0; i < numberOfDevices_; ++i) { cudaCheck(cudaSetDevice(i)); cudaCheck(cudaDeviceSynchronize()); // Explicitly destroys and cleans up all resources associated with the current device in the @@ -401,7 +398,7 @@ int CUDAService::deviceWithMostFreeMemory() const { size_t maxFreeMemory = 0; int device = -1; - for (int i: supportedDevices_) { + for(int i = 0; i < numberOfDevices_; ++i) { /* // TODO: understand why the api-wrappers version gives same value for all devices auto device = cuda::device::get(i); @@ -435,6 +432,9 @@ struct CUDAService::Allocator { template Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward(args)...) {} + void devicePreallocate(int numberOfDevices, const std::vector& bytes); + void hostPreallocate(int numberOfDevices, const std::vector& bytes); + size_t maxAllocation; notcub::CachingDeviceAllocator deviceAllocator; notcub::CachingHostAllocator hostAllocator; diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp index 5e1bc65645841..95768bdbd4b58 100644 --- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp +++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp @@ -14,7 +14,6 @@ #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" #include "FWCore/Utilities/interface/Exception.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h" namespace { CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) { @@ -30,10 +29,13 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { // Test setup: check if a simple CUDA runtime API call fails: // if so, skip the test with the CUDAService enabled - int deviceCount = supportedCUDADevices().size(); + int deviceCount = 0; + auto ret = 
cudaGetDeviceCount( &deviceCount ); - if (deviceCount == 0) { - WARN("No supported CUDA devices available. Running only tests not requiring devices."); + if( ret != cudaSuccess ) { + WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" + << ret << ") " << cudaGetErrorString( ret ) + << ". Running only tests not requiring devices."); } SECTION("CUDAService enabled") { @@ -56,7 +58,6 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { } auto cs = makeCUDAService(ps, ar); - cudaError_t ret; SECTION("CUDA Queries") { int driverVersion = 0, runtimeVersion = 0; diff --git a/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h b/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h deleted file mode 100644 index 53d984dd2beaa..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h +++ /dev/null @@ -1,8 +0,0 @@ -#ifndef HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h -#define HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h - -#include - -std::vector supportedCUDADevices(); - -#endif // HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h diff --git a/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc b/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc index c20f43c5ec794..2d166e5c62840 100644 --- a/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc +++ b/HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc @@ -4,7 +4,6 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" -#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h" void exitSansCUDADevices() { int devices = 0; @@ -17,9 +16,4 @@ void exitSansCUDADevices() { std::cerr << "No CUDA devices available, the test will be skipped." << "\n"; exit(EXIT_SUCCESS); } - int supported = supportedCUDADevices().size(); - if (supported == 0) { - std::cerr << "No supported CUDA devices available, the test will be skipped." 
<< "\n"; - exit(EXIT_SUCCESS); - } } diff --git a/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu b/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu deleted file mode 100644 index 9d629d2fc7554..0000000000000 --- a/HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu +++ /dev/null @@ -1,42 +0,0 @@ -#include - -#include - -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h" - -__global__ -void isSupported(bool * result) { - * result = true; -} - -std::vector supportedCUDADevices() { - int devices = 0; - auto status = cudaGetDeviceCount(&devices); - if (status != cudaSuccess or devices == 0) { - return {}; - } - - std::vector supportedDevices; - supportedDevices.reserve(devices); - - for (int i = 0; i < devices; ++i) { - cudaCheck(cudaSetDevice(i)); - bool supported = false; - bool * supported_d; - cudaCheck(cudaMalloc(&supported_d, sizeof(bool))); - cudaCheck(cudaMemset(supported_d, 0x00, sizeof(bool))); - isSupported<<<1,1>>>(supported_d); - // swallow any eventual error from launching the kernel on an unsupported device - cudaGetLastError(); - cudaCheck(cudaDeviceSynchronize()); - cudaCheck(cudaMemcpy(& supported, supported_d, sizeof(bool), cudaMemcpyDeviceToHost)); - cudaCheck(cudaFree(supported_d)); - if (supported) { - supportedDevices.push_back(i); - } - cudaCheck(cudaDeviceReset()); - } - - return supportedDevices; -}