Use only supported devices
Restrict the CUDAService and its clients to use only the devices
supported by the CUDA architectures enabled in SCRAM's cuda.xml
toolfile.
fwyzard committed Mar 14, 2019
1 parent 177e16b commit f42f3f3
Showing 6 changed files with 47 additions and 33 deletions.
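
Note: the supportedCudaDevices() helper referenced below (declared in HeterogeneousCore/CUDAServices/interface/supportedCudaDevices.h) is introduced by this commit, but its implementation is not among the hunks shown here. Judging from how it is used (it fills computeCapabilities_, a std::map<int, std::pair<int, int>> keyed by device id, and an empty result disables the service), a minimal sketch could look like the following. The probe-kernel approach and all names in it are illustrative assumptions, not necessarily how the commit actually detects supported devices.

    // Hypothetical sketch of supportedCudaDevices(), for illustration only.
    // Returns a map from CUDA device id to compute capability (major, minor),
    // keeping only the devices this binary can actually run kernels on, i.e.
    // those matching one of the CUDA architectures enabled in SCRAM's cuda.xml.
    #include <map>
    #include <utility>
    #include <cuda_runtime.h>

    __global__ void probeKernel() {}  // empty kernel, used only to test whether a launch succeeds

    std::map<int, std::pair<int, int>> supportedCudaDevices() {
      std::map<int, std::pair<int, int>> devices;
      int count = 0;
      if (cudaGetDeviceCount(&count) != cudaSuccess)
        return devices;  // empty map: the CUDAService will disable itself
      for (int i = 0; i < count; ++i) {
        cudaDeviceProp properties;
        if (cudaGetDeviceProperties(&properties, i) != cudaSuccess)
          continue;
        // If the binary contains no device code for this architecture, the launch
        // fails (e.g. with cudaErrorNoKernelImageForDevice) and the device is skipped.
        cudaSetDevice(i);
        probeKernel<<<1, 1>>>();
        if (cudaGetLastError() == cudaSuccess && cudaDeviceSynchronize() == cudaSuccess)
          devices[i] = std::make_pair(properties.major, properties.minor);
      }
      return devices;
    }

Whatever the actual detection logic, the contract relied on by the diff below is only the returned device-id-to-capability map.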
HeterogeneousCore/CUDACore/src/GPUCuda.cc (2 changes: 1 addition & 1 deletion)
@@ -46,7 +46,7 @@ namespace heterogeneous {
// * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
// * would probably still need some buffer space/device to hold e.g. conditions data
// - for conditions, how to handle multiple lumis per job?
deviceId_ = id % cudaService->numberOfDevices();
deviceId_ = cudaService->devices()[id % cudaService->numberOfDevices()];

cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);

HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc (2 changes: 1 addition & 1 deletion)
@@ -13,6 +13,6 @@ namespace cudacore {
// (and even then there is no load balancing).
//
// TODO: improve the "assignment" logic
return id % cudaService->numberOfDevices();
return cudaService->devices()[id % cudaService->numberOfDevices()];
}
}
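
A small aside on why clients now index into devices() instead of using the remainder directly: once unsupported devices are filtered out, the supported device ids are not necessarily contiguous. The toy example below (a host where only devices 0 and 2 are usable) is made up to illustrate the mapping; it is not part of the commit.

    // Illustration only: round-robin assignment over a non-contiguous set of device ids.
    #include <iostream>
    #include <vector>

    int main() {
      const std::vector<int> devices{0, 2};                       // what CUDAService::devices() might return
      const int numberOfDevices = static_cast<int>(devices.size());  // CUDAService::numberOfDevices()
      for (int id = 0; id < 4; ++id) {
        // Taking the stream id modulo the device count directly could hand out id 1,
        // which this build cannot use; indexing into devices() yields 0, 2, 0, 2 instead.
        std::cout << "stream " << id << " -> device " << devices[id % numberOfDevices] << '\n';
      }
      return 0;
    }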
HeterogeneousCore/CUDAServices/interface/CUDAService.h (5 changes: 3 additions & 2 deletions)
@@ -1,8 +1,8 @@
#ifndef HeterogeneousCore_CUDAServices_CUDAService_h
#define HeterogeneousCore_CUDAServices_CUDAService_h

#include <map>
#include <utility>
#include <vector>

#include <cuda/api_wrappers.h>

@@ -55,6 +55,7 @@ class CUDAService {
bool enabled(unsigned int streamId) const { return enabled_ && (numberOfStreamsTotal_ == 0 || streamId < numberOfStreamsTotal_); } // to make testing easier

int numberOfDevices() const { return numberOfDevices_; }
std::vector<int> devices() const;

// major, minor
std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); }
@@ -157,7 +158,7 @@ class CUDAService {

int numberOfDevices_ = 0;
unsigned int numberOfStreamsTotal_ = 0;
std::vector<std::pair<int, int>> computeCapabilities_;
std::map<int, std::pair<int, int>> computeCapabilities_;
bool enabled_ = false;
};

HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc (14 changes: 7 additions & 7 deletions)
@@ -30,15 +30,15 @@ class CUDAMonitoringService {
void postEvent(edm::StreamContext const& sc);

private:
int numberOfDevices_ = 0;
std::vector<int> devices_;
};

CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
// make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
edm::Service<CUDAService> cudaService;
if(!cudaService->enabled())
return;
numberOfDevices_ = cudaService->numberOfDevices();
devices_ = cudaService->devices();

if(config.getUntrackedParameter<bool>("memoryConstruction")) {
registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
@@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
// activity handlers
namespace {
template <typename T>
void dumpUsedMemory(T& log, int num) {
void dumpUsedMemory(T& log, std::vector<int> const& devices) {
int old = 0;
cudaCheck(cudaGetDevice(&old));
for(int i = 0; i < num; ++i) {
for(int i: devices) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -82,19 +82,19 @@ namespace {
void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

DEFINE_FWK_SERVICE(CUDAMonitoringService);
HeterogeneousCore/CUDAServices/src/CUDAService.cc (46 changes: 30 additions & 16 deletions)
@@ -11,6 +11,7 @@
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAServices/interface/supportedCudaDevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"

#include "CachingDeviceAllocator.h"
@@ -94,10 +95,10 @@ namespace {
}
}

void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
void devicePreallocate(CUDAService& cs, const std::vector<unsigned int>& bufferSizes) {
int device;
cudaCheck(cudaGetDevice(&device));
for(int i=0; i<numberOfDevices; ++i) {
for (int i : cs.devices()) {
cudaCheck(cudaSetDevice(i));
preallocate<cudautils::device::unique_ptr>([&](size_t size, cuda::stream_t<>& stream) {
return cs.make_device_unique<char[]>(size, stream);
@@ -121,14 +122,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
return;
}

auto status = cudaGetDeviceCount(&numberOfDevices_);
if (cudaSuccess != status) {
computeCapabilities_ = supportedCudaDevices();
numberOfDevices_ = computeCapabilities_.size();
if (numberOfDevices_ == 0) {
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
return;
}
edm::LogInfo log("CUDAService");
computeCapabilities_.reserve(numberOfDevices_);
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";

auto numberOfStreamsPerDevice = config.getUntrackedParameter<unsigned int>("numberOfStreamsPerDevice");
if (numberOfStreamsPerDevice > 0) {
@@ -143,7 +144,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

for (int i = 0; i < numberOfDevices_; ++i) {
// iterate over the supported devices
for (auto const& keyval: computeCapabilities_) {
int i = keyval.first;
// read information about the compute device.
// see the documentation of cudaGetDeviceProperties() for more information.
cudaDeviceProp properties;
@@ -152,9 +155,8 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&

// compute capabilities
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
computeCapabilities_.emplace_back(properties.major, properties.minor);
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";

// compute mode
@@ -297,7 +299,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t minCachedBytes = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
for (int i = 0; i < numberOfDevices_; ++i) {
// iterate over the supported devices
for (auto const& keyval: computeCapabilities_) {
int i = keyval.first;
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -346,7 +350,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
enabled_ = true;

// Preallocate buffers if asked to
devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
devicePreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
hostPreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

@@ -359,7 +363,9 @@ CUDAService::~CUDAService() {
cudaEventCache_.reset();
cudaStreamCache_.reset();

for (int i = 0; i < numberOfDevices_; ++i) {
// iterate over the supported devices
for (auto const& keyval: computeCapabilities_) {
int i = keyval.first;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaDeviceSynchronize());
// Explicitly destroys and cleans up all resources associated with the current device in the
@@ -398,14 +404,25 @@ void CUDAService::fillDescriptions(edm::ConfigurationDescriptions & descriptions
descriptions.add("CUDAService", desc);
}

std::vector<int> CUDAService::devices() const {
std::vector<int> devices;
devices.reserve(numberOfDevices_);
for (auto const& keyval: computeCapabilities_) {
devices.push_back(keyval.first);
}
return devices;
}

int CUDAService::deviceWithMostFreeMemory() const {
// save the current device
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));

size_t maxFreeMemory = 0;
int device = -1;
for(int i = 0; i < numberOfDevices_; ++i) {
// iterate over the supported devices
for (auto const& keyval: computeCapabilities_) {
int i = keyval.first;
/*
// TODO: understand why the api-wrappers version gives same value for all devices
auto device = cuda::device::get(i);
@@ -439,9 +456,6 @@ struct CUDAService::Allocator {
template <typename ...Args>
Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}

void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
void hostPreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);

size_t maxAllocation;
notcub::CachingDeviceAllocator deviceAllocator;
notcub::CachingHostAllocator hostAllocator;
HeterogeneousCore/CUDAServices/test/testCUDAService.cpp (11 changes: 5 additions & 6 deletions)
@@ -14,6 +14,7 @@
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAServices/interface/supportedCudaDevices.h"

namespace {
CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
@@ -29,13 +30,10 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {

// Test setup: check if a simple CUDA runtime API call fails:
// if so, skip the test with the CUDAService enabled
int deviceCount = 0;
auto ret = cudaGetDeviceCount( &deviceCount );
int deviceCount = supportedCudaDevices().size();

if( ret != cudaSuccess ) {
WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
<< ret << ") " << cudaGetErrorString( ret )
<< ". Running only tests not requiring devices.");
if (deviceCount == 0) {
WARN("No supported CUDA devices available. Running only tests not requiring devices.");
}

SECTION("CUDAService enabled") {
@@ -59,6 +57,7 @@
}

auto cs = makeCUDAService(ps, ar);
cudaError_t ret;

SECTION("CUDA Queries") {
int driverVersion = 0, runtimeVersion = 0;