Skip to content

Commit

Permalink
Revert "Use only CUDA devices with a supported architecture (#286)" (#290)
Browse files Browse the repository at this point in the history

This reverts commit 1b21d1c.
  • Loading branch information
felicepantaleo authored Mar 15, 2019
1 parent 0170373 commit dae672a
Show file tree
Hide file tree
Showing 11 changed files with 64 additions and 94 deletions.
10 changes: 8 additions & 2 deletions HeterogeneousCore/CUDACore/src/GPUCuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -34,14 +34,20 @@ namespace heterogeneous {
return;
}

// TODO: possible ideas to improve the "assignment" logic include
// For starters we "statically" assign the device based on
// edm::Stream number. This is suboptimal if the number of
// edm::Streams is not a multiple of the number of CUDA devices
// (and even then there is no load balancing).
//
// TODO: improve. Possible ideas include
// - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use
// * the first module of a chain dictates the device for the rest of the chain
// - our own CUDA memory allocator
// * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
// * would probably still need some buffer space/device to hold e.g. conditions data
// - for conditions, how to handle multiple lumis per job?
deviceId_ = cudacore::chooseCUDADevice(id);
deviceId_ = id % cudaService->numberOfDevices();

cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);

// Create the CUDA stream for this module-edm::Stream pair
Expand Down
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ namespace cudacore {
// (and even then there is no load balancing).
//
// TODO: improve the "assignment" logic
return cudaService->devices()[id % cudaService->numberOfDevices()];
return id % cudaService->numberOfDevices();
}
}
1 change: 0 additions & 1 deletion HeterogeneousCore/CUDAServices/bin/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,5 +4,4 @@

<bin name="cudaIsEnabled" file="cudaIsEnabled.cpp">
<use name="cuda"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
</bin>
28 changes: 26 additions & 2 deletions HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
Original file line number Diff line number Diff line change
@@ -1,7 +1,31 @@
#include <algorithm>
#include <array>
#include <cstdlib>
#include <iostream>

#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
#include <cuda_runtime.h>

int main() {
return supportedCUDADevices().empty() ? EXIT_FAILURE : EXIT_SUCCESS;
int devices = 0;
auto status = cudaGetDeviceCount(& devices);
if (status != cudaSuccess) {
return EXIT_FAILURE;
}

int minimumMajor = 6; // min minor is implicitly 0

// This approach (requiring all devices are supported) is rather
// conservative. In principle we could consider just dropping the
// unsupported devices. Currently that would be easiest to achieve
// in CUDAService though.
for (int i = 0; i < devices; ++i) {
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, i);

if(properties.major < minimumMajor) {
return EXIT_FAILURE;
}
}

return EXIT_SUCCESS;
}
4 changes: 0 additions & 4 deletions HeterogeneousCore/CUDAServices/interface/CUDAService.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,6 @@ class CUDAService {

int numberOfDevices() const { return numberOfDevices_; }

// devices supported by the CUDA configuration and compilation flags
std::vector<int> const& devices() const { return supportedDevices_; }

// major, minor
std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); }

Expand Down Expand Up @@ -155,7 +152,6 @@ class CUDAService {
std::unique_ptr<CUDAEventCache> cudaEventCache_;

int numberOfDevices_ = 0;
std::vector<int> supportedDevices_;
std::vector<std::pair<int, int>> computeCapabilities_;
bool enabled_ = false;
};
Expand Down
14 changes: 7 additions & 7 deletions HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ class CUDAMonitoringService {
void postEvent(edm::StreamContext const& sc);

private:
std::vector<int> devices_;
int numberOfDevices_ = 0;
};

CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
// make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
edm::Service<CUDAService> cudaService;
if(!cudaService->enabled())
return;
devices_ = cudaService->devices();
numberOfDevices_ = cudaService->numberOfDevices();

if(config.getUntrackedParameter<bool>("memoryConstruction")) {
registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
Expand Down Expand Up @@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
// activity handlers
namespace {
template <typename T>
void dumpUsedMemory(T& log, std::vector<int> const& devices) {
void dumpUsedMemory(T& log, int num) {
int old = 0;
cudaCheck(cudaGetDevice(&old));
for(int i: devices) {
for(int i = 0; i < num; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
Expand All @@ -82,19 +82,19 @@ namespace {
void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
dumpUsedMemory(log, devices_);
dumpUsedMemory(log, numberOfDevices_);
}

void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
dumpUsedMemory(log, devices_);
dumpUsedMemory(log, numberOfDevices_);
}

void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
dumpUsedMemory(log, devices_);
dumpUsedMemory(log, numberOfDevices_);
}

DEFINE_FWK_SERVICE(CUDAMonitoringService);
32 changes: 16 additions & 16 deletions HeterogeneousCore/CUDAServices/src/CUDAService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

#include "CachingDeviceAllocator.h"
#include "CachingHostAllocator.h"
Expand Down Expand Up @@ -95,10 +94,10 @@ namespace {
}
}

void devicePreallocate(CUDAService& cs, const std::vector<unsigned int>& bufferSizes) {
void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
int device;
cudaCheck(cudaGetDevice(&device));
for (int i : cs.devices()) {
for(int i=0; i<numberOfDevices; ++i) {
cudaCheck(cudaSetDevice(i));
preallocate<cudautils::device::unique_ptr>([&](size_t size, cuda::stream_t<>& stream) {
return cs.make_device_unique<char[]>(size, stream);
Expand All @@ -122,14 +121,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
return;
}

supportedDevices_ = supportedCUDADevices();
numberOfDevices_ = supportedDevices_.size();
if (numberOfDevices_ == 0) {
auto status = cudaGetDeviceCount(&numberOfDevices_);
if (cudaSuccess != status) {
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
return;
}
edm::LogInfo log("CUDAService");
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";
computeCapabilities_.reserve(numberOfDevices_);
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";

auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
Expand All @@ -138,20 +137,18 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

int lastDevice = supportedDevices_.back();
computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0));
for (int i: supportedDevices_) {
for (int i = 0; i < numberOfDevices_; ++i) {
// read information about the compute device.
// see the documentation of cudaGetDeviceProperties() for more information.
cudaDeviceProp properties;
cudaCheck(cudaGetDeviceProperties(&properties, i));
log << "CUDA device " << i << ": " << properties.name << '\n';

// compute capabilities
computeCapabilities_[i] = std::make_pair(properties.major, properties.minor);
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
computeCapabilities_.emplace_back(properties.major, properties.minor);
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";

// compute mode
Expand Down Expand Up @@ -294,7 +291,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t minCachedBytes = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
for (int i: supportedDevices_) {
for (int i = 0; i < numberOfDevices_; ++i) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
Expand Down Expand Up @@ -343,7 +340,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
enabled_ = true;

// Preallocate buffers if asked to
devicePreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
hostPreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

Expand All @@ -356,7 +353,7 @@ CUDAService::~CUDAService() {
cudaEventCache_.reset();
cudaStreamCache_.reset();

for (int i: supportedDevices_) {
for (int i = 0; i < numberOfDevices_; ++i) {
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaDeviceSynchronize());
// Explicitly destroys and cleans up all resources associated with the current device in the
Expand Down Expand Up @@ -401,7 +398,7 @@ int CUDAService::deviceWithMostFreeMemory() const {

size_t maxFreeMemory = 0;
int device = -1;
for (int i: supportedDevices_) {
for(int i = 0; i < numberOfDevices_; ++i) {
/*
// TODO: understand why the api-wrappers version gives same value for all devices
auto device = cuda::device::get(i);
Expand Down Expand Up @@ -435,6 +432,9 @@ struct CUDAService::Allocator {
template <typename ...Args>
Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}

void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
void hostPreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);

size_t maxAllocation;
notcub::CachingDeviceAllocator deviceAllocator;
notcub::CachingHostAllocator hostAllocator;
Expand Down
11 changes: 6 additions & 5 deletions HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,6 @@
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

namespace {
CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
Expand All @@ -30,10 +29,13 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {

// Test setup: check if a simple CUDA runtime API call fails:
// if so, skip the test with the CUDAService enabled
int deviceCount = supportedCUDADevices().size();
int deviceCount = 0;
auto ret = cudaGetDeviceCount( &deviceCount );

if (deviceCount == 0) {
WARN("No supported CUDA devices available. Running only tests not requiring devices.");
if( ret != cudaSuccess ) {
WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
<< ret << ") " << cudaGetErrorString( ret )
<< ". Running only tests not requiring devices.");
}

SECTION("CUDAService enabled") {
Expand All @@ -56,7 +58,6 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
}

auto cs = makeCUDAService(ps, ar);
cudaError_t ret;

SECTION("CUDA Queries") {
int driverVersion = 0, runtimeVersion = 0;
Expand Down

This file was deleted.

6 changes: 0 additions & 6 deletions HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

void exitSansCUDADevices() {
int devices = 0;
Expand All @@ -17,9 +16,4 @@ void exitSansCUDADevices() {
std::cerr << "No CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
int supported = supportedCUDADevices().size();
if (supported == 0) {
std::cerr << "No supported CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
}
42 changes: 0 additions & 42 deletions HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu

This file was deleted.

0 comments on commit dae672a

Please sign in to comment.