Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use only CUDA devices supported by the SCRAM toolfile #286

Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDACore/src/GPUCuda.cc
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ namespace heterogeneous {
// * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
// * would probably still need some buffer space/device to hold e.g. conditions data
// - for conditions, how to handle multiple lumis per job?
deviceId_ = id % cudaService->numberOfDevices();
deviceId_ = cudaService->devices()[id % cudaService->numberOfDevices()];
fwyzard marked this conversation as resolved.
Show resolved Hide resolved

cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);

Expand Down
2 changes: 1 addition & 1 deletion HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ namespace cudacore {
// (and even then there is no load balancing).
//
// TODO: improve the "assignment" logic
return id % cudaService->numberOfDevices();
return cudaService->devices()[id % cudaService->numberOfDevices()];
fwyzard marked this conversation as resolved.
Show resolved Hide resolved
}
}
1 change: 1 addition & 0 deletions HeterogeneousCore/CUDAServices/bin/BuildFile.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,4 +4,5 @@

<bin name="cudaIsEnabled" file="cudaIsEnabled.cpp">
<use name="cuda"/>
<use name="HeterogeneousCore/CUDAUtilities"/>
</bin>
30 changes: 2 additions & 28 deletions HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp
Original file line number Diff line number Diff line change
@@ -1,31 +1,5 @@
#include <algorithm>
#include <array>
#include <cstdlib>
#include <iostream>

#include <cuda_runtime.h>
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

int main() {
int devices = 0;
auto status = cudaGetDeviceCount(& devices);
if (status != cudaSuccess) {
return EXIT_FAILURE;
}

int minimumMajor = 6; // min minor is implicitly 0

// This approach (requiring all devices are supported) is rather
// conservative. In principle we could consider just dropping the
// unsupported devices. Currently that would be easiest to achieve
// in CUDAService though.
for (int i = 0; i < devices; ++i) {
cudaDeviceProp properties;
cudaGetDeviceProperties(&properties, i);

if(properties.major < minimumMajor) {
return EXIT_FAILURE;
}
}

return EXIT_SUCCESS;
return supportedCUDADevices().empty() ? EXIT_FAILURE : EXIT_SUCCESS;
}
4 changes: 4 additions & 0 deletions HeterogeneousCore/CUDAServices/interface/CUDAService.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ class CUDAService {

int numberOfDevices() const { return numberOfDevices_; }

// devices supported by the CUDA configuration and compilation flags
std::vector<int> const& devices() const { return supportedDevices_; }

// major, minor
std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); }

Expand Down Expand Up @@ -152,6 +155,7 @@ class CUDAService {
std::unique_ptr<CUDAEventCache> cudaEventCache_;

int numberOfDevices_ = 0;
std::vector<int> supportedDevices_;
std::vector<std::pair<int, int>> computeCapabilities_;
bool enabled_ = false;
};
Expand Down
14 changes: 7 additions & 7 deletions HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,15 @@ class CUDAMonitoringService {
void postEvent(edm::StreamContext const& sc);

private:
int numberOfDevices_ = 0;
std::vector<int> devices_;
};

CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
// make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
edm::Service<CUDAService> cudaService;
if(!cudaService->enabled())
return;
numberOfDevices_ = cudaService->numberOfDevices();
devices_ = cudaService->devices();

if(config.getUntrackedParameter<bool>("memoryConstruction")) {
registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
Expand Down Expand Up @@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
// activity handlers
namespace {
template <typename T>
void dumpUsedMemory(T& log, int num) {
void dumpUsedMemory(T& log, std::vector<int> const& devices) {
int old = 0;
cudaCheck(cudaGetDevice(&old));
for(int i = 0; i < num; ++i) {
for(int i: devices) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
Expand All @@ -82,19 +82,19 @@ namespace {
void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
auto log = edm::LogPrint("CUDAMonitoringService");
log << "CUDA device memory after event";
dumpUsedMemory(log, numberOfDevices_);
dumpUsedMemory(log, devices_);
}

DEFINE_FWK_SERVICE(CUDAMonitoringService);
38 changes: 22 additions & 16 deletions HeterogeneousCore/CUDAServices/src/CUDAService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

#include "CachingDeviceAllocator.h"
#include "CachingHostAllocator.h"
Expand Down Expand Up @@ -94,10 +95,10 @@ namespace {
}
}

void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
void devicePreallocate(CUDAService& cs, const std::vector<unsigned int>& bufferSizes) {
int device;
cudaCheck(cudaGetDevice(&device));
for(int i=0; i<numberOfDevices; ++i) {
for (int i : cs.devices()) {
cudaCheck(cudaSetDevice(i));
preallocate<cudautils::device::unique_ptr>([&](size_t size, cuda::stream_t<>& stream) {
return cs.make_device_unique<char[]>(size, stream);
Expand All @@ -121,14 +122,23 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
return;
}

auto status = cudaGetDeviceCount(&numberOfDevices_);
if (cudaSuccess != status) {
auto supportedDevices = supportedCUDADevices();
numberOfDevices_ = supportedDevices.size();
if (numberOfDevices_ == 0) {
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
return;
}
edm::LogInfo log("CUDAService");
computeCapabilities_.reserve(numberOfDevices_);
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";

int lastDevice = supportedDevices.rbegin()->first;
supportedDevices_.reserve(numberOfDevices_);
computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0));
for (auto const& device_caps: supportedDevices) {
int device = device_caps.first;
supportedDevices_.push_back(device);
computeCapabilities_[device] = device_caps.second;
}

auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
Expand All @@ -137,7 +147,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");

for (int i = 0; i < numberOfDevices_; ++i) {
for (int i: supportedDevices_) {
// read information about the compute device.
// see the documentation of cudaGetDeviceProperties() for more information.
cudaDeviceProp properties;
Expand All @@ -146,9 +156,8 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&

// compute capabilities
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
computeCapabilities_.emplace_back(properties.major, properties.minor);
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";

// compute mode
Expand Down Expand Up @@ -291,7 +300,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
size_t minCachedBytes = std::numeric_limits<size_t>::max();
int currentDevice;
cudaCheck(cudaGetDevice(&currentDevice));
for (int i = 0; i < numberOfDevices_; ++i) {
for (int i: supportedDevices_) {
size_t freeMemory, totalMemory;
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
Expand Down Expand Up @@ -340,7 +349,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
enabled_ = true;

// Preallocate buffers if asked to
devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
devicePreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
hostPreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
}

Expand All @@ -353,7 +362,7 @@ CUDAService::~CUDAService() {
cudaEventCache_.reset();
cudaStreamCache_.reset();

for (int i = 0; i < numberOfDevices_; ++i) {
for (int i: supportedDevices_) {
cudaCheck(cudaSetDevice(i));
cudaCheck(cudaDeviceSynchronize());
// Explicitly destroys and cleans up all resources associated with the current device in the
Expand Down Expand Up @@ -398,7 +407,7 @@ int CUDAService::deviceWithMostFreeMemory() const {

size_t maxFreeMemory = 0;
int device = -1;
for(int i = 0; i < numberOfDevices_; ++i) {
for (int i: supportedDevices_) {
/*
// TODO: understand why the api-wrappers version gives same value for all devices
auto device = cuda::device::get(i);
Expand Down Expand Up @@ -432,9 +441,6 @@ struct CUDAService::Allocator {
template <typename ...Args>
Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}

void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
fwyzard marked this conversation as resolved.
Show resolved Hide resolved
void hostPreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);

size_t maxAllocation;
notcub::CachingDeviceAllocator deviceAllocator;
notcub::CachingHostAllocator hostAllocator;
Expand Down
11 changes: 5 additions & 6 deletions HeterogeneousCore/CUDAServices/test/testCUDAService.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
#include "FWCore/Utilities/interface/Exception.h"
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

namespace {
CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
Expand All @@ -29,13 +30,10 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {

// Test setup: check if a simple CUDA runtime API call fails:
// if so, skip the test with the CUDAService enabled
int deviceCount = 0;
auto ret = cudaGetDeviceCount( &deviceCount );
int deviceCount = supportedCUDADevices().size();

if( ret != cudaSuccess ) {
WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
<< ret << ") " << cudaGetErrorString( ret )
<< ". Running only tests not requiring devices.");
if (deviceCount == 0) {
WARN("No supported CUDA devices available. Running only tests not requiring devices.");
Copy link

@makortel makortel Mar 14, 2019

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this be changed to use exitSansCUDADevices()? Or, should that be changed to use supportedCudaDevices() as well?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Good point.

Yes, if we do not build for any of the available devices, we should skip those tests as well.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What about using the exitSansCUDADevices() here? Or do you prefer to return to that later?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I am not very familiar with how the test harness works, but if I understand the intent, you wanted to run part of the tests even if there are no available devices:

    SECTION("Enabled only if there are CUDA capable GPUs") {
      auto cs = makeCUDAService(ps, ar);
      if(deviceCount <= 0) {
        REQUIRE(cs.enabled() == false);
        WARN("CUDAService is disabled as there are no CUDA GPU devices");
      }
      else {
        REQUIRE(cs.enabled() == true);
        INFO("CUDAService is enabled");
      }
    }

If we call exitSansCUDADevices() the test would never reach that part.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Right, so I was not able to read my own code.

}

SECTION("CUDAService enabled") {
Expand All @@ -58,6 +56,7 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
}

auto cs = makeCUDAService(ps, ar);
cudaError_t ret;

SECTION("CUDA Queries") {
int driverVersion = 0, runtimeVersion = 0;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
#define HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h

#include <map>

// Probe all visible CUDA devices and return, for each device on which a
// trivial kernel from this build can actually run, a map entry from the
// CUDA device id to its (major, minor) compute capability.
// Devices whose architecture is not covered by the compilation flags are
// omitted; an empty map means no usable device (or the CUDA runtime
// failed to initialise).
// If `reset` is true (the default), cudaDeviceReset() is called on each
// probed device afterwards, releasing the context created by the probe.
std::map<int, std::pair<int, int>> supportedCUDADevices(bool reset = true);

#endif // HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
6 changes: 6 additions & 0 deletions HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

void exitSansCUDADevices() {
int devices = 0;
Expand All @@ -16,4 +17,9 @@ void exitSansCUDADevices() {
std::cerr << "No CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
int supported = supportedCUDADevices().size();
if (supported == 0) {
std::cerr << "No supported CUDA devices available, the test will be skipped." << "\n";
exit(EXIT_SUCCESS);
}
}
45 changes: 45 additions & 0 deletions HeterogeneousCore/CUDAUtilities/src/supportedCUDADevices.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#include <map>

#include <cuda_runtime.h>

#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"

// Trivial probe kernel: simply sets *result to true.
// It is used to detect whether the current device can execute code from this
// build at all — the launch fails (and the flag stays false) on devices for
// which no compatible binary was compiled.
__global__
void isSupported(bool * result) {
  * result = true;
}

// Enumerate all CUDA devices and keep only those able to run the isSupported
// probe kernel; returns device id -> (major, minor) compute capability.
std::map<int, std::pair<int, int>> supportedCUDADevices(bool reset) {
  std::map<int, std::pair<int, int>> capabilities;

  int devices = 0;
  auto status = cudaGetDeviceCount(&devices);
  if (cudaSuccess != status) {
    // CUDA runtime not usable (no driver, no device, ...): report no devices
    return capabilities;
  }

  for (int i = 0; i < devices; ++i) {
    cudaCheck(cudaSetDevice(i));
    bool supported = false;
    bool * supported_d;
    // device-side flag, zeroed so it stays false if the kernel never runs
    cudaCheck(cudaMalloc(&supported_d, sizeof(bool)));
    cudaCheck(cudaMemset(supported_d, 0x00, sizeof(bool)));
    isSupported<<<1,1>>>(supported_d);
    // swallow any eventual error from launching the kernel on an unsupported device
    cudaGetLastError();
    cudaCheck(cudaDeviceSynchronize());
    cudaCheck(cudaMemcpy(& supported, supported_d, sizeof(bool), cudaMemcpyDeviceToHost));
    cudaCheck(cudaFree(supported_d));
    if (supported) {
      // the kernel ran, so this device is supported: record its compute capability
      cudaDeviceProp properties;
      cudaCheck(cudaGetDeviceProperties(&properties, i));
      capabilities[i] = std::make_pair(properties.major, properties.minor);
    }
    if (reset) {
      // tear down the context created by the probe on this device
      cudaCheck(cudaDeviceReset());
    }
  }

  return capabilities;
}