Skip to content

Commit 8deb1d9

Browse files
committed
Use only CUDA devices with a supported architecture
For each available CUDA device, check whether its architecture is supported by running a simple kernel. This makes it possible to restrict execution to only the supported devices: - at configuration time, in the SwitchProducerCUDA, via the cudaIsEnabled test; - at run time, in the CUDAService and its clients; - when running tests, via exitSansCUDADevices. Includes Matti's fix for the sizes of the stream and event caches (#289).
1 parent 07607a4 commit 8deb1d9

File tree

11 files changed

+97
-66
lines changed

11 files changed

+97
-66
lines changed

HeterogeneousCore/CUDACore/src/GPUCuda.cc

+3-8
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h"
2+
#include "chooseCUDADevice.h"
23

34
#include "FWCore/MessageLogger/interface/MessageLogger.h"
45
#include "FWCore/ServiceRegistry/interface/Service.h"
@@ -34,20 +35,14 @@ namespace heterogeneous {
3435
return;
3536
}
3637

37-
// For startes we "statically" assign the device based on
38-
// edm::Stream number. This is suboptimal if the number of
39-
// edm::Streams is not a multiple of the number of CUDA devices
40-
// (and even then there is no load balancing).
41-
//
42-
// TODO: improve. Possible ideas include
38+
// TODO: possible ideas to improve the "assignment" logic include
4339
// - allocate M (< N(edm::Streams)) buffers per device per module, choose dynamically which (buffer, device) to use
4440
// * the first module of a chain dictates the device for the rest of the chain
4541
// - our own CUDA memory allocator
4642
// * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
4743
// * would probably still need some buffer space/device to hold e.g. conditions data
4844
// - for conditions, how to handle multiple lumis per job?
49-
deviceId_ = id % cudaService->numberOfDevices();
50-
45+
deviceId_ = cudacore::chooseCUDADevice(id);
5146
cuda::device::current::scoped_override_t<> setDeviceForThisScope(deviceId_);
5247

5348
// Create the CUDA stream for this module-edm::Stream pair

HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc

+1-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,6 @@ namespace cudacore {
1313
// (and even then there is no load balancing).
1414
//
1515
// TODO: improve the "assignment" logic
16-
return id % cudaService->numberOfDevices();
16+
return cudaService->devices()[id % cudaService->numberOfDevices()];
1717
}
1818
}

HeterogeneousCore/CUDAServices/bin/BuildFile.xml

+1
Original file line numberDiff line numberDiff line change
@@ -4,4 +4,5 @@
44

55
<bin name="cudaIsEnabled" file="cudaIsEnabled.cpp">
66
<use name="cuda"/>
7+
<use name="HeterogeneousCore/CUDAUtilities"/>
78
</bin>
Original file line numberDiff line numberDiff line change
@@ -1,31 +1,7 @@
1-
#include <algorithm>
2-
#include <array>
31
#include <cstdlib>
4-
#include <iostream>
52

6-
#include <cuda_runtime.h>
3+
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
74

85
int main() {
9-
int devices = 0;
10-
auto status = cudaGetDeviceCount(& devices);
11-
if (status != cudaSuccess) {
12-
return EXIT_FAILURE;
13-
}
14-
15-
int minimumMajor = 6; // min minor is implicitly 0
16-
17-
// This approach (requiring all devices are supported) is rather
18-
// conservative. In principle we could consider just dropping the
19-
// unsupported devices. Currently that would be easiest to achieve
20-
// in CUDAService though.
21-
for (int i = 0; i < devices; ++i) {
22-
cudaDeviceProp properties;
23-
cudaGetDeviceProperties(&properties, i);
24-
25-
if(properties.major < minimumMajor) {
26-
return EXIT_FAILURE;
27-
}
28-
}
29-
30-
return EXIT_SUCCESS;
6+
return supportedCUDADevices().empty() ? EXIT_FAILURE : EXIT_SUCCESS;
317
}

HeterogeneousCore/CUDAServices/interface/CUDAService.h

+4
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,9 @@ class CUDAService {
5252

5353
int numberOfDevices() const { return numberOfDevices_; }
5454

55+
// devices supported by the CUDA configuration and compilation flags
56+
std::vector<int> const& devices() const { return supportedDevices_; }
57+
5558
// major, minor
5659
std::pair<int, int> computeCapability(int device) { return computeCapabilities_.at(device); }
5760

@@ -152,6 +155,7 @@ class CUDAService {
152155
std::unique_ptr<CUDAEventCache> cudaEventCache_;
153156

154157
int numberOfDevices_ = 0;
158+
std::vector<int> supportedDevices_;
155159
std::vector<std::pair<int, int>> computeCapabilities_;
156160
bool enabled_ = false;
157161
};

HeterogeneousCore/CUDAServices/plugins/CUDAMonitoringService.cc

+7-7
Original file line numberDiff line numberDiff line change
@@ -30,15 +30,15 @@ class CUDAMonitoringService {
3030
void postEvent(edm::StreamContext const& sc);
3131

3232
private:
33-
int numberOfDevices_ = 0;
33+
std::vector<int> devices_;
3434
};
3535

3636
CUDAMonitoringService::CUDAMonitoringService(edm::ParameterSet const& config, edm::ActivityRegistry& registry) {
3737
// make sure that CUDA is initialised, and that the CUDAService destructor is called after this service's destructor
3838
edm::Service<CUDAService> cudaService;
3939
if(!cudaService->enabled())
4040
return;
41-
numberOfDevices_ = cudaService->numberOfDevices();
41+
devices_ = cudaService->devices();
4242

4343
if(config.getUntrackedParameter<bool>("memoryConstruction")) {
4444
registry.watchPostModuleConstruction(this, &CUDAMonitoringService::postModuleConstruction);
@@ -66,10 +66,10 @@ void CUDAMonitoringService::fillDescriptions(edm::ConfigurationDescriptions & de
6666
// activity handlers
6767
namespace {
6868
template <typename T>
69-
void dumpUsedMemory(T& log, int num) {
69+
void dumpUsedMemory(T& log, std::vector<int> const& devices) {
7070
int old = 0;
7171
cudaCheck(cudaGetDevice(&old));
72-
for(int i = 0; i < num; ++i) {
72+
for(int i: devices) {
7373
size_t freeMemory, totalMemory;
7474
cudaCheck(cudaSetDevice(i));
7575
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -82,19 +82,19 @@ namespace {
8282
void CUDAMonitoringService::postModuleConstruction(edm::ModuleDescription const& desc) {
8383
auto log = edm::LogPrint("CUDAMonitoringService");
8484
log << "CUDA device memory after construction of " << desc.moduleLabel() << " (" << desc.moduleName() << ")";
85-
dumpUsedMemory(log, numberOfDevices_);
85+
dumpUsedMemory(log, devices_);
8686
}
8787

8888
void CUDAMonitoringService::postModuleBeginStream(edm::StreamContext const&, edm::ModuleCallingContext const& mcc) {
8989
auto log = edm::LogPrint("CUDAMonitoringService");
9090
log<< "CUDA device memory after beginStream() of " << mcc.moduleDescription()->moduleLabel() << " (" << mcc.moduleDescription()->moduleName() << ")";
91-
dumpUsedMemory(log, numberOfDevices_);
91+
dumpUsedMemory(log, devices_);
9292
}
9393

9494
void CUDAMonitoringService::postEvent(edm::StreamContext const& sc) {
9595
auto log = edm::LogPrint("CUDAMonitoringService");
9696
log << "CUDA device memory after event";
97-
dumpUsedMemory(log, numberOfDevices_);
97+
dumpUsedMemory(log, devices_);
9898
}
9999

100100
DEFINE_FWK_SERVICE(CUDAMonitoringService);

HeterogeneousCore/CUDAServices/src/CUDAService.cc

+18-18
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@
1212
#include "FWCore/Utilities/interface/ReusableObjectHolder.h"
1313
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
1414
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
15+
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
1516

1617
#include "CachingDeviceAllocator.h"
1718
#include "CachingHostAllocator.h"
@@ -94,10 +95,10 @@ namespace {
9495
}
9596
}
9697

97-
void devicePreallocate(CUDAService& cs, int numberOfDevices, const std::vector<unsigned int>& bufferSizes) {
98+
void devicePreallocate(CUDAService& cs, const std::vector<unsigned int>& bufferSizes) {
9899
int device;
99100
cudaCheck(cudaGetDevice(&device));
100-
for(int i=0; i<numberOfDevices; ++i) {
101+
for (int i : cs.devices()) {
101102
cudaCheck(cudaSetDevice(i));
102103
preallocate<cudautils::device::unique_ptr>([&](size_t size, cuda::stream_t<>& stream) {
103104
return cs.make_device_unique<char[]>(size, stream);
@@ -121,14 +122,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
121122
return;
122123
}
123124

124-
auto status = cudaGetDeviceCount(&numberOfDevices_);
125-
if (cudaSuccess != status) {
125+
supportedDevices_ = supportedCUDADevices();
126+
numberOfDevices_ = supportedDevices_.size();
127+
if (numberOfDevices_ == 0) {
126128
edm::LogWarning("CUDAService") << "Failed to initialize the CUDA runtime.\n" << "Disabling the CUDAService.";
127129
return;
128130
}
129131
edm::LogInfo log("CUDAService");
130-
computeCapabilities_.reserve(numberOfDevices_);
131-
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n";
132+
log << "CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n";
132133

133134
auto const& limits = config.getUntrackedParameter<edm::ParameterSet>("limits");
134135
auto printfFifoSize = limits.getUntrackedParameter<int>("cudaLimitPrintfFifoSize");
@@ -137,18 +138,20 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
137138
auto devRuntimeSyncDepth = limits.getUntrackedParameter<int>("cudaLimitDevRuntimeSyncDepth");
138139
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter<int>("cudaLimitDevRuntimePendingLaunchCount");
139140

140-
for (int i = 0; i < numberOfDevices_; ++i) {
141+
int lastDevice = supportedDevices_.back();
142+
computeCapabilities_.resize(lastDevice + 1, std::make_pair(0, 0));
143+
for (int i: supportedDevices_) {
141144
// read information about the compute device.
142145
// see the documentation of cudaGetDeviceProperties() for more information.
143146
cudaDeviceProp properties;
144147
cudaCheck(cudaGetDeviceProperties(&properties, i));
145148
log << "CUDA device " << i << ": " << properties.name << '\n';
146149

147150
// compute capabilities
151+
computeCapabilities_[i] = std::make_pair(properties.major, properties.minor);
148152
log << " compute capability: " << properties.major << "." << properties.minor << " (sm_" << properties.major << properties.minor << ")\n";
149-
computeCapabilities_.emplace_back(properties.major, properties.minor);
150153
log << " streaming multiprocessors: " << std::setw(13) << properties.multiProcessorCount << '\n';
151-
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor ) << '\n';
154+
log << " CUDA cores: " << std::setw(28) << properties.multiProcessorCount * getCudaCoresPerSM(properties.major, properties.minor) << '\n';
152155
log << " single to double performance: " << std::setw(8) << properties.singleToDoublePrecisionPerfRatio << ":1\n";
153156

154157
// compute mode
@@ -291,7 +294,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
291294
size_t minCachedBytes = std::numeric_limits<size_t>::max();
292295
int currentDevice;
293296
cudaCheck(cudaGetDevice(&currentDevice));
294-
for (int i = 0; i < numberOfDevices_; ++i) {
297+
for (int i: supportedDevices_) {
295298
size_t freeMemory, totalMemory;
296299
cudaCheck(cudaSetDevice(i));
297300
cudaCheck(cudaMemGetInfo(&freeMemory, &totalMemory));
@@ -331,16 +334,16 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
331334
log << "cub::CachingDeviceAllocator disabled\n";
332335
}
333336

334-
cudaStreamCache_ = std::make_unique<CUDAStreamCache>(numberOfDevices_);
335-
cudaEventCache_ = std::make_unique<CUDAEventCache>(numberOfDevices_);
337+
cudaStreamCache_ = std::make_unique<CUDAStreamCache>(lastDevice+1);
338+
cudaEventCache_ = std::make_unique<CUDAEventCache>(lastDevice+1);
336339

337340
log << "\n";
338341

339342
log << "CUDAService fully initialized";
340343
enabled_ = true;
341344

342345
// Preallocate buffers if asked to
343-
devicePreallocate(*this, numberOfDevices_, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
346+
devicePreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("devicePreallocate"));
344347
hostPreallocate(*this, allocator.getUntrackedParameter<std::vector<unsigned int> >("hostPreallocate"));
345348
}
346349

@@ -353,7 +356,7 @@ CUDAService::~CUDAService() {
353356
cudaEventCache_.reset();
354357
cudaStreamCache_.reset();
355358

356-
for (int i = 0; i < numberOfDevices_; ++i) {
359+
for (int i: supportedDevices_) {
357360
cudaCheck(cudaSetDevice(i));
358361
cudaCheck(cudaDeviceSynchronize());
359362
// Explicitly destroys and cleans up all resources associated with the current device in the
@@ -398,7 +401,7 @@ int CUDAService::deviceWithMostFreeMemory() const {
398401

399402
size_t maxFreeMemory = 0;
400403
int device = -1;
401-
for(int i = 0; i < numberOfDevices_; ++i) {
404+
for (int i: supportedDevices_) {
402405
/*
403406
// TODO: understand why the api-wrappers version gives same value for all devices
404407
auto device = cuda::device::get(i);
@@ -432,9 +435,6 @@ struct CUDAService::Allocator {
432435
template <typename ...Args>
433436
Allocator(size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}
434437

435-
void devicePreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
436-
void hostPreallocate(int numberOfDevices, const std::vector<unsigned int>& bytes);
437-
438438
size_t maxAllocation;
439439
notcub::CachingDeviceAllocator deviceAllocator;
440440
notcub::CachingHostAllocator hostAllocator;

HeterogeneousCore/CUDAServices/test/testCUDAService.cpp

+5-6
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
#include "FWCore/ServiceRegistry/interface/ActivityRegistry.h"
1515
#include "FWCore/Utilities/interface/Exception.h"
1616
#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
17+
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
1718

1819
namespace {
1920
CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) {
@@ -29,13 +30,10 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
2930

3031
// Test setup: check if a simple CUDA runtime API call fails:
3132
// if so, skip the test with the CUDAService enabled
32-
int deviceCount = 0;
33-
auto ret = cudaGetDeviceCount( &deviceCount );
33+
int deviceCount = supportedCUDADevices().size();
3434

35-
if( ret != cudaSuccess ) {
36-
WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
37-
<< ret << ") " << cudaGetErrorString( ret )
38-
<< ". Running only tests not requiring devices.");
35+
if (deviceCount == 0) {
36+
WARN("No supported CUDA devices available. Running only tests not requiring devices.");
3937
}
4038

4139
SECTION("CUDAService enabled") {
@@ -58,6 +56,7 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") {
5856
}
5957

6058
auto cs = makeCUDAService(ps, ar);
59+
cudaError_t ret;
6160

6261
SECTION("CUDA Queries") {
6362
int driverVersion = 0, runtimeVersion = 0;
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#ifndef HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
2+
#define HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h
3+
4+
#include <vector>
5+
6+
std::vector<int> supportedCUDADevices();
7+
8+
#endif // HeterogeneousCore_CUDAUtilities_interface_supportedCUDADevices_h

HeterogeneousCore/CUDAUtilities/src/exitSansCUDADevices.cc

+6
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@
44
#include <cuda_runtime.h>
55

66
#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
7+
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
78

89
void exitSansCUDADevices() {
910
int devices = 0;
@@ -16,4 +17,9 @@ void exitSansCUDADevices() {
1617
std::cerr << "No CUDA devices available, the test will be skipped." << "\n";
1718
exit(EXIT_SUCCESS);
1819
}
20+
int supported = supportedCUDADevices().size();
21+
if (supported == 0) {
22+
std::cerr << "No supported CUDA devices available, the test will be skipped." << "\n";
23+
exit(EXIT_SUCCESS);
24+
}
1925
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
#include <vector>
2+
3+
#include <cuda_runtime.h>
4+
5+
#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
6+
#include "HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
7+
8+
// Trivial probe kernel: its only purpose is to prove that a kernel built into
// this binary can actually execute on the current device. It is launched with
// <<<1,1>>> and simply raises the flag it is given.
__global__ void isSupported(bool* result) {
  *result = true;
}
12+
13+
// Return the ordinals of the CUDA devices on which a kernel from this binary
// can actually run. Each visible device is probed by launching the trivial
// isSupported kernel; devices where the launch does not take effect (e.g. an
// architecture this binary was not compiled for) are left out of the result.
// Returns an empty vector when the runtime reports an error or no devices.
std::vector<int> supportedCUDADevices() {
  // Ask the runtime how many devices it can see; a failure here or an empty
  // machine both mean "nothing supported".
  int deviceCount = 0;
  if (cudaGetDeviceCount(&deviceCount) != cudaSuccess or deviceCount == 0) {
    return {};
  }

  std::vector<int> result;
  result.reserve(deviceCount);

  for (int device = 0; device < deviceCount; ++device) {
    cudaCheck(cudaSetDevice(device));

    // Allocate a one-byte flag on the device, clear it, then try to run the
    // probe kernel that sets it. On an unsupported architecture the launch
    // fails, the flag stays false, and the device is not listed.
    bool* flag_d = nullptr;
    cudaCheck(cudaMalloc(&flag_d, sizeof(bool)));
    cudaCheck(cudaMemset(flag_d, 0x00, sizeof(bool)));
    isSupported<<<1, 1>>>(flag_d);
    // swallow any eventual error from launching the kernel on an unsupported device
    cudaGetLastError();
    // NOTE(review): on some runtime versions cudaDeviceSynchronize() may itself
    // report the launch failure swallowed above — confirm cudaCheck tolerates
    // that on machines with an unsupported device.
    cudaCheck(cudaDeviceSynchronize());

    bool flag = false;
    cudaCheck(cudaMemcpy(&flag, flag_d, sizeof(bool), cudaMemcpyDeviceToHost));
    cudaCheck(cudaFree(flag_d));
    if (flag) {
      result.push_back(device);
    }

    // Tear down the context created by the probe so later users of this
    // device start from a clean state.
    cudaCheck(cudaDeviceReset());
  }

  return result;
}

0 commit comments

Comments
 (0)