12
12
#include " FWCore/Utilities/interface/ReusableObjectHolder.h"
13
13
#include " HeterogeneousCore/CUDAServices/interface/CUDAService.h"
14
14
#include " HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
15
+ #include " HeterogeneousCore/CUDAUtilities/interface/supportedCUDADevices.h"
15
16
16
17
#include " CachingDeviceAllocator.h"
17
18
#include " CachingHostAllocator.h"
@@ -94,10 +95,10 @@ namespace {
94
95
}
95
96
}
96
97
97
- void devicePreallocate (CUDAService& cs, int numberOfDevices, const std::vector<unsigned int >& bufferSizes) {
98
+ void devicePreallocate (CUDAService& cs, const std::vector<unsigned int >& bufferSizes) {
98
99
int device;
99
100
cudaCheck (cudaGetDevice (&device));
100
- for (int i= 0 ; i<numberOfDevices; ++i ) {
101
+ for (int i : cs. devices () ) {
101
102
cudaCheck (cudaSetDevice (i));
102
103
preallocate<cudautils::device::unique_ptr>([&](size_t size, cuda::stream_t <>& stream) {
103
104
return cs.make_device_unique <char []>(size, stream);
@@ -121,14 +122,14 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
121
122
return ;
122
123
}
123
124
124
- auto status = cudaGetDeviceCount (&numberOfDevices_);
125
- if (cudaSuccess != status) {
125
+ supportedDevices_ = supportedCUDADevices ();
126
+ numberOfDevices_ = supportedDevices_.size ();
127
+ if (numberOfDevices_ == 0 ) {
126
128
edm::LogWarning (" CUDAService" ) << " Failed to initialize the CUDA runtime.\n " << " Disabling the CUDAService." ;
127
129
return ;
128
130
}
129
131
edm::LogInfo log (" CUDAService" );
130
- computeCapabilities_.reserve (numberOfDevices_);
131
- log << " CUDA runtime successfully initialised, found " << numberOfDevices_ << " compute devices.\n\n " ;
132
+ log << " CUDA runtime successfully initialised, found " << numberOfDevices_ << " supported compute devices.\n\n " ;
132
133
133
134
auto const & limits = config.getUntrackedParameter <edm::ParameterSet>(" limits" );
134
135
auto printfFifoSize = limits.getUntrackedParameter <int >(" cudaLimitPrintfFifoSize" );
@@ -137,18 +138,20 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
137
138
auto devRuntimeSyncDepth = limits.getUntrackedParameter <int >(" cudaLimitDevRuntimeSyncDepth" );
138
139
auto devRuntimePendingLaunchCount = limits.getUntrackedParameter <int >(" cudaLimitDevRuntimePendingLaunchCount" );
139
140
140
- for (int i = 0 ; i < numberOfDevices_; ++i) {
141
+ int lastDevice = supportedDevices_.back ();
142
+ computeCapabilities_.resize (lastDevice + 1 , std::make_pair (0 , 0 ));
143
+ for (int i: supportedDevices_) {
141
144
// read information about the compute device.
142
145
// see the documentation of cudaGetDeviceProperties() for more information.
143
146
cudaDeviceProp properties;
144
147
cudaCheck (cudaGetDeviceProperties (&properties, i));
145
148
log << " CUDA device " << i << " : " << properties.name << ' \n ' ;
146
149
147
150
// compute capabilities
151
+ computeCapabilities_[i] = std::make_pair (properties.major , properties.minor );
148
152
log << " compute capability: " << properties.major << " ." << properties.minor << " (sm_" << properties.major << properties.minor << " )\n " ;
149
- computeCapabilities_.emplace_back (properties.major , properties.minor );
150
153
log << " streaming multiprocessors: " << std::setw (13 ) << properties.multiProcessorCount << ' \n ' ;
151
- log << " CUDA cores: " << std::setw (28 ) << properties.multiProcessorCount * getCudaCoresPerSM (properties.major , properties.minor ) << ' \n ' ;
154
+ log << " CUDA cores: " << std::setw (28 ) << properties.multiProcessorCount * getCudaCoresPerSM (properties.major , properties.minor ) << ' \n ' ;
152
155
log << " single to double performance: " << std::setw (8 ) << properties.singleToDoublePrecisionPerfRatio << " :1\n " ;
153
156
154
157
// compute mode
@@ -291,7 +294,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
291
294
size_t minCachedBytes = std::numeric_limits<size_t >::max ();
292
295
int currentDevice;
293
296
cudaCheck (cudaGetDevice (¤tDevice));
294
- for (int i = 0 ; i < numberOfDevices_; ++i ) {
297
+ for (int i: supportedDevices_ ) {
295
298
size_t freeMemory, totalMemory;
296
299
cudaCheck (cudaSetDevice (i));
297
300
cudaCheck (cudaMemGetInfo (&freeMemory, &totalMemory));
@@ -331,16 +334,16 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry&
331
334
log << " cub::CachingDeviceAllocator disabled\n " ;
332
335
}
333
336
334
- cudaStreamCache_ = std::make_unique<CUDAStreamCache>(numberOfDevices_ );
335
- cudaEventCache_ = std::make_unique<CUDAEventCache>(numberOfDevices_ );
337
+ cudaStreamCache_ = std::make_unique<CUDAStreamCache>(lastDevice+ 1 );
338
+ cudaEventCache_ = std::make_unique<CUDAEventCache>(lastDevice+ 1 );
336
339
337
340
log << " \n " ;
338
341
339
342
log << " CUDAService fully initialized" ;
340
343
enabled_ = true ;
341
344
342
345
// Preallocate buffers if asked to
343
- devicePreallocate (*this , numberOfDevices_, allocator.getUntrackedParameter <std::vector<unsigned int > >(" devicePreallocate" ));
346
+ devicePreallocate (*this , allocator.getUntrackedParameter <std::vector<unsigned int > >(" devicePreallocate" ));
344
347
hostPreallocate (*this , allocator.getUntrackedParameter <std::vector<unsigned int > >(" hostPreallocate" ));
345
348
}
346
349
@@ -353,7 +356,7 @@ CUDAService::~CUDAService() {
353
356
cudaEventCache_.reset ();
354
357
cudaStreamCache_.reset ();
355
358
356
- for (int i = 0 ; i < numberOfDevices_; ++i ) {
359
+ for (int i: supportedDevices_ ) {
357
360
cudaCheck (cudaSetDevice (i));
358
361
cudaCheck (cudaDeviceSynchronize ());
359
362
// Explicitly destroys and cleans up all resources associated with the current device in the
@@ -398,7 +401,7 @@ int CUDAService::deviceWithMostFreeMemory() const {
398
401
399
402
size_t maxFreeMemory = 0 ;
400
403
int device = -1 ;
401
- for (int i = 0 ; i < numberOfDevices_; ++i ) {
404
+ for (int i: supportedDevices_ ) {
402
405
/*
403
406
// TODO: understand why the api-wrappers version gives same value for all devices
404
407
auto device = cuda::device::get(i);
@@ -432,9 +435,6 @@ struct CUDAService::Allocator {
432
435
template <typename ...Args>
433
436
Allocator (size_t max, Args&&... args): maxAllocation(max), deviceAllocator(args...), hostAllocator(std::forward<Args>(args)...) {}
434
437
435
- void devicePreallocate (int numberOfDevices, const std::vector<unsigned int >& bytes);
436
- void hostPreallocate (int numberOfDevices, const std::vector<unsigned int >& bytes);
437
-
438
438
size_t maxAllocation;
439
439
notcub::CachingDeviceAllocator deviceAllocator;
440
440
notcub::CachingHostAllocator hostAllocator;
0 commit comments