From 388a88d6427fa6563566e510126ad5cc2067986d Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 21 May 2020 20:21:09 -0500 Subject: [PATCH 1/5] [cudauvm] Move SiPixelFedCalingMapGPUWrapper to managed memory --- .../SiPixelFedCablingMapGPUWrapper.cc | 61 ++++++++----------- .../SiPixelFedCablingMapGPUWrapper.h | 26 ++------ .../SiPixelRawToClusterCUDA.cc | 4 +- 3 files changed, 32 insertions(+), 59 deletions(-) diff --git a/src/cudauvm/CondFormats/SiPixelFedCablingMapGPUWrapper.cc b/src/cudauvm/CondFormats/SiPixelFedCablingMapGPUWrapper.cc index 56a3dc7ea..22cfeb41c 100644 --- a/src/cudauvm/CondFormats/SiPixelFedCablingMapGPUWrapper.cc +++ b/src/cudauvm/CondFormats/SiPixelFedCablingMapGPUWrapper.cc @@ -9,46 +9,33 @@ // CMSSW includes #include "CUDACore/cudaCheck.h" -#include "CUDACore/device_unique_ptr.h" -#include "CUDACore/host_unique_ptr.h" +#include "CUDACore/deviceCount.h" +#include "CUDACore/ScopedSetDevice.h" +#include "CUDACore/StreamCache.h" #include "CondFormats/SiPixelFedCablingMapGPUWrapper.h" SiPixelFedCablingMapGPUWrapper::SiPixelFedCablingMapGPUWrapper(SiPixelFedCablingMapGPU const& cablingMap, - std::vector modToUnp) - : modToUnpDefault(modToUnp.size()), hasQuality_(true) { - cudaCheck(cudaMallocHost(&cablingMapHost, sizeof(SiPixelFedCablingMapGPU))); - std::memcpy(cablingMapHost, &cablingMap, sizeof(SiPixelFedCablingMapGPU)); - - std::copy(modToUnp.begin(), modToUnp.end(), modToUnpDefault.begin()); + std::vector const& modToUnp) + : hasQuality_(true) { + cudaCheck(cudaMallocManaged(&cablingMap_, sizeof(SiPixelFedCablingMapGPU))); + *cablingMap_ = cablingMap; + cudaCheck(cudaMallocManaged(&modToUnpDefault_, modToUnp.size())); + std::copy(modToUnp.begin(), modToUnp.end(), modToUnpDefault_); + for (int device = 0, ndev = cms::cuda::deviceCount(); device < ndev; ++device) { +#ifndef CUDAUVM_DISABLE_ADVICE + cudaCheck(cudaMemAdvise(cablingMap_, sizeof(SiPixelFedCablingMapGPU), cudaMemAdviseSetReadMostly, device)); + cudaCheck(cudaMemAdvise(modToUnpDefault_, sizeof(modToUnp.size()), cudaMemAdviseSetReadMostly, device)); +#endif +#ifndef CUDAUVM_DISABLE_PREFETCH + cms::cuda::ScopedSetDevice guard{device}; + auto stream = cms::cuda::getStreamCache().get(); + cudaCheck(cudaMemPrefetchAsync(cablingMap_, sizeof(SiPixelFedCablingMapGPU), device, stream.get())); + cudaCheck(cudaMemPrefetchAsync(modToUnpDefault_, modToUnp.size(), device, stream.get())); +#endif + } } -SiPixelFedCablingMapGPUWrapper::~SiPixelFedCablingMapGPUWrapper() { cudaCheck(cudaFreeHost(cablingMapHost)); } - -const SiPixelFedCablingMapGPU* SiPixelFedCablingMapGPUWrapper::getGPUProductAsync(cudaStream_t cudaStream) const { - const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) { - // allocate - cudaCheck(cudaMalloc(&data.cablingMapDevice, sizeof(SiPixelFedCablingMapGPU))); - - // transfer - cudaCheck(cudaMemcpyAsync( - data.cablingMapDevice, this->cablingMapHost, sizeof(SiPixelFedCablingMapGPU), cudaMemcpyDefault, stream)); - }); - return data.cablingMapDevice; +SiPixelFedCablingMapGPUWrapper::~SiPixelFedCablingMapGPUWrapper() { + cudaCheck(cudaFree(cablingMap_)); + cudaCheck(cudaFree(modToUnpDefault_)); } - -const unsigned char* SiPixelFedCablingMapGPUWrapper::getModToUnpAllAsync(cudaStream_t cudaStream) const { - const auto& data = - modToUnp_.dataForCurrentDeviceAsync(cudaStream, [this](ModulesToUnpack& data, cudaStream_t stream) { - cudaCheck(cudaMalloc((void**)&data.modToUnpDefault, pixelgpudetails::MAX_SIZE_BYTE_BOOL)); - cudaCheck(cudaMemcpyAsync(data.modToUnpDefault, - this->modToUnpDefault.data(), - this->modToUnpDefault.size() * sizeof(unsigned char), - cudaMemcpyDefault, - stream)); - }); - return data.modToUnpDefault; -} - -SiPixelFedCablingMapGPUWrapper::GPUData::~GPUData() { cudaCheck(cudaFree(cablingMapDevice)); } - -SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::~ModulesToUnpack() { cudaCheck(cudaFree(modToUnpDefault)); } diff --git a/src/cudauvm/CondFormats/SiPixelFedCablingMapGPUWrapper.h b/src/cudauvm/CondFormats/SiPixelFedCablingMapGPUWrapper.h index 85477e87a..66f3e1829 100644 --- a/src/cudauvm/CondFormats/SiPixelFedCablingMapGPUWrapper.h +++ b/src/cudauvm/CondFormats/SiPixelFedCablingMapGPUWrapper.h @@ -2,8 +2,6 @@ #define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h #include "CUDACore/ESProduct.h" -#include "CUDACore/CUDAHostAllocator.h" -#include "CUDACore/device_unique_ptr.h" #include "CondFormats/SiPixelFedCablingMapGPU.h" #include @@ -12,35 +10,23 @@ class SiPixelFedCablingMapGPUWrapper { public: - explicit SiPixelFedCablingMapGPUWrapper(SiPixelFedCablingMapGPU const &cablingMap, - std::vector modToUnp); + explicit SiPixelFedCablingMapGPUWrapper(SiPixelFedCablingMapGPU const& cablingMap, + std::vector const& modToUnp); ~SiPixelFedCablingMapGPUWrapper(); bool hasQuality() const { return hasQuality_; } // returns pointer to GPU memory - const SiPixelFedCablingMapGPU *getGPUProductAsync(cudaStream_t cudaStream) const; + const SiPixelFedCablingMapGPU* cablingMap() const { return cablingMap_; } // returns pointer to GPU memory - const unsigned char *getModToUnpAllAsync(cudaStream_t cudaStream) const; + const unsigned char* modToUnpAll() const { return modToUnpDefault_; } private: - std::vector> modToUnpDefault; bool hasQuality_; - SiPixelFedCablingMapGPU *cablingMapHost = nullptr; // pointer to struct in CPU - - struct GPUData { - ~GPUData(); - SiPixelFedCablingMapGPU *cablingMapDevice = nullptr; // pointer to struct in GPU - }; - cms::cuda::ESProduct gpuData_; - - struct ModulesToUnpack { - ~ModulesToUnpack(); - unsigned char *modToUnpDefault = nullptr; // pointer to GPU - }; - cms::cuda::ESProduct modToUnp_; + SiPixelFedCablingMapGPU* cablingMap_ = nullptr; + unsigned char* modToUnpDefault_ = nullptr; }; #endif diff --git a/src/cudauvm/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudauvm/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc index 207195bda..1d2624341 100644 --- a/src/cudauvm/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc +++ b/src/cudauvm/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc @@ -72,8 +72,8 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, ") differs the one from SiPixelFedCablingMapGPUWrapper. Please fix your configuration."); } // get the GPU product already here so that the async transfer can begin - const auto* gpuMap = hgpuMap.getGPUProductAsync(ctx.stream()); - const unsigned char* gpuModulesToUnpack = hgpuMap.getModToUnpAllAsync(ctx.stream()); + const auto* gpuMap = hgpuMap.cablingMap(); + const unsigned char* gpuModulesToUnpack = hgpuMap.modToUnpAll(); auto const& hgains = iSetup.get(); // get the GPU product already here so that the async transfer can begin From d89109e9c029b59cd4eac18a8d423348a2e56397 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 22 May 2020 13:47:37 -0500 Subject: [PATCH 2/5] [cudauvm] Move SiPixelGainCalibrationForHLTGPU to managed memory --- .../SiPixelGainCalibrationForHLTGPU.cc | 53 +++++++++---------- .../SiPixelGainCalibrationForHLTGPU.h | 15 ++---- .../SiPixelRawToClusterCUDA.cc | 2 +- 3 files changed, 31 insertions(+), 39 deletions(-) diff --git a/src/cudauvm/CondFormats/SiPixelGainCalibrationForHLTGPU.cc b/src/cudauvm/CondFormats/SiPixelGainCalibrationForHLTGPU.cc index 76e64e8f3..c3f222410 100644 --- a/src/cudauvm/CondFormats/SiPixelGainCalibrationForHLTGPU.cc +++ b/src/cudauvm/CondFormats/SiPixelGainCalibrationForHLTGPU.cc @@ -1,38 +1,37 @@ +#include + #include #include "CondFormats/SiPixelGainCalibrationForHLTGPU.h" #include "CondFormats/SiPixelGainForHLTonGPU.h" #include "CUDACore/cudaCheck.h" +#include "CUDACore/deviceCount.h" +#include "CUDACore/ScopedSetDevice.h" +#include "CUDACore/StreamCache.h" SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(SiPixelGainForHLTonGPU const& gain, - std::vector gainData) - : gainData_(std::move(gainData)) { - cudaCheck(cudaMallocHost(&gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU))); - *gainForHLTonHost_ = gain; -} + std::vector const& gainData) { + cudaCheck(cudaMallocManaged(&gainForHLT_, sizeof(SiPixelGainForHLTonGPU))); + *gainForHLT_ = gain; + cudaCheck(cudaMallocManaged(&gainData_, gainData.size())); + gainForHLT_->v_pedestals = gainData_; -SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() { cudaCheck(cudaFreeHost(gainForHLTonHost_)); } - -SiPixelGainCalibrationForHLTGPU::GPUData::~GPUData() { - cudaCheck(cudaFree(gainForHLTonGPU)); - cudaCheck(cudaFree(gainDataOnGPU)); + std::memcpy(gainData_, gainData.data(), gainData.size()); + for (int device = 0, ndev = cms::cuda::deviceCount(); device < ndev; ++device) { +#ifndef CUDAUVM_DISABLE_ADVICE + cudaCheck(cudaMemAdvise(gainForHLT_, sizeof(SiPixelGainForHLTonGPU), cudaMemAdviseSetReadMostly, device)); + cudaCheck(cudaMemAdvise(gainData_, gainData.size(), cudaMemAdviseSetReadMostly, device)); +#endif +#ifndef CUDAUVM_DISABLE_PREFETCH + cms::cuda::ScopedSetDevice guard{device}; + auto stream = cms::cuda::getStreamCache().get(); + cudaCheck(cudaMemPrefetchAsync(gainForHLT_, sizeof(SiPixelGainForHLTonGPU), device, stream.get())); + cudaCheck(cudaMemPrefetchAsync(gainData_, gainData.size(), device, stream.get())); +#endif + } } -const SiPixelGainForHLTonGPU* SiPixelGainCalibrationForHLTGPU::getGPUProductAsync(cudaStream_t cudaStream) const { - const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) { - cudaCheck(cudaMalloc((void**)&data.gainForHLTonGPU, sizeof(SiPixelGainForHLTonGPU))); - cudaCheck(cudaMalloc((void**)&data.gainDataOnGPU, this->gainData_.size())); - // gains.data().data() is used also for non-GPU code, we cannot allocate it on aligned and write-combined memory - cudaCheck( - cudaMemcpyAsync(data.gainDataOnGPU, this->gainData_.data(), this->gainData_.size(), cudaMemcpyDefault, stream)); - - cudaCheck(cudaMemcpyAsync( - data.gainForHLTonGPU, this->gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU), cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync(&(data.gainForHLTonGPU->v_pedestals), - &(data.gainDataOnGPU), - sizeof(SiPixelGainForHLTonGPU_DecodingStructure*), - cudaMemcpyDefault, - stream)); - }); - return data.gainForHLTonGPU; +SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() { + cudaCheck(cudaFree(gainForHLT_)); + cudaCheck(cudaFree(gainData_)); } diff --git a/src/cudauvm/CondFormats/SiPixelGainCalibrationForHLTGPU.h b/src/cudauvm/CondFormats/SiPixelGainCalibrationForHLTGPU.h index e5920a08c..bded88ff0 100644 --- a/src/cudauvm/CondFormats/SiPixelGainCalibrationForHLTGPU.h +++ b/src/cudauvm/CondFormats/SiPixelGainCalibrationForHLTGPU.h @@ -8,21 +8,14 @@ struct SiPixelGainForHLTonGPU_DecodingStructure; class SiPixelGainCalibrationForHLTGPU { public: - explicit SiPixelGainCalibrationForHLTGPU(SiPixelGainForHLTonGPU const &gain, std::vector gainData); + explicit SiPixelGainCalibrationForHLTGPU(SiPixelGainForHLTonGPU const& gain, std::vector const& gainData); ~SiPixelGainCalibrationForHLTGPU(); - const SiPixelGainForHLTonGPU *getGPUProductAsync(cudaStream_t cudaStream) const; - const SiPixelGainForHLTonGPU *getCPUProduct() const { return gainForHLTonHost_; } + const SiPixelGainForHLTonGPU* get() const { return gainForHLT_; } private: - SiPixelGainForHLTonGPU *gainForHLTonHost_ = nullptr; - std::vector gainData_; - struct GPUData { - ~GPUData(); - SiPixelGainForHLTonGPU *gainForHLTonGPU = nullptr; - SiPixelGainForHLTonGPU_DecodingStructure *gainDataOnGPU = nullptr; - }; - cms::cuda::ESProduct gpuData_; + SiPixelGainForHLTonGPU* gainForHLT_ = nullptr; + SiPixelGainForHLTonGPU_DecodingStructure* gainData_ = nullptr; }; #endif // CalibTracker_SiPixelESProducers_interface_SiPixelGainCalibrationForHLTGPU_h diff --git a/src/cudauvm/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc b/src/cudauvm/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc index 1d2624341..71f0a2424 100644 --- a/src/cudauvm/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc +++ b/src/cudauvm/plugin-SiPixelClusterizer/SiPixelRawToClusterCUDA.cc @@ -77,7 +77,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, auto const& hgains = iSetup.get(); // get the GPU product already here so that the async transfer can begin - const auto* gpuGains = hgains.getGPUProductAsync(ctx.stream()); + const auto* gpuGains = hgains.get(); auto const& fedIds_ = iSetup.get().fedIds(); From a4fdf77c04f3b5646e6522193a5708c97598732f Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 26 May 2020 03:57:40 +0200 Subject: [PATCH 3/5] [cudauvm] Move PixelCPEFast to managed memory --- src/cudauvm/CondFormats/PixelCPEFast.cc | 107 ++++++++---------- src/cudauvm/CondFormats/PixelCPEFast.h | 33 ++---- .../SiPixelRecHitCUDA.cc | 4 +- 3 files changed, 55 insertions(+), 89 deletions(-) diff --git a/src/cudauvm/CondFormats/PixelCPEFast.cc b/src/cudauvm/CondFormats/PixelCPEFast.cc index dd79bd389..1b7a6bf17 100644 --- a/src/cudauvm/CondFormats/PixelCPEFast.cc +++ b/src/cudauvm/CondFormats/PixelCPEFast.cc @@ -6,80 +6,65 @@ #include "Geometry/phase1PixelTopology.h" #include "CUDACore/cudaCheck.h" +#include "CUDACore/deviceCount.h" +#include "CUDACore/ScopedSetDevice.h" +#include "CUDACore/StreamCache.h" #include "CondFormats/PixelCPEFast.h" -// Services -// this is needed to get errors from templates - -namespace { - constexpr float micronsToCm = 1.0e-4; -} - //----------------------------------------------------------------------------- //! The constructor. //----------------------------------------------------------------------------- PixelCPEFast::PixelCPEFast(std::string const &path) { + unsigned int ndetParams; + + cudaCheck(cudaMallocManaged(&m_params, sizeof(pixelCPEforGPU::ParamsOnGPU))); + cudaCheck(cudaMallocManaged(&m_commonParams, sizeof(pixelCPEforGPU::CommonParams))); + cudaCheck(cudaMallocManaged(&m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry))); + cudaCheck(cudaMallocManaged(&m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry))); + { std::ifstream in(path, std::ios::binary); in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit); - in.read(reinterpret_cast(&m_commonParamsGPU), sizeof(pixelCPEforGPU::CommonParams)); - unsigned int ndetParams; + in.read(reinterpret_cast(m_commonParams), sizeof(pixelCPEforGPU::CommonParams)); in.read(reinterpret_cast(&ndetParams), sizeof(unsigned int)); - m_detParamsGPU.resize(ndetParams); - in.read(reinterpret_cast(m_detParamsGPU.data()), ndetParams * sizeof(pixelCPEforGPU::DetParams)); - in.read(reinterpret_cast(&m_averageGeometry), sizeof(pixelCPEforGPU::AverageGeometry)); - in.read(reinterpret_cast(&m_layerGeometry), sizeof(pixelCPEforGPU::LayerGeometry)); + cudaCheck(cudaMallocManaged(&m_detParams, ndetParams * sizeof(pixelCPEforGPU::DetParams))); + in.read(reinterpret_cast(m_detParams), ndetParams * sizeof(pixelCPEforGPU::DetParams)); + in.read(reinterpret_cast(m_averageGeometry), sizeof(pixelCPEforGPU::AverageGeometry)); + in.read(reinterpret_cast(m_layerGeometry), sizeof(pixelCPEforGPU::LayerGeometry)); } - cpuData_ = { - &m_commonParamsGPU, - m_detParamsGPU.data(), - &m_layerGeometry, - &m_averageGeometry, - }; -} - -const pixelCPEforGPU::ParamsOnGPU *PixelCPEFast::getGPUProductAsync(cudaStream_t cudaStream) const { - const auto &data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData &data, cudaStream_t stream) { - // and now copy to device... - cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_commonParams, sizeof(pixelCPEforGPU::CommonParams))); - cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_detParams, - this->m_detParamsGPU.size() * sizeof(pixelCPEforGPU::DetParams))); - cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry))); - cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry))); - cudaCheck(cudaMalloc((void **)&data.d_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU))); + m_params->m_commonParams = m_commonParams; + m_params->m_detParams = m_detParams; + m_params->m_layerGeometry = m_layerGeometry; + m_params->m_averageGeometry = m_averageGeometry; - cudaCheck(cudaMemcpyAsync( - data.d_paramsOnGPU, &data.h_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemcpyDefault, stream)); - cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_commonParams, - &this->m_commonParamsGPU, - sizeof(pixelCPEforGPU::CommonParams), - cudaMemcpyDefault, - stream)); - cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_averageGeometry, - &this->m_averageGeometry, - sizeof(pixelCPEforGPU::AverageGeometry), - cudaMemcpyDefault, - stream)); - cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_layerGeometry, - &this->m_layerGeometry, - sizeof(pixelCPEforGPU::LayerGeometry), - cudaMemcpyDefault, - stream)); - cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_detParams, - this->m_detParamsGPU.data(), - this->m_detParamsGPU.size() * sizeof(pixelCPEforGPU::DetParams), - cudaMemcpyDefault, - stream)); - }); - return data.d_paramsOnGPU; + for (int device = 0, ndev = cms::cuda::deviceCount(); device < ndev; ++device) { +#ifndef CUDAUVM_DISABLE_ADVICE + cudaCheck(cudaMemAdvise(m_params, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemAdviseSetReadMostly, device)); + cudaCheck(cudaMemAdvise(m_commonParams, sizeof(pixelCPEforGPU::CommonParams), cudaMemAdviseSetReadMostly, device)); + cudaCheck( + cudaMemAdvise(m_detParams, ndetParams * sizeof(pixelCPEforGPU::DetParams), cudaMemAdviseSetReadMostly, device)); + cudaCheck( + cudaMemAdvise(m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry), cudaMemAdviseSetReadMostly, device)); + cudaCheck( + cudaMemAdvise(m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry), cudaMemAdviseSetReadMostly, device)); +#endif +#ifndef CUDAUVM_DISABLE_PREFETCH + cms::cuda::ScopedSetDevice guard{device}; + auto stream = cms::cuda::getStreamCache().get(); + cudaCheck(cudaMemPrefetchAsync(m_params, sizeof(pixelCPEforGPU::ParamsOnGPU), device, stream.get())); + cudaCheck(cudaMemPrefetchAsync(m_commonParams, sizeof(pixelCPEforGPU::CommonParams), device, stream.get())); + cudaCheck(cudaMemPrefetchAsync(m_detParams, ndetParams * sizeof(pixelCPEforGPU::DetParams), device, stream.get())); + cudaCheck(cudaMemPrefetchAsync(m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry), device, stream.get())); + cudaCheck(cudaMemPrefetchAsync(m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry), device, stream.get())); +#endif + } } -PixelCPEFast::GPUData::~GPUData() { - if (d_paramsOnGPU != nullptr) { - cudaFree((void *)h_paramsOnGPU.m_commonParams); - cudaFree((void *)h_paramsOnGPU.m_detParams); - cudaFree((void *)h_paramsOnGPU.m_averageGeometry); - cudaFree(d_paramsOnGPU); - } +PixelCPEFast::~PixelCPEFast() { + cudaFree(m_params); + cudaFree(m_commonParams); + cudaFree(m_detParams); + cudaFree(m_layerGeometry); + cudaFree(m_averageGeometry); } diff --git a/src/cudauvm/CondFormats/PixelCPEFast.h b/src/cudauvm/CondFormats/PixelCPEFast.h index 16e8f2b68..4856c26fb 100644 --- a/src/cudauvm/CondFormats/PixelCPEFast.h +++ b/src/cudauvm/CondFormats/PixelCPEFast.h @@ -9,35 +9,18 @@ class PixelCPEFast { public: - PixelCPEFast(std::string const &path); + PixelCPEFast(std::string const& path); - ~PixelCPEFast() = default; + ~PixelCPEFast(); - // The return value can only be used safely in kernels launched on - // the same cudaStream, or after cudaStreamSynchronize. - const pixelCPEforGPU::ParamsOnGPU *getGPUProductAsync(cudaStream_t cudaStream) const; - - pixelCPEforGPU::ParamsOnGPU const &getCPUProduct() const { return cpuData_; } + pixelCPEforGPU::ParamsOnGPU const* get() const { return m_params; } private: - // allocate it with posix malloc to be ocmpatible with cpu wf - std::vector m_detParamsGPU; - // std::vector> m_detParamsGPU; - pixelCPEforGPU::CommonParams m_commonParamsGPU; - pixelCPEforGPU::LayerGeometry m_layerGeometry; - pixelCPEforGPU::AverageGeometry m_averageGeometry; - - pixelCPEforGPU::ParamsOnGPU cpuData_; - - struct GPUData { - ~GPUData(); - // not needed if not used on CPU... - pixelCPEforGPU::ParamsOnGPU h_paramsOnGPU; - pixelCPEforGPU::ParamsOnGPU *d_paramsOnGPU = nullptr; // copy of the above on the Device - }; - cms::cuda::ESProduct gpuData_; - - void fillParamsForGpu(); + pixelCPEforGPU::ParamsOnGPU* m_params = nullptr; + pixelCPEforGPU::CommonParams* m_commonParams = nullptr; + pixelCPEforGPU::DetParams* m_detParams = nullptr; + pixelCPEforGPU::LayerGeometry* m_layerGeometry = nullptr; + pixelCPEforGPU::AverageGeometry* m_averageGeometry = nullptr; }; #endif // RecoLocalTracker_SiPixelRecHits_PixelCPEFast_h diff --git a/src/cudauvm/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc b/src/cudauvm/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc index a82e23eab..f1b4bf2cd 100644 --- a/src/cudauvm/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc +++ b/src/cudauvm/plugin-SiPixelRecHits/SiPixelRecHitCUDA.cc @@ -53,9 +53,7 @@ void SiPixelRecHitCUDA::produce(edm::Event& iEvent, const edm::EventSetup& es) { std::cout << "Clusters/Hits Overflow " << nHits << " >= " << TrackingRecHit2DSOAView::maxHits() << std::endl; } - ctx.emplace(iEvent, - tokenHit_, - gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.getGPUProductAsync(ctx.stream()), ctx.stream())); + ctx.emplace(iEvent, tokenHit_, gpuAlgo_.makeHitsAsync(digis, clusters, bs, fcpe.get(), ctx.stream())); } DEFINE_FWK_MODULE(SiPixelRecHitCUDA); From 53f30d90ec3b276b05268c7997abcbc55d206f9a Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 29 May 2020 18:47:08 -0700 Subject: [PATCH 4/5] [cudauvm] Print if cudaMemAdvise or cudaMemPrefetchAsync is disabled --- src/cudauvm/bin/main.cc | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/cudauvm/bin/main.cc b/src/cudauvm/bin/main.cc index 9fc1d4363..214f83443 100644 --- a/src/cudauvm/bin/main.cc +++ b/src/cudauvm/bin/main.cc @@ -88,6 +88,13 @@ int main(int argc, char** argv) { } std::cout << "Found " << numberOfDevices << " devices" << std::endl; +#ifdef CUDAUVM_DISABLE_ADVICE + std::cout << "cudaMemAdvise() calls are disabled" << std::endl; +#endif +#ifdef CUDAUVM_DISABLE_PREFETCH + std::cout << "cudaMemPrefetchAsync() calls are disabled" << std::endl; +#endif + // Initialize EventProcessor std::vector edmodules; std::vector esmodules; From db094b5a7a77a0d1f2cba0841471811a0627da2c Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 29 May 2020 18:47:17 -0700 Subject: [PATCH 5/5] Add cudauvm to run-scan.py --- run-scan.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/run-scan.py b/run-scan.py index f9175cd11..13921f7d9 100755 --- a/run-scan.py +++ b/run-scan.py @@ -12,7 +12,8 @@ n_events_unit = 1000 n_blocks_per_stream = { "fwtest": 1, - "cuda": {"": 400, "transfer": 350} + "cuda": {"": 400, "transfer": 350}, + "cudauvm": {"": 400, "transfer": 350}, } result_re = re.compile("Processed (?P\d+) events in (?P