Skip to content

[cudauvm] Move conditions to use managed memory #53

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 5 commits into from
Jun 2, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion run-scan.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,8 @@
n_events_unit = 1000
n_blocks_per_stream = {
"fwtest": 1,
"cuda": {"": 400, "transfer": 350}
"cuda": {"": 400, "transfer": 350},
"cudauvm": {"": 400, "transfer": 350},
}

result_re = re.compile("Processed (?P<events>\d+) events in (?P<time>\S+) seconds, throughput (?P<throughput>\S+) events/s")
Expand Down
107 changes: 46 additions & 61 deletions src/cudauvm/CondFormats/PixelCPEFast.cc
Original file line number Diff line number Diff line change
Expand Up @@ -6,80 +6,65 @@

#include "Geometry/phase1PixelTopology.h"
#include "CUDACore/cudaCheck.h"
#include "CUDACore/deviceCount.h"
#include "CUDACore/ScopedSetDevice.h"
#include "CUDACore/StreamCache.h"
#include "CondFormats/PixelCPEFast.h"

// Services
// this is needed to get errors from templates

namespace {
constexpr float micronsToCm = 1.0e-4;
}

//-----------------------------------------------------------------------------
//! The constructor.
//-----------------------------------------------------------------------------
PixelCPEFast::PixelCPEFast(std::string const &path) {
unsigned int ndetParams;

cudaCheck(cudaMallocManaged(&m_params, sizeof(pixelCPEforGPU::ParamsOnGPU)));
cudaCheck(cudaMallocManaged(&m_commonParams, sizeof(pixelCPEforGPU::CommonParams)));
cudaCheck(cudaMallocManaged(&m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry)));
cudaCheck(cudaMallocManaged(&m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry)));

{
std::ifstream in(path, std::ios::binary);
in.exceptions(std::ifstream::badbit | std::ifstream::failbit | std::ifstream::eofbit);
in.read(reinterpret_cast<char *>(&m_commonParamsGPU), sizeof(pixelCPEforGPU::CommonParams));
unsigned int ndetParams;
in.read(reinterpret_cast<char *>(m_commonParams), sizeof(pixelCPEforGPU::CommonParams));
in.read(reinterpret_cast<char *>(&ndetParams), sizeof(unsigned int));
m_detParamsGPU.resize(ndetParams);
in.read(reinterpret_cast<char *>(m_detParamsGPU.data()), ndetParams * sizeof(pixelCPEforGPU::DetParams));
in.read(reinterpret_cast<char *>(&m_averageGeometry), sizeof(pixelCPEforGPU::AverageGeometry));
in.read(reinterpret_cast<char *>(&m_layerGeometry), sizeof(pixelCPEforGPU::LayerGeometry));
cudaCheck(cudaMallocManaged(&m_detParams, ndetParams * sizeof(pixelCPEforGPU::DetParams)));
in.read(reinterpret_cast<char *>(m_detParams), ndetParams * sizeof(pixelCPEforGPU::DetParams));
in.read(reinterpret_cast<char *>(m_averageGeometry), sizeof(pixelCPEforGPU::AverageGeometry));
in.read(reinterpret_cast<char *>(m_layerGeometry), sizeof(pixelCPEforGPU::LayerGeometry));
}

cpuData_ = {
&m_commonParamsGPU,
m_detParamsGPU.data(),
&m_layerGeometry,
&m_averageGeometry,
};
}

const pixelCPEforGPU::ParamsOnGPU *PixelCPEFast::getGPUProductAsync(cudaStream_t cudaStream) const {
const auto &data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData &data, cudaStream_t stream) {
// and now copy to device...
cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_commonParams, sizeof(pixelCPEforGPU::CommonParams)));
cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_detParams,
this->m_detParamsGPU.size() * sizeof(pixelCPEforGPU::DetParams)));
cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry)));
cudaCheck(cudaMalloc((void **)&data.h_paramsOnGPU.m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry)));
cudaCheck(cudaMalloc((void **)&data.d_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU)));
m_params->m_commonParams = m_commonParams;
m_params->m_detParams = m_detParams;
m_params->m_layerGeometry = m_layerGeometry;
m_params->m_averageGeometry = m_averageGeometry;

cudaCheck(cudaMemcpyAsync(
data.d_paramsOnGPU, &data.h_paramsOnGPU, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemcpyDefault, stream));
cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_commonParams,
&this->m_commonParamsGPU,
sizeof(pixelCPEforGPU::CommonParams),
cudaMemcpyDefault,
stream));
cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_averageGeometry,
&this->m_averageGeometry,
sizeof(pixelCPEforGPU::AverageGeometry),
cudaMemcpyDefault,
stream));
cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_layerGeometry,
&this->m_layerGeometry,
sizeof(pixelCPEforGPU::LayerGeometry),
cudaMemcpyDefault,
stream));
cudaCheck(cudaMemcpyAsync((void *)data.h_paramsOnGPU.m_detParams,
this->m_detParamsGPU.data(),
this->m_detParamsGPU.size() * sizeof(pixelCPEforGPU::DetParams),
cudaMemcpyDefault,
stream));
});
return data.d_paramsOnGPU;
for (int device = 0, ndev = cms::cuda::deviceCount(); device < ndev; ++device) {
#ifndef CUDAUVM_DISABLE_ADVICE
cudaCheck(cudaMemAdvise(m_params, sizeof(pixelCPEforGPU::ParamsOnGPU), cudaMemAdviseSetReadMostly, device));
cudaCheck(cudaMemAdvise(m_commonParams, sizeof(pixelCPEforGPU::CommonParams), cudaMemAdviseSetReadMostly, device));
cudaCheck(
cudaMemAdvise(m_detParams, ndetParams * sizeof(pixelCPEforGPU::DetParams), cudaMemAdviseSetReadMostly, device));
cudaCheck(
cudaMemAdvise(m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry), cudaMemAdviseSetReadMostly, device));
cudaCheck(
cudaMemAdvise(m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry), cudaMemAdviseSetReadMostly, device));
#endif
#ifndef CUDAUVM_DISABLE_PREFETCH
cms::cuda::ScopedSetDevice guard{device};
auto stream = cms::cuda::getStreamCache().get();
cudaCheck(cudaMemPrefetchAsync(m_params, sizeof(pixelCPEforGPU::ParamsOnGPU), device, stream.get()));
cudaCheck(cudaMemPrefetchAsync(m_commonParams, sizeof(pixelCPEforGPU::CommonParams), device, stream.get()));
cudaCheck(cudaMemPrefetchAsync(m_detParams, ndetParams * sizeof(pixelCPEforGPU::DetParams), device, stream.get()));
cudaCheck(cudaMemPrefetchAsync(m_layerGeometry, sizeof(pixelCPEforGPU::LayerGeometry), device, stream.get()));
cudaCheck(cudaMemPrefetchAsync(m_averageGeometry, sizeof(pixelCPEforGPU::AverageGeometry), device, stream.get()));
#endif
}
}

PixelCPEFast::GPUData::~GPUData() {
if (d_paramsOnGPU != nullptr) {
cudaFree((void *)h_paramsOnGPU.m_commonParams);
cudaFree((void *)h_paramsOnGPU.m_detParams);
cudaFree((void *)h_paramsOnGPU.m_averageGeometry);
cudaFree(d_paramsOnGPU);
}
PixelCPEFast::~PixelCPEFast() {
cudaFree(m_params);
cudaFree(m_commonParams);
cudaFree(m_detParams);
cudaFree(m_layerGeometry);
cudaFree(m_averageGeometry);
}
33 changes: 8 additions & 25 deletions src/cudauvm/CondFormats/PixelCPEFast.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,35 +9,18 @@

class PixelCPEFast {
public:
PixelCPEFast(std::string const &path);
PixelCPEFast(std::string const& path);

~PixelCPEFast() = default;
~PixelCPEFast();

// The return value can only be used safely in kernels launched on
// the same cudaStream, or after cudaStreamSynchronize.
const pixelCPEforGPU::ParamsOnGPU *getGPUProductAsync(cudaStream_t cudaStream) const;

pixelCPEforGPU::ParamsOnGPU const &getCPUProduct() const { return cpuData_; }
pixelCPEforGPU::ParamsOnGPU const* get() const { return m_params; }

private:
// allocate it with posix malloc to be compatible with the CPU workflow
std::vector<pixelCPEforGPU::DetParams> m_detParamsGPU;
// std::vector<pixelCPEforGPU::DetParams, CUDAHostAllocator<pixelCPEforGPU::DetParams>> m_detParamsGPU;
pixelCPEforGPU::CommonParams m_commonParamsGPU;
pixelCPEforGPU::LayerGeometry m_layerGeometry;
pixelCPEforGPU::AverageGeometry m_averageGeometry;

pixelCPEforGPU::ParamsOnGPU cpuData_;

struct GPUData {
~GPUData();
// not needed if not used on CPU...
pixelCPEforGPU::ParamsOnGPU h_paramsOnGPU;
pixelCPEforGPU::ParamsOnGPU *d_paramsOnGPU = nullptr; // copy of the above on the Device
};
cms::cuda::ESProduct<GPUData> gpuData_;

void fillParamsForGpu();
pixelCPEforGPU::ParamsOnGPU* m_params = nullptr;
pixelCPEforGPU::CommonParams* m_commonParams = nullptr;
pixelCPEforGPU::DetParams* m_detParams = nullptr;
pixelCPEforGPU::LayerGeometry* m_layerGeometry = nullptr;
pixelCPEforGPU::AverageGeometry* m_averageGeometry = nullptr;
};

#endif // RecoLocalTracker_SiPixelRecHits_PixelCPEFast_h
61 changes: 24 additions & 37 deletions src/cudauvm/CondFormats/SiPixelFedCablingMapGPUWrapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -9,46 +9,33 @@

// CMSSW includes
#include "CUDACore/cudaCheck.h"
#include "CUDACore/device_unique_ptr.h"
#include "CUDACore/host_unique_ptr.h"
#include "CUDACore/deviceCount.h"
#include "CUDACore/ScopedSetDevice.h"
#include "CUDACore/StreamCache.h"
#include "CondFormats/SiPixelFedCablingMapGPUWrapper.h"

SiPixelFedCablingMapGPUWrapper::SiPixelFedCablingMapGPUWrapper(SiPixelFedCablingMapGPU const& cablingMap,
std::vector<unsigned char> modToUnp)
: modToUnpDefault(modToUnp.size()), hasQuality_(true) {
cudaCheck(cudaMallocHost(&cablingMapHost, sizeof(SiPixelFedCablingMapGPU)));
std::memcpy(cablingMapHost, &cablingMap, sizeof(SiPixelFedCablingMapGPU));

std::copy(modToUnp.begin(), modToUnp.end(), modToUnpDefault.begin());
std::vector<unsigned char> const& modToUnp)
: hasQuality_(true) {
cudaCheck(cudaMallocManaged(&cablingMap_, sizeof(SiPixelFedCablingMapGPU)));
*cablingMap_ = cablingMap;
cudaCheck(cudaMallocManaged(&modToUnpDefault_, modToUnp.size()));
std::copy(modToUnp.begin(), modToUnp.end(), modToUnpDefault_);
for (int device = 0, ndev = cms::cuda::deviceCount(); device < ndev; ++device) {
#ifndef CUDAUVM_DISABLE_ADVICE
cudaCheck(cudaMemAdvise(cablingMap_, sizeof(SiPixelFedCablingMapGPU), cudaMemAdviseSetReadMostly, device));
cudaCheck(cudaMemAdvise(modToUnpDefault_, sizeof(modToUnp.size()), cudaMemAdviseSetReadMostly, device));
#endif
#ifndef CUDAUVM_DISABLE_PREFETCH
cms::cuda::ScopedSetDevice guard{device};
auto stream = cms::cuda::getStreamCache().get();
cudaCheck(cudaMemPrefetchAsync(cablingMap_, sizeof(SiPixelFedCablingMapGPU), device, stream.get()));
cudaCheck(cudaMemPrefetchAsync(modToUnpDefault_, modToUnp.size(), device, stream.get()));
#endif
}
}

SiPixelFedCablingMapGPUWrapper::~SiPixelFedCablingMapGPUWrapper() { cudaCheck(cudaFreeHost(cablingMapHost)); }

const SiPixelFedCablingMapGPU* SiPixelFedCablingMapGPUWrapper::getGPUProductAsync(cudaStream_t cudaStream) const {
const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) {
// allocate
cudaCheck(cudaMalloc(&data.cablingMapDevice, sizeof(SiPixelFedCablingMapGPU)));

// transfer
cudaCheck(cudaMemcpyAsync(
data.cablingMapDevice, this->cablingMapHost, sizeof(SiPixelFedCablingMapGPU), cudaMemcpyDefault, stream));
});
return data.cablingMapDevice;
SiPixelFedCablingMapGPUWrapper::~SiPixelFedCablingMapGPUWrapper() {
cudaCheck(cudaFree(cablingMap_));
cudaCheck(cudaFree(modToUnpDefault_));
}

const unsigned char* SiPixelFedCablingMapGPUWrapper::getModToUnpAllAsync(cudaStream_t cudaStream) const {
const auto& data =
modToUnp_.dataForCurrentDeviceAsync(cudaStream, [this](ModulesToUnpack& data, cudaStream_t stream) {
cudaCheck(cudaMalloc((void**)&data.modToUnpDefault, pixelgpudetails::MAX_SIZE_BYTE_BOOL));
cudaCheck(cudaMemcpyAsync(data.modToUnpDefault,
this->modToUnpDefault.data(),
this->modToUnpDefault.size() * sizeof(unsigned char),
cudaMemcpyDefault,
stream));
});
return data.modToUnpDefault;
}

SiPixelFedCablingMapGPUWrapper::GPUData::~GPUData() { cudaCheck(cudaFree(cablingMapDevice)); }

SiPixelFedCablingMapGPUWrapper::ModulesToUnpack::~ModulesToUnpack() { cudaCheck(cudaFree(modToUnpDefault)); }
26 changes: 6 additions & 20 deletions src/cudauvm/CondFormats/SiPixelFedCablingMapGPUWrapper.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@
#define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h

#include "CUDACore/ESProduct.h"
#include "CUDACore/CUDAHostAllocator.h"
#include "CUDACore/device_unique_ptr.h"
#include "CondFormats/SiPixelFedCablingMapGPU.h"

#include <cuda_runtime.h>
Expand All @@ -12,35 +10,23 @@

class SiPixelFedCablingMapGPUWrapper {
public:
explicit SiPixelFedCablingMapGPUWrapper(SiPixelFedCablingMapGPU const &cablingMap,
std::vector<unsigned char> modToUnp);
explicit SiPixelFedCablingMapGPUWrapper(SiPixelFedCablingMapGPU const& cablingMap,
std::vector<unsigned char> const& modToUnp);
~SiPixelFedCablingMapGPUWrapper();

bool hasQuality() const { return hasQuality_; }

// returns pointer to GPU memory
const SiPixelFedCablingMapGPU *getGPUProductAsync(cudaStream_t cudaStream) const;
const SiPixelFedCablingMapGPU* cablingMap() const { return cablingMap_; }

// returns pointer to GPU memory
const unsigned char *getModToUnpAllAsync(cudaStream_t cudaStream) const;
const unsigned char* modToUnpAll() const { return modToUnpDefault_; }

private:
std::vector<unsigned char, CUDAHostAllocator<unsigned char>> modToUnpDefault;
bool hasQuality_;

SiPixelFedCablingMapGPU *cablingMapHost = nullptr; // pointer to struct in CPU

struct GPUData {
~GPUData();
SiPixelFedCablingMapGPU *cablingMapDevice = nullptr; // pointer to struct in GPU
};
cms::cuda::ESProduct<GPUData> gpuData_;

struct ModulesToUnpack {
~ModulesToUnpack();
unsigned char *modToUnpDefault = nullptr; // pointer to GPU
};
cms::cuda::ESProduct<ModulesToUnpack> modToUnp_;
SiPixelFedCablingMapGPU* cablingMap_ = nullptr;
unsigned char* modToUnpDefault_ = nullptr;
};

#endif
53 changes: 26 additions & 27 deletions src/cudauvm/CondFormats/SiPixelGainCalibrationForHLTGPU.cc
Original file line number Diff line number Diff line change
@@ -1,38 +1,37 @@
#include <cstring>

#include <cuda.h>

#include "CondFormats/SiPixelGainCalibrationForHLTGPU.h"
#include "CondFormats/SiPixelGainForHLTonGPU.h"
#include "CUDACore/cudaCheck.h"
#include "CUDACore/deviceCount.h"
#include "CUDACore/ScopedSetDevice.h"
#include "CUDACore/StreamCache.h"

SiPixelGainCalibrationForHLTGPU::SiPixelGainCalibrationForHLTGPU(SiPixelGainForHLTonGPU const& gain,
std::vector<char> gainData)
: gainData_(std::move(gainData)) {
cudaCheck(cudaMallocHost(&gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU)));
*gainForHLTonHost_ = gain;
}
std::vector<char> const& gainData) {
cudaCheck(cudaMallocManaged(&gainForHLT_, sizeof(SiPixelGainForHLTonGPU)));
*gainForHLT_ = gain;
cudaCheck(cudaMallocManaged(&gainData_, gainData.size()));
gainForHLT_->v_pedestals = gainData_;

SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() { cudaCheck(cudaFreeHost(gainForHLTonHost_)); }

SiPixelGainCalibrationForHLTGPU::GPUData::~GPUData() {
cudaCheck(cudaFree(gainForHLTonGPU));
cudaCheck(cudaFree(gainDataOnGPU));
std::memcpy(gainData_, gainData.data(), gainData.size());
for (int device = 0, ndev = cms::cuda::deviceCount(); device < ndev; ++device) {
#ifndef CUDAUVM_DISABLE_ADVICE
cudaCheck(cudaMemAdvise(gainForHLT_, sizeof(SiPixelGainForHLTonGPU), cudaMemAdviseSetReadMostly, device));
cudaCheck(cudaMemAdvise(gainData_, gainData.size(), cudaMemAdviseSetReadMostly, device));
#endif
#ifndef CUDAUVM_DISABLE_PREFETCH
cms::cuda::ScopedSetDevice guard{device};
auto stream = cms::cuda::getStreamCache().get();
cudaCheck(cudaMemPrefetchAsync(gainForHLT_, sizeof(SiPixelGainForHLTonGPU), device, stream.get()));
cudaCheck(cudaMemPrefetchAsync(gainData_, gainData.size(), device, stream.get()));
#endif
}
}

const SiPixelGainForHLTonGPU* SiPixelGainCalibrationForHLTGPU::getGPUProductAsync(cudaStream_t cudaStream) const {
const auto& data = gpuData_.dataForCurrentDeviceAsync(cudaStream, [this](GPUData& data, cudaStream_t stream) {
cudaCheck(cudaMalloc((void**)&data.gainForHLTonGPU, sizeof(SiPixelGainForHLTonGPU)));
cudaCheck(cudaMalloc((void**)&data.gainDataOnGPU, this->gainData_.size()));
// gains.data().data() is used also for non-GPU code, we cannot allocate it on aligned and write-combined memory
cudaCheck(
cudaMemcpyAsync(data.gainDataOnGPU, this->gainData_.data(), this->gainData_.size(), cudaMemcpyDefault, stream));

cudaCheck(cudaMemcpyAsync(
data.gainForHLTonGPU, this->gainForHLTonHost_, sizeof(SiPixelGainForHLTonGPU), cudaMemcpyDefault, stream));
cudaCheck(cudaMemcpyAsync(&(data.gainForHLTonGPU->v_pedestals),
&(data.gainDataOnGPU),
sizeof(SiPixelGainForHLTonGPU_DecodingStructure*),
cudaMemcpyDefault,
stream));
});
return data.gainForHLTonGPU;
SiPixelGainCalibrationForHLTGPU::~SiPixelGainCalibrationForHLTGPU() {
cudaCheck(cudaFree(gainForHLT_));
cudaCheck(cudaFree(gainData_));
}
15 changes: 4 additions & 11 deletions src/cudauvm/CondFormats/SiPixelGainCalibrationForHLTGPU.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,21 +8,14 @@ struct SiPixelGainForHLTonGPU_DecodingStructure;

class SiPixelGainCalibrationForHLTGPU {
public:
explicit SiPixelGainCalibrationForHLTGPU(SiPixelGainForHLTonGPU const &gain, std::vector<char> gainData);
explicit SiPixelGainCalibrationForHLTGPU(SiPixelGainForHLTonGPU const& gain, std::vector<char> const& gainData);
~SiPixelGainCalibrationForHLTGPU();

const SiPixelGainForHLTonGPU *getGPUProductAsync(cudaStream_t cudaStream) const;
const SiPixelGainForHLTonGPU *getCPUProduct() const { return gainForHLTonHost_; }
const SiPixelGainForHLTonGPU* get() const { return gainForHLT_; }

private:
SiPixelGainForHLTonGPU *gainForHLTonHost_ = nullptr;
std::vector<char> gainData_;
struct GPUData {
~GPUData();
SiPixelGainForHLTonGPU *gainForHLTonGPU = nullptr;
SiPixelGainForHLTonGPU_DecodingStructure *gainDataOnGPU = nullptr;
};
cms::cuda::ESProduct<GPUData> gpuData_;
SiPixelGainForHLTonGPU* gainForHLT_ = nullptr;
SiPixelGainForHLTonGPU_DecodingStructure* gainData_ = nullptr;
};

#endif // CalibTracker_SiPixelESProducers_interface_SiPixelGainCalibrationForHLTGPU_h
Loading