diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertices.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertices.cc index c2644dc1542e3..792c7878087ae 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertices.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelCompareVertices.cc @@ -9,20 +9,19 @@ // // Author: Suvankar Roy Chowdhury // -#include "FWCore/Framework/interface/Frameworkfwd.h" + +#include "DQMServices/Core/interface/DQMEDAnalyzer.h" +#include "DQMServices/Core/interface/DQMStore.h" +#include "DQMServices/Core/interface/MonitorElement.h" +#include "DataFormats/BeamSpot/interface/BeamSpot.h" +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" #include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" #include "FWCore/Framework/interface/MakerMacros.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/Utilities/interface/InputTag.h" -#include "DataFormats/Common/interface/Handle.h" -// DQM Histograming -#include "DQMServices/Core/interface/MonitorElement.h" -#include "DQMServices/Core/interface/DQMEDAnalyzer.h" -#include "DQMServices/Core/interface/DQMStore.h" -#include "DataFormats/VertexSoA/interface/ZVertexHost.h" -#include "CUDADataFormats/Vertex/interface/ZVertexSoAHeterogeneousHost.h" -#include "DataFormats/BeamSpot/interface/BeamSpot.h" // TODO: change class name to SiPixelCompareVerticesSoA when CUDA code is removed class SiPixelCompareVertices : public DQMEDAnalyzer { @@ -81,7 +80,7 @@ void SiPixelCompareVertices::analyzeSeparate(U tokenRef, V tokenTar, const edm:: out << "reference vertices not found; "; } if (not vsoaHandleTar) { - out << "Refget vertices not found; "; + out << "target vertices not found; "; } out << "the comparison will not run."; return; @@ -94,8 +93,8 @@ void SiPixelCompareVertices::analyzeSeparate(U tokenRef, V tokenTar, 
const edm:: auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; - if (!bsHandle.isValid()) { - edm::LogWarning("SiPixelCompareVertices") << "No beamspot found. returning vertexes with (0,0,Z) "; + if (not bsHandle.isValid()) { + edm::LogWarning("SiPixelCompareVertices") << "No beamspot found, returning vertexes with (0,0,Z)."; } else { const reco::BeamSpot& bs = *bsHandle; x0 = bs.x0(); @@ -112,7 +111,7 @@ void SiPixelCompareVertices::analyzeSeparate(U tokenRef, V tokenTar, const edm:: auto yc = y0 + dydz * zc; zc += z0; - auto ndofRef = vsoaRef.view()[sic].ndof(); + auto ndofRef = vsoaRef.template view()[sic].ndof(); auto chi2Ref = vsoaRef.view()[sic].chi2(); const int32_t notFound = -1; @@ -138,7 +137,7 @@ void SiPixelCompareVertices::analyzeSeparate(U tokenRef, V tokenTar, const edm:: auto xg = x0 + dxdz * zg; auto yg = y0 + dydz * zg; zg += z0; - auto ndofTar = vsoaTar.view()[closestVtxidx].ndof(); + auto ndofTar = vsoaTar.template view()[closestVtxidx].ndof(); auto chi2Tar = vsoaTar.view()[closestVtxidx].chi2(); hx_->Fill(xc - x0, xg - x0); diff --git a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc index d3121f77bccb8..7b488553626b8 100644 --- a/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc +++ b/DQM/SiPixelHeterogeneous/plugins/SiPixelMonitorVertexSoAAlpaka.cc @@ -67,7 +67,9 @@ void SiPixelMonitorVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm: } auto const& vsoa = *vsoaHandle; - int nVertices = vsoa.view().nvFinal(); + auto vtx_view = vsoa.view(); + auto trk_view = vsoa.view(); + int nVertices = vtx_view.nvFinal(); auto bsHandle = iEvent.getHandle(tokenBeamSpot_); float x0 = 0., y0 = 0., z0 = 0., dxdz = 0., dydz = 0.; if (!bsHandle.isValid()) { @@ -82,8 +84,8 @@ void SiPixelMonitorVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm: } for (int iv = 0; iv < nVertices; iv++) { - 
auto si = vsoa.view()[iv].sortInd(); - auto z = vsoa.view()[si].zv(); + auto si = vtx_view[iv].sortInd(); + auto z = vtx_view[si].zv(); auto x = x0 + dxdz * z; auto y = y0 + dydz * z; @@ -91,10 +93,10 @@ void SiPixelMonitorVertexSoAAlpaka::analyze(const edm::Event& iEvent, const edm: hx->Fill(x); hy->Fill(y); hz->Fill(z); - auto ndof = vsoa.view()[si].ndof(); - hchi2->Fill(vsoa.view()[si].chi2()); - hchi2oNdof->Fill(vsoa.view()[si].chi2() / ndof); - hptv2->Fill(vsoa.view()[si].ptv2()); + auto ndof = trk_view[si].ndof(); + hchi2->Fill(vtx_view[si].chi2()); + hchi2oNdof->Fill(vtx_view[si].chi2() / ndof); + hptv2->Fill(vtx_view[si].ptv2()); hntrks->Fill(ndof + 1); } hnVertex->Fill(nVertices); diff --git a/DataFormats/VertexSoA/README.md b/DataFormats/VertexSoA/README.md index 54172eda14281..36d20478a2508 100644 --- a/DataFormats/VertexSoA/README.md +++ b/DataFormats/VertexSoA/README.md @@ -1,45 +1,32 @@ # Vertex Portable Data Formats -`DataFormat`s meant to be used on Host (CPU) or Device (GPU) for -storing information about vertices created during the Pixel-local Reconstruction -chain. It stores data in an SoA manner. It contains the data that was previously -contained in the deprecated `ZVertexSoA` class. +`DataFormat`s meant to be used on Host (CPU) or Device (GPUs) for storing +information about reconstructed pixel vertices in Structure of Array (SoA) +format. -The host format is inheriting from `DataFormats/Common/interface/PortableHostCollection.h`, -while the device format is inheriting from `DataFormats/Common/interface/PortableDeviceCollection.h` +The host collection is an instantiation of `PortableHostMultiCollection`, while +the device collection is an instantiation of `PortableDeviceMultiCollection`. -Both formats use the same SoA Layout (`ZVertexLayout`) which is generated -via the `GENERATE_SOA_LAYOUT` macro in the `ZVertexUtilities.h` file. 
+Both collections use two SoA layouts (`ZVertexLayout` and `ZVertexTracksLayout`) +with different number of elements, defined at run-time. +The layouts are defined by the `GENERATE_SOA_LAYOUT` macro in +`DataFormats/VertexSoA/interface/ZVertexSoA.h`. -## Notes -- Initially, `ZVertexSoA` had distinct array sizes for each attribute (e.g. `zv` was `MAXVTX` elements -long, `ndof` was `MAXTRACKS` elements long). All columns are now of uniform `MAXTRACKS` size, -meaning that there will be some wasted space (appx. 190kB). -- Host and Device classes should **not** be created via inheritance, as they're done here, -but via composition. See [this discussion](https://github.com/cms-sw/cmssw/pull/40465#discussion_r1066039309). - -## ZVertexHeterogeneousHost +## `ZVertexHost` The version of the data format to be used for storing vertex data on the CPU. Instances of this class are to be used for: + - having a place to copy data to host from device, which is usually taken care + of automatically by the framework; + - running host-side algorithms using data stored in an SoA manner. -- Having a place to copy data to host from device, via `cudaMemcpy`, or -- Running host-side algorithms using data stored in an SoA manner. -## ZVertexHeterogeneousDevice +## `ZVertexDevice` The version of the data format to be used for storing vertex data on the GPU. - -Instances of `ZVertexHeterogeneousDevice` are to be created on host and be -used on device only. To do so, the instance's `view()` method is to be called -to pass a `View` to any kernel launched. Accessing data from the `view()` is not -possible on the host side. - -## Utilities - -Apart from `ZVertexLayout`, `ZVertexUtilities.h` also contains -a collection of methods which were originally -defined as class methods inside the `ZVertexSoA` class -which have been adapted to operate on `View` instances, so that they are callable -from within `__global__` kernels, on both CPU and CPU. 
+Instances of `ZVertexDevice` are created on the host, and can only be used +on the device. To do so, the instance's `view()` or `const_view()` methods are +called, and the resulting `View` or `ConstView` are passed to a kernel launch. +The data from the instance, `view()` or `const_view()` is not accessible on the +host side. diff --git a/DataFormats/VertexSoA/interface/ZVertexDefinitions.h b/DataFormats/VertexSoA/interface/ZVertexDefinitions.h deleted file mode 100644 index 0645305f4fd7c..0000000000000 --- a/DataFormats/VertexSoA/interface/ZVertexDefinitions.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef DataFormats_VertexSoA_ZVertexDefinitions_h -#define DataFormats_VertexSoA_ZVertexDefinitions_h - -#include - -namespace zVertex { - - constexpr uint32_t MAXTRACKS = 32 * 1024; - constexpr uint32_t MAXVTX = 1024; - - // - // FIXME: MAXTRACKS is low for Phase2 pixel triplets with PU=200 - // and the txiplets wfs in those conditions will fail. - // - // Not rising it since to the needed 128*1024 since it causes - // a 4-5% jump in memory usage. - // - // The original sin is that, for the moment, the VertexSoA - // has a unique index for the tracks and the vertices and - // so it needs to be sized for the bigger of the two (the tracks, of course). - // This means we are wasting memory (this is true also for Phase1). - // We need to split the two indices (as is for CUDA, since we were not using - // the PortableCollection + Layout was not yet used).
- // - -} // namespace zVertex - -#endif diff --git a/DataFormats/VertexSoA/interface/ZVertexDevice.h b/DataFormats/VertexSoA/interface/ZVertexDevice.h index 8d120ae190f3c..3a8b95e7ca816 100644 --- a/DataFormats/VertexSoA/interface/ZVertexDevice.h +++ b/DataFormats/VertexSoA/interface/ZVertexDevice.h @@ -4,23 +4,12 @@ #include #include + #include "DataFormats/VertexSoA/interface/ZVertexSoA.h" -#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" #include "DataFormats/VertexSoA/interface/ZVertexHost.h" #include "DataFormats/Portable/interface/PortableDeviceCollection.h" -template -class ZVertexDeviceSoA : public PortableDeviceCollection, TDev> { -public: - ZVertexDeviceSoA() = default; // necessary for ROOT dictionaries - - // Constructor which specifies the SoA size - template - explicit ZVertexDeviceSoA(TQueue queue) : PortableDeviceCollection, TDev>(S, queue) {} -}; - -using namespace ::zVertex; template -using ZVertexDevice = ZVertexDeviceSoA; +using ZVertexDevice = PortableDeviceMultiCollection; #endif // DataFormats_VertexSoA_interface_ZVertexDevice_h diff --git a/DataFormats/VertexSoA/interface/ZVertexHost.h b/DataFormats/VertexSoA/interface/ZVertexHost.h index 2d72b83bfe385..a805495ce5fe0 100644 --- a/DataFormats/VertexSoA/interface/ZVertexHost.h +++ b/DataFormats/VertexSoA/interface/ZVertexHost.h @@ -5,25 +5,10 @@ #include -#include "HeterogeneousCore/AlpakaInterface/interface/config.h" -#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" -#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" #include "DataFormats/Portable/interface/PortableHostCollection.h" +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" -template -class ZVertexHostSoA : public PortableHostCollection { -public: - ZVertexHostSoA() = default; - - // Constructor which specifies the queue - template - explicit ZVertexHostSoA(TQueue queue) : PortableHostCollection(S, queue) {} - - // Constructor 
which specifies the DevHost - explicit ZVertexHostSoA(alpaka_common::DevHost const& host) : PortableHostCollection(S, host) {} -}; - -//using namespace ::zVertex; -using ZVertexHost = ZVertexHostSoA; +using ZVertexHost = PortableHostCollection2; #endif // DataFormats_VertexSoA_ZVertexHost_H diff --git a/DataFormats/VertexSoA/interface/ZVertexSoA.h b/DataFormats/VertexSoA/interface/ZVertexSoA.h index 045603618acd7..f84eeb1e4e812 100644 --- a/DataFormats/VertexSoA/interface/ZVertexSoA.h +++ b/DataFormats/VertexSoA/interface/ZVertexSoA.h @@ -10,20 +10,29 @@ namespace reco { GENERATE_SOA_LAYOUT(ZVertexLayout, - SOA_COLUMN(int16_t, idv), - SOA_COLUMN(float, zv), - SOA_COLUMN(float, wv), - SOA_COLUMN(float, chi2), - SOA_COLUMN(float, ptv2), - SOA_COLUMN(int32_t, ndof), - SOA_COLUMN(uint16_t, sortInd), - SOA_SCALAR(uint32_t, nvFinal)) + SOA_COLUMN(float, zv), // output z-posistion of found vertices + SOA_COLUMN(float, wv), // output weight (1/error^2) on the above + SOA_COLUMN(float, chi2), // vertices chi2 + SOA_COLUMN(float, ptv2), // vertices pt^2 + SOA_COLUMN(uint16_t, sortInd), // sorted index (by pt2) ascending + SOA_SCALAR(uint32_t, nvFinal)) // the number of vertices + + GENERATE_SOA_LAYOUT(ZVertexTracksLayout, + SOA_COLUMN(int16_t, idv), // vertex index for each associated (original) track + // (-1 == not associate) + SOA_COLUMN(int32_t, ndof)) // vertices number of dof + // FIXME: reused as workspace for the number of nearest neighbours // Common types for both Host and Device code using ZVertexSoA = ZVertexLayout<>; using ZVertexSoAView = ZVertexSoA::View; using ZVertexSoAConstView = ZVertexSoA::ConstView; + // Common types for both Host and Device code + using ZVertexTracksSoA = ZVertexTracksLayout<>; + using ZVertexTracksSoAView = ZVertexTracksSoA::View; + using ZVertexTracksSoAConstView = ZVertexTracksSoA::ConstView; + ALPAKA_FN_HOST_ACC ALPAKA_FN_INLINE void init(ZVertexSoAView &vertices) { vertices.nvFinal() = 0; } } // namespace reco diff --git 
a/DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h b/DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h index 636a07e2bd978..212256558fc01 100644 --- a/DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h +++ b/DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h @@ -4,13 +4,13 @@ #include #include + #include "DataFormats/Portable/interface/alpaka/PortableCollection.h" -#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" -#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" -#include "DataFormats/VertexSoA/interface/ZVertexHost.h" #include "DataFormats/VertexSoA/interface/ZVertexDevice.h" -#include "HeterogeneousCore/AlpakaInterface/interface/config.h" +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" #include "HeterogeneousCore/AlpakaInterface/interface/CopyToHost.h" +#include "HeterogeneousCore/AlpakaInterface/interface/config.h" namespace ALPAKA_ACCELERATOR_NAMESPACE { @@ -19,21 +19,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } // namespace ALPAKA_ACCELERATOR_NAMESPACE -namespace cms::alpakatools { - template - struct CopyToHost> { - template - static auto copyAsync(TQueue& queue, ZVertexDevice const& deviceData) { - ZVertexHost hostData(queue); - alpaka::memcpy(queue, hostData.buffer(), deviceData.buffer()); -#ifdef GPU_DEBUG - printf("ZVertexSoACollection: I'm copying to host.\n"); -#endif - return hostData; - } - }; -} // namespace cms::alpakatools - ASSERT_DEVICE_MATCHES_HOST_COLLECTION(ZVertexSoACollection, ZVertexHost); #endif // DataFormats_VertexSoA_interface_ZVertexSoACollection_h diff --git a/DataFormats/VertexSoA/src/classes.cc b/DataFormats/VertexSoA/src/classes.cc index edffb6e08a9e5..d333705981ff2 100644 --- a/DataFormats/VertexSoA/src/classes.cc +++ b/DataFormats/VertexSoA/src/classes.cc @@ -1,4 +1,4 @@ #include "DataFormats/Portable/interface/PortableHostCollectionReadRules.h" -#include 
"DataFormats/VertexSoA/interface/ZVertexSoA.h" +#include "DataFormats/VertexSoA/interface/ZVertexHost.h" -SET_PORTABLEHOSTCOLLECTION_READ_RULES(PortableHostCollection); +SET_PORTABLEHOSTMULTICOLLECTION_READ_RULES(ZVertexHost); diff --git a/DataFormats/VertexSoA/src/classes_def.xml b/DataFormats/VertexSoA/src/classes_def.xml index 820d28ecc3493..5dfc139ec66a1 100644 --- a/DataFormats/VertexSoA/src/classes_def.xml +++ b/DataFormats/VertexSoA/src/classes_def.xml @@ -1,8 +1,18 @@ - - - - - + + + + + + + + + + + + + + + diff --git a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc index 7c9d17a767682..79a2964c2db85 100644 --- a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc +++ b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.cc @@ -18,7 +18,6 @@ #include -#include "DataFormats/VertexSoA/interface/ZVertexDevice.h" #include "DataFormats/VertexSoA/interface/ZVertexHost.h" #include "DataFormats/VertexSoA/interface/alpaka/ZVertexSoACollection.h" #include "FWCore/Utilities/interface/stringize.h" @@ -31,6 +30,10 @@ using namespace ALPAKA_ACCELERATOR_NAMESPACE; +// Run 3 values, used for testing +constexpr uint32_t maxTracks = 32 * 1024; +constexpr uint32_t maxVertices = 1024; + int main() { // Get the list of devices on the current platform auto const& devices = cms::alpakatools::devices(); @@ -48,15 +51,19 @@ int main() { { // Instantiate vertices on device. PortableCollection allocates // SoA on device automatically. - ZVertexSoACollection zvertex_d(queue); - testZVertexSoAT::runKernels(zvertex_d.view(), queue); + ZVertexSoACollection zvertex_d({{maxTracks, maxVertices}}, queue); + testZVertexSoAT::runKernels(zvertex_d.view(), zvertex_d.view(), queue); - // Instantate vertices on host. This is where the data will be - // copied to from device. 
- ZVertexHost zvertex_h(queue); - std::cout << zvertex_h.view().metadata().size() << std::endl; - alpaka::memcpy(queue, zvertex_h.buffer(), zvertex_d.const_buffer()); + // If the device is actually the host, use the collection as-is. + // Otherwise, copy the data from the device to the host. + ZVertexHost zvertex_h; +#ifdef ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED + zvertex_h = std::move(zvertex_d); +#else + zvertex_h = cms::alpakatools::CopyToHost::copyAsync(queue, zvertex_d); +#endif alpaka::wait(queue); + std::cout << zvertex_h.view().metadata().size() << std::endl; // Print results std::cout << "idv\t" @@ -68,11 +75,13 @@ int main() { << "sortInd\t" << "nvFinal\n"; + auto vtx_v = zvertex_h.view(); + auto trk_v = zvertex_h.view(); for (int i = 0; i < 10; ++i) { - std::cout << (int)zvertex_h.view()[i].idv() << '\t' << zvertex_h.view()[i].zv() << '\t' - << zvertex_h.view()[i].wv() << '\t' << zvertex_h.view()[i].chi2() << '\t' - << zvertex_h.view()[i].ptv2() << '\t' << (int)zvertex_h.view()[i].ndof() << '\t' - << (int)zvertex_h.view()[i].sortInd() << '\t' << (int)zvertex_h.view().nvFinal() << '\n'; + auto vi = vtx_v[i]; + auto ti = trk_v[i]; + std::cout << (int)ti.idv() << "\t" << vi.zv() << "\t" << vi.wv() << "\t" << vi.chi2() << "\t" << vi.ptv2() + << "\t" << (int)ti.ndof() << "\t" << vi.sortInd() << "\t" << (int)vtx_v.nvFinal() << std::endl; } } } diff --git a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc index 1af92d5050943..e2fd5ae94924e 100644 --- a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc +++ b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.dev.cc @@ -11,49 +11,57 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::testZVertexSoAT { class TestFillKernel { public: template >> - ALPAKA_FN_ACC void operator()(TAcc const& acc, reco::ZVertexSoAView zvertex_view) const { + ALPAKA_FN_ACC void operator()(TAcc const& acc, + reco::ZVertexSoAView zvertex_view, + reco::ZVertexTracksSoAView 
ztracks_view) const { if (cms::alpakatools::once_per_grid(acc)) { zvertex_view.nvFinal() = 420; } for (int32_t j : cms::alpakatools::uniform_elements(acc, zvertex_view.metadata().size())) { - zvertex_view[j].idv() = (int16_t)j; zvertex_view[j].zv() = (float)j; zvertex_view[j].wv() = (float)j; zvertex_view[j].chi2() = (float)j; zvertex_view[j].ptv2() = (float)j; - zvertex_view[j].ndof() = (int32_t)j; zvertex_view[j].sortInd() = (uint16_t)j; } + for (int32_t j : cms::alpakatools::uniform_elements(acc, ztracks_view.metadata().size())) { + ztracks_view[j].idv() = (int16_t)j; + ztracks_view[j].ndof() = (int32_t)j; + } } }; class TestVerifyKernel { public: template >> - ALPAKA_FN_ACC void operator()(TAcc const& acc, reco::ZVertexSoAView zvertex_view) const { + ALPAKA_FN_ACC void operator()(TAcc const& acc, + reco::ZVertexSoAView zvertex_view, + reco::ZVertexTracksSoAView ztracks_view) const { if (cms::alpakatools::once_per_grid(acc)) { ALPAKA_ASSERT_ACC(zvertex_view.nvFinal() == 420); } for (int32_t j : cms::alpakatools::uniform_elements(acc, zvertex_view.nvFinal())) { - ALPAKA_ASSERT_ACC(zvertex_view[j].idv() == j); - ALPAKA_ASSERT_ACC(zvertex_view[j].zv() - (float)j < 0.0001); - ALPAKA_ASSERT_ACC(zvertex_view[j].wv() - (float)j < 0.0001); - ALPAKA_ASSERT_ACC(zvertex_view[j].chi2() - (float)j < 0.0001); - ALPAKA_ASSERT_ACC(zvertex_view[j].ptv2() - (float)j < 0.0001); - ALPAKA_ASSERT_ACC(zvertex_view[j].ndof() == j); - ALPAKA_ASSERT_ACC(zvertex_view[j].sortInd() == uint32_t(j)); + ALPAKA_ASSERT(zvertex_view[j].zv() - (float)j < 0.0001); + ALPAKA_ASSERT(zvertex_view[j].wv() - (float)j < 0.0001); + ALPAKA_ASSERT(zvertex_view[j].chi2() - (float)j < 0.0001); + ALPAKA_ASSERT(zvertex_view[j].ptv2() - (float)j < 0.0001); + ALPAKA_ASSERT(zvertex_view[j].sortInd() == uint32_t(j)); + } + for (int32_t j : cms::alpakatools::uniform_elements(acc, ztracks_view.metadata().size())) { + ALPAKA_ASSERT(ztracks_view[j].idv() == j); + ALPAKA_ASSERT(ztracks_view[j].ndof() == j); } } }; - void 
runKernels(reco::ZVertexSoAView zvertex_view, Queue& queue) { + void runKernels(reco::ZVertexSoAView zvertex_view, reco::ZVertexTracksSoAView ztracks_view, Queue& queue) { uint32_t items = 64; uint32_t groups = cms::alpakatools::divide_up_by(zvertex_view.metadata().size(), items); auto workDiv = cms::alpakatools::make_workdiv(groups, items); - alpaka::exec(queue, workDiv, TestFillKernel{}, zvertex_view); - alpaka::exec(queue, workDiv, TestVerifyKernel{}, zvertex_view); + alpaka::exec(queue, workDiv, TestFillKernel{}, zvertex_view, ztracks_view); + alpaka::exec(queue, workDiv, TestVerifyKernel{}, zvertex_view, ztracks_view); } } // namespace ALPAKA_ACCELERATOR_NAMESPACE::testZVertexSoAT diff --git a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.h b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.h index bad69a4d92bb5..fa54fffa8ce38 100644 --- a/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.h +++ b/DataFormats/VertexSoA/test/alpaka/ZVertexSoA_test.h @@ -6,7 +6,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::testZVertexSoAT { - void runKernels(reco::ZVertexSoAView zvertex_view, Queue& queue); + void runKernels(reco::ZVertexSoAView zvertex_view, reco::ZVertexTracksSoAView ztracks_view, Queue& queue); } // namespace ALPAKA_ACCELERATOR_NAMESPACE::testZVertexSoAT diff --git a/RecoTauTag/HLTProducers/src/L2TauTagNNProducerAlpaka.cc b/RecoTauTag/HLTProducers/src/L2TauTagNNProducerAlpaka.cc index 9940650cfb7af..f44787e711a0a 100644 --- a/RecoTauTag/HLTProducers/src/L2TauTagNNProducerAlpaka.cc +++ b/RecoTauTag/HLTProducers/src/L2TauTagNNProducerAlpaka.cc @@ -595,7 +595,7 @@ void L2TauNNProducerAlpaka::selectGoodTracksAndVertices(const ZVertexHost& patav if (nHits == 0) { break; } - int vtx_ass_to_track = patavtx_soa.view()[trk_idx].idv(); + int vtx_ass_to_track = patavtx_soa.view()[trk_idx].idv(); if (vtx_ass_to_track >= 0 && vtx_ass_to_track < nv) { auto patatrackPt = patatracks_tsoa.view()[trk_idx].pt(); ++nTrkAssociated[vtx_ass_to_track]; @@ -692,7 +692,7 @@ void 
L2TauNNProducerAlpaka::fillPatatracks(tensorflow::Tensor& cellGridMatrix, continue; const int patatrackNdof = 2 * std::min(6, nHits) - 5; - const int vtx_idx_assTrk = patavtx_soa.view()[it].idv(); + const int vtx_idx_assTrk = patavtx_soa.view()[it].idv(); if (reco::deltaR2(patatrackEta, patatrackPhi, tauEta, tauPhi) < dR2_max) { std::tie(deta, dphi, eta_idx, phi_idx) = getEtaPhiIndices(patatrackEta, patatrackPhi, allTaus[tau_idx]->polarP4()); diff --git a/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc b/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc index c4f0b97dba8a9..6ccb7789fc098 100644 --- a/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc +++ b/RecoTracker/PixelTrackFitting/plugins/PixelTrackDumpAlpaka.cc @@ -59,12 +59,12 @@ void PixelTrackDumpAlpakaT::analyze(edm::StreamID streamID, assert(tracks.view().nTracks()); auto const& vertices = iEvent.get(tokenSoAVertex_); - assert(vertices.view().idv()); + assert(vertices.view().idv()); assert(vertices.view().zv()); assert(vertices.view().wv()); assert(vertices.view().chi2()); assert(vertices.view().ptv2()); - assert(vertices.view().ndof()); + assert(vertices.view().ndof()); assert(vertices.view().sortInd()); assert(vertices.view().nvFinal()); } diff --git a/RecoVertex/Configuration/python/RecoPixelVertexing_cff.py b/RecoVertex/Configuration/python/RecoPixelVertexing_cff.py index eb953cc702502..0a2bb0d2b63b7 100644 --- a/RecoVertex/Configuration/python/RecoPixelVertexing_cff.py +++ b/RecoVertex/Configuration/python/RecoPixelVertexing_cff.py @@ -106,8 +106,8 @@ from RecoVertex.PixelVertexFinding.pixelVertexProducerAlpakaPhase2_cfi import pixelVertexProducerAlpakaPhase2 as _pixelVerticesAlpakaPhase2 from RecoVertex.PixelVertexFinding.pixelVertexProducerAlpakaHIonPhase1_cfi import pixelVertexProducerAlpakaHIonPhase1 as _pixelVerticesAlpakaHIonPhase1 pixelVerticesAlpaka = _pixelVerticesAlpakaPhase1.clone() 
-phase2_tracker.toReplaceWith(pixelVerticesAlpaka,_pixelVerticesAlpakaPhase2.clone()) -(pp_on_AA & ~phase2_tracker).toReplaceWith(pixelVerticesAlpaka,_pixelVerticesAlpakaHIonPhase1.clone(doSplitting = False)) +phase2_tracker.toReplaceWith(pixelVerticesAlpaka,_pixelVerticesAlpakaPhase2.clone( maxVertices = 512)) +(pp_on_AA & ~phase2_tracker).toReplaceWith(pixelVerticesAlpaka,_pixelVerticesAlpakaHIonPhase1.clone(doSplitting = False, maxVertices = 32)) from RecoVertex.PixelVertexFinding.pixelVertexFromSoAAlpaka_cfi import pixelVertexFromSoAAlpaka as _pixelVertexFromSoAAlpaka alpaka.toReplaceWith(pixelVertices, _pixelVertexFromSoAAlpaka.clone()) diff --git a/RecoVertex/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc b/RecoVertex/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc index 6e542f7870c2e..86561db386303 100644 --- a/RecoVertex/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc +++ b/RecoVertex/PixelVertexFinding/plugins/PixelVertexProducerFromSoAAlpaka.cc @@ -103,7 +103,7 @@ void PixelVertexProducerFromSoAAlpaka::produce(edm::StreamID streamID, err(2, 2) *= 2.; // artifically inflate error //Copy also the tracks (no intention to be efficient....) 
for (auto k = 0U; k < indToEdm.size(); ++k) { - if (soa.view()[k].idv() == int16_t(i)) + if (soa.view()[k].idv() == int16_t(i)) itrk.push_back(k); } auto nt = itrk.size(); @@ -117,7 +117,8 @@ void PixelVertexProducerFromSoAAlpaka::produce(edm::StreamID streamID, itrk.clear(); continue; } // remove outliers - (*vertexes).emplace_back(reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt); + (*vertexes).emplace_back( + reco::Vertex::Point(x, y, z), err, soa.view()[i].chi2(), soa.view()[i].ndof(), nt); auto &v = (*vertexes).back(); v.reserve(itrk.size()); for (auto it : itrk) { diff --git a/RecoVertex/PixelVertexFinding/plugins/alpaka/PixelVertexProducerAlpaka.cc b/RecoVertex/PixelVertexFinding/plugins/alpaka/PixelVertexProducerAlpaka.cc index d572a181ccf85..8770239f132ab 100644 --- a/RecoVertex/PixelVertexFinding/plugins/alpaka/PixelVertexProducerAlpaka.cc +++ b/RecoVertex/PixelVertexFinding/plugins/alpaka/PixelVertexProducerAlpaka.cc @@ -42,6 +42,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const Algo algo_; + // Maximum number of vertices that will be reconstructed + const int maxVertices_; + // Tracking cuts before sending tracks to vertex algo const float ptMin_; const float ptMax_; @@ -61,6 +64,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { conf.getParameter("eps"), conf.getParameter("errmax"), conf.getParameter("chi2max")), + maxVertices_(conf.getParameter("maxVertices")), ptMin_(conf.getParameter("PtMin")), // 0.5 GeV ptMax_(conf.getParameter("PtMax")), // 75. 
Onsumes tokenDeviceTrack_(consumes(conf.getParameter("pixelTrackSrc"))), @@ -83,6 +87,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { desc.add("errmax", 0.01); // max error to be "seed" desc.add("chi2max", 9.); // max normalized distance to cluster + desc.add("maxVertices", 256); desc.add("PtMin", 0.5); desc.add("PtMax", 75.); desc.add("pixelTrackSrc", edm::InputTag("pixelTracksAlpaka")); @@ -96,7 +101,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { const device::EventSetup& iSetup) const { auto const& hTracks = iEvent.get(tokenDeviceTrack_); - iEvent.emplace(tokenDeviceVertex_, algo_.makeAsync(iEvent.queue(), hTracks.view(), ptMin_, ptMax_)); + iEvent.emplace(tokenDeviceVertex_, algo_.makeAsync(iEvent.queue(), hTracks.view(), maxVertices_, ptMin_, ptMax_)); } using PixelVertexProducerAlpakaPhase1 = PixelVertexProducerAlpaka; diff --git a/RecoVertex/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h b/RecoVertex/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h index f21a51e819d1e..232cb6f475412 100644 --- a/RecoVertex/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h +++ b/RecoVertex/PixelVertexFinding/plugins/alpaka/PixelVertexWorkSpaceSoADeviceAlpaka.h @@ -4,7 +4,6 @@ #include #include "DataFormats/Portable/interface/alpaka/PortableCollection.h" -#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" #include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "RecoVertex/PixelVertexFinding/interface/PixelVertexWorkSpaceLayout.h" #include "RecoVertex/PixelVertexFinding/plugins/PixelVertexWorkSpaceSoAHostAlpaka.h" diff --git a/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h b/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h index 3574cfd350bc9..835e2f3327d7c 100644 --- a/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h +++ b/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksByDensity.h @@ -17,51 +17,40 @@ 
namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { - using VtxSoAView = ::reco::ZVertexSoAView; - using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; - // this algo does not really scale as it works in a single block... - // enough for <10K tracks we have + // This algo does not really scale as it works in a single block... + // It should be good enough for <10K tracks we have. // - // based on Rodrighez&Laio algo - // - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void __attribute__((always_inline)) - clusterTracksByDensity(const TAcc& acc, - VtxSoAView& pdata, - WsSoAView& pws, - int minT, // min number of neighbours to be "seed" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster + // Based on Rodrighez&Laio algo. + ALPAKA_FN_ACC ALPAKA_FN_INLINE void clusterTracksByDensity(Acc1D const& acc, + VtxSoAView& data, + TrkSoAView& trkdata, + WsSoAView& ws, + int minT, // min number of neighbours to be "seed" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster ) { - using namespace vertexFinder; - constexpr bool verbose = false; // in principle the compiler should optmize out if false - const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + constexpr bool verbose = false; if constexpr (verbose) { if (cms::alpakatools::once_per_block(acc)) printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); } - auto er2mx = errmax * errmax; - auto& __restrict__ data = pdata; - auto& __restrict__ ws = pws; auto nt = ws.ntrks(); + ALPAKA_ASSERT_ACC(static_cast(nt) <= ws.metadata().size()); + ALPAKA_ASSERT_ACC(static_cast(nt) <= trkdata.metadata().size()); + float const* __restrict__ zt = ws.zt(); float const* __restrict__ ezt2 = ws.ezt2(); - - uint32_t& nvFinal = data.nvFinal(); - uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt(); - int32_t* 
__restrict__ nn = data.ndof(); int32_t* __restrict__ iv = ws.iv(); - + int32_t* __restrict__ nn = trkdata.ndof(); ALPAKA_ASSERT_ACC(zt); ALPAKA_ASSERT_ACC(ezt2); ALPAKA_ASSERT_ACC(izt); - ALPAKA_ASSERT_ACC(nn); ALPAKA_ASSERT_ACC(iv); + ALPAKA_ASSERT_ACC(nn); using Hist = cms::alpakatools::HistoContainer; auto& hist = alpaka::declareSharedVar(acc); @@ -70,43 +59,48 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { for (auto j : cms::alpakatools::uniform_elements(acc, Hist::totbins())) { hist.off[j] = 0; } + for (auto j : cms::alpakatools::uniform_elements(acc, 32)) { + hws[j] = 0; // used by prefix scan in hist.finalize() + } alpaka::syncBlockThreads(acc); if constexpr (verbose) { if (cms::alpakatools::once_per_block(acc)) printf("booked hist with %d bins, size %d for %d tracks\n", hist.totbins(), hist.capacity(), nt); } + + ALPAKA_ASSERT_ACC(static_cast(nt) <= std::numeric_limits::max()); ALPAKA_ASSERT_ACC(static_cast(nt) <= hist.capacity()); + ALPAKA_ASSERT_ACC(eps <= 0.1f); // see below - // fill hist (bin shall be wider than "eps") + // fill hist (bin shall be wider than "eps") for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - ALPAKA_ASSERT_ACC(i < ::zVertex::MAXTRACKS); - int iz = int(zt[i] * 10.); // valid if eps<=0.1 - // iz = std::clamp(iz, INT8_MIN, INT8_MAX); // sorry c++17 only - iz = std::min(std::max(iz, INT8_MIN), INT8_MAX); - izt[i] = iz - INT8_MIN; + int iz = static_cast(zt[i] * 10.f); // valid if eps <= 0.1 + iz = std::clamp(iz, INT8_MIN, INT8_MAX); ALPAKA_ASSERT_ACC(iz - INT8_MIN >= 0); ALPAKA_ASSERT_ACC(iz - INT8_MIN < 256); + izt[i] = iz - INT8_MIN; hist.count(acc, izt[i]); iv[i] = i; nn[i] = 0; } alpaka::syncBlockThreads(acc); - if (threadIdxLocal < 32) - hws[threadIdxLocal] = 0; // used by prefix scan... 
- alpaka::syncBlockThreads(acc); + hist.finalize(acc, hws); alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_ACC(hist.size() == nt); for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { hist.fill(acc, izt[i], uint16_t(i)); } alpaka::syncBlockThreads(acc); + // count neighbours + const auto errmax2 = errmax * errmax; for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - if (ezt2[i] > er2mx) + if (ezt2[i] > errmax2) continue; - auto loop = [&](uint32_t j) { + cms::alpakatools::forEachInBins(hist, izt[i], 1, [&](uint32_t j) { if (i == j) return; auto dist = std::abs(zt[i] - zt[j]); @@ -115,16 +109,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) return; nn[i]++; - }; - - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + }); } alpaka::syncBlockThreads(acc); // find closest above me .... (we ignore the possibility of two j at same distance from i) for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { float mdist = eps; - auto loop = [&](uint32_t j) { + cms::alpakatools::forEachInBins(hist, izt[i], 1, [&](uint32_t j) { if (nn[j] < nn[i]) return; if (nn[j] == nn[i] && zt[j] >= zt[i]) @@ -136,8 +128,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { return; // (break natural order???) mdist = dist; iv[i] = j; // assign to cluster (better be unique??) - }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + }); } alpaka::syncBlockThreads(acc); @@ -168,11 +159,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { #endif #ifdef GPU_DEBUG - // and verify that we did not spit any cluster... + // and verify that we did not split any cluster... 
for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { auto minJ = i; auto mdist = eps; - auto loop = [&](uint32_t j) { + cms::alpakatools::forEachInBins(hist, izt[i], 1, [&](uint32_t j) { if (nn[j] < nn[i]) return; if (nn[j] == nn[i] && zt[j] >= zt[i]) @@ -184,8 +175,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { return; mdist = dist; minJ = j; - }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + }); + // should belong to the same cluster... ALPAKA_ASSERT_ACC(iv[i] == iv[minJ]); ALPAKA_ASSERT_ACC(nn[i] <= nn[iv[i]]); @@ -211,7 +202,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { } alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_ACC(foundClusters < ::zVertex::MAXVTX); + ALPAKA_ASSERT_ACC(static_cast(foundClusters) < data.metadata().size()); // propagate the negative id to all the tracks in the cluster. for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { @@ -227,24 +218,27 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { iv[i] = -iv[i] - 1; } - nvIntermediate = nvFinal = foundClusters; + ws.nvIntermediate() = foundClusters; + data.nvFinal() = foundClusters; + if constexpr (verbose) { if (cms::alpakatools::once_per_block(acc)) printf("found %d proto vertices\n", foundClusters); } } + class ClusterTracksByDensityKernel { public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, - VtxSoAView pdata, - WsSoAView pws, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, + VtxSoAView data, + TrkSoAView trkdata, + WsSoAView ws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster ) const { - clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max); + clusterTracksByDensity(acc, data, trkdata, ws, minT, eps, errmax, chi2max); } }; diff --git a/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h 
b/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h index d2cf3601650ae..a1e5c28da2083 100644 --- a/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h +++ b/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksDBSCAN.h @@ -17,46 +17,40 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { - using VtxSoAView = ::reco::ZVertexSoAView; - using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; - // this algo does not really scale as it works in a single block... - // enough for <10K tracks we have + // This algo does not really scale as it works in a single block... + // It should be good enough for <10K tracks we have. class ClusterTracksDBSCAN { public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, - VtxSoAView pdata, - WsSoAView pws, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, + VtxSoAView data, + TrkSoAView trkdata, + WsSoAView ws, int minT, // min number of neighbours to be "core" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster ) const { - constexpr bool verbose = false; // in principle the compiler should optmize out if false - const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); + constexpr bool verbose = false; + if constexpr (verbose) { if (cms::alpakatools::once_per_block(acc)) printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); } - auto er2mx = errmax * errmax; - auto& __restrict__ data = pdata; - auto& __restrict__ ws = pws; auto nt = ws.ntrks(); + ALPAKA_ASSERT_ACC(static_cast(nt) <= ws.metadata().size()); + ALPAKA_ASSERT_ACC(static_cast(nt) <= trkdata.metadata().size()); + float const* __restrict__ zt = ws.zt(); float const* __restrict__ ezt2 = ws.ezt2(); - - uint32_t& nvFinal = data.nvFinal(); - uint32_t& nvIntermediate = ws.nvIntermediate(); - uint8_t* __restrict__ izt = ws.izt(); - int32_t* __restrict__ nn = data.ndof(); int32_t* __restrict__ iv = ws.iv(); - + int32_t* __restrict__ nn 
= trkdata.ndof(); ALPAKA_ASSERT_ACC(zt); + ALPAKA_ASSERT_ACC(ezt2); + ALPAKA_ASSERT_ACC(izt); ALPAKA_ASSERT_ACC(iv); ALPAKA_ASSERT_ACC(nn); - ALPAKA_ASSERT_ACC(ezt2); using Hist = cms::alpakatools::HistoContainer; auto& hist = alpaka::declareSharedVar(acc); @@ -65,6 +59,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { for (auto j : cms::alpakatools::uniform_elements(acc, Hist::totbins())) { hist.off[j] = 0; } + for (auto j : cms::alpakatools::uniform_elements(acc, 32)) { + hws[j] = 0; // used by prefix scan in hist.finalize() + } alpaka::syncBlockThreads(acc); if constexpr (verbose) { @@ -72,12 +69,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); } + ALPAKA_ASSERT_ACC(static_cast(nt) <= std::numeric_limits::max()); ALPAKA_ASSERT_ACC(static_cast(nt) <= hist.capacity()); + ALPAKA_ASSERT_ACC(eps <= 0.1f); // see below - // fill hist (bin shall be wider than "eps") + // fill hist (bin shall be wider than "eps") for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - ALPAKA_ASSERT_ACC(i < ::zVertex::MAXTRACKS); - int iz = int(zt[i] * 10.); // valid if eps<=0.1 + int iz = static_cast(zt[i] * 10.f); // valid if eps <= 0.1 iz = std::clamp(iz, INT8_MIN, INT8_MAX); izt[i] = iz - INT8_MIN; ALPAKA_ASSERT_ACC(iz - INT8_MIN >= 0); @@ -87,34 +85,30 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { nn[i] = 0; } alpaka::syncBlockThreads(acc); - if (threadIdxLocal < 32) - hws[threadIdxLocal] = 0; // used by prefix scan... 
- alpaka::syncBlockThreads(acc); + hist.finalize(acc, hws); alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_ACC(hist.size() == nt); for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - hist.fill(acc, izt[i], uint32_t(i)); + hist.fill(acc, izt[i], uint16_t(i)); } alpaka::syncBlockThreads(acc); // count neighbours + const auto errmax2 = errmax * errmax; for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - if (ezt2[i] > er2mx) + if (ezt2[i] > errmax2) continue; - auto loop = [&](uint32_t j) { + cms::alpakatools::forEachInBins(hist, izt[i], 1, [&](uint32_t j) { if (i == j) return; auto dist = std::abs(zt[i] - zt[j]); if (dist > eps) return; - // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; nn[i]++; - }; - - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + }); } - alpaka::syncBlockThreads(acc); // find NN with smaller z... @@ -122,7 +116,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { if (nn[i] < minT) continue; // DBSCAN core rule float mz = zt[i]; - auto loop = [&](uint32_t j) { + cms::alpakatools::forEachInBins(hist, izt[i], 1, [&](uint32_t j) { if (zt[j] >= mz) return; if (nn[j] < minT) @@ -130,13 +124,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { auto dist = std::abs(zt[i] - zt[j]); if (dist > eps) return; - // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; mz = zt[j]; iv[i] = j; // assign to cluster (better be unique??) - }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + }); } - alpaka::syncBlockThreads(acc); #ifdef GPU_DEBUG @@ -155,7 +146,6 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { m = iv[m]; iv[i] = m; } - alpaka::syncBlockThreads(acc); #ifdef GPU_DEBUG @@ -168,27 +158,24 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { #endif #ifdef GPU_DEBUG - // and verify that we did not spit any cluster... + // and verify that we did not split any cluster... 
for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { if (nn[i] < minT) continue; // DBSCAN core rule ALPAKA_ASSERT_ACC(zt[iv[i]] <= zt[i]); - auto loop = [&](uint32_t j) { + cms::alpakatools::forEachInBins(hist, izt[i], 1, [&](uint32_t j) { if (nn[j] < minT) return; // DBSCAN core rule auto dist = std::abs(zt[i] - zt[j]); if (dist > eps) return; - // if (dist*dist>chi2max*(ezt2[i]+ezt2[j])) return; // they should belong to the same cluster, isn't it? if (iv[i] != iv[j]) { printf("ERROR %d %d %f %f %d\n", i, iv[i], zt[i], zt[iv[i]], iv[iv[i]]); printf(" %d %d %f %f %d\n", j, iv[j], zt[j], zt[iv[j]], iv[iv[j]]); - ; } ALPAKA_ASSERT_ACC(iv[i] == iv[j]); - }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + }); } alpaka::syncBlockThreads(acc); #endif @@ -199,7 +186,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { if (nn[i] >= minT) continue; // DBSCAN edge rule float mdist = eps; - auto loop = [&](uint32_t j) { + cms::alpakatools::forEachInBins(hist, izt[i], 1, [&](uint32_t j) { if (nn[j] < minT) return; // DBSCAN core rule auto dist = std::abs(zt[i] - zt[j]); @@ -209,8 +196,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { return; // needed? mdist = dist; iv[i] = iv[j]; // assign to cluster (better be unique??) - }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + }); } auto& foundClusters = alpaka::declareSharedVar(acc); @@ -231,7 +217,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { } alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_ACC(foundClusters < ::zVertex::MAXVTX); + ALPAKA_ASSERT_ACC(static_cast(foundClusters) < data.metadata().size()); // propagate the negative id to all the tracks in the cluster. 
for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { @@ -247,7 +233,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { iv[i] = -iv[i] - 1; } - nvIntermediate = nvFinal = foundClusters; + ws.nvIntermediate() = foundClusters; + data.nvFinal() = foundClusters; if constexpr (verbose) { if (cms::alpakatools::once_per_block(acc)) diff --git a/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h b/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h index c779f5d4affe2..9285cd8e214d2 100644 --- a/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h +++ b/RecoVertex/PixelVertexFinding/plugins/alpaka/clusterTracksIterative.h @@ -7,7 +7,7 @@ #include -#include "DataFormats/VertexSoA/interface/ZVertexDefinitions.h" +#include "DataFormats/VertexSoA/interface/ZVertexSoA.h" #include "HeterogeneousCore/AlpakaInterface/interface/HistoContainer.h" #include "HeterogeneousCore/AlpakaInterface/interface/config.h" #include "HeterogeneousCore/AlpakaInterface/interface/workdivision.h" @@ -15,219 +15,212 @@ #include "vertexFinder.h" -namespace ALPAKA_ACCELERATOR_NAMESPACE { - namespace vertexFinder { - - // this algo does not really scale as it works in a single block... 
- // enough for <10K tracks we have - class ClusterTracksIterative { - public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, - VtxSoAView pdata, - WsSoAView pws, - int minT, // min number of neighbours to be "core" - float eps, // max absolute distance to cluster - float errmax, // max error to be "seed" - float chi2max // max normalized distance to cluster - ) const { - constexpr bool verbose = false; // in principle the compiler should optmize out if false - const uint32_t threadIdxLocal(alpaka::getIdx(acc)[0u]); - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) - printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); - } - auto er2mx = errmax * errmax; - - auto& __restrict__ data = pdata; - auto& __restrict__ ws = pws; - auto nt = ws.ntrks(); - float const* __restrict__ zt = ws.zt(); - float const* __restrict__ ezt2 = ws.ezt2(); - - uint32_t& nvFinal = data.nvFinal(); - uint32_t& nvIntermediate = ws.nvIntermediate(); - - uint8_t* __restrict__ izt = ws.izt(); - int32_t* __restrict__ nn = data.ndof(); - int32_t* __restrict__ iv = ws.iv(); - - ALPAKA_ASSERT_ACC(zt); - ALPAKA_ASSERT_ACC(nn); - ALPAKA_ASSERT_ACC(iv); - ALPAKA_ASSERT_ACC(ezt2); - - using Hist = cms::alpakatools::HistoContainer; - auto& hist = alpaka::declareSharedVar(acc); - auto& hws = alpaka::declareSharedVar(acc); - - for (auto j : cms::alpakatools::uniform_elements(acc, Hist::totbins())) { - hist.off[j] = 0; - } - alpaka::syncBlockThreads(acc); +namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { + + // This algo does not really scale as it works in a single block... + // It should be good enough for <10K tracks we have. 
+ class ClusterTracksIterative { + public: + ALPAKA_FN_ACC void operator()(Acc1D const& acc, + VtxSoAView data, + TrkSoAView trkdata, + WsSoAView ws, + int minT, // min number of neighbours to be "core" + float eps, // max absolute distance to cluster + float errmax, // max error to be "seed" + float chi2max // max normalized distance to cluster + ) const { + constexpr bool verbose = false; + + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("params %d %f %f %f\n", minT, eps, errmax, chi2max); + } - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) - printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); - } + auto nt = ws.ntrks(); + ALPAKA_ASSERT_ACC(static_cast(nt) <= ws.metadata().size()); + ALPAKA_ASSERT_ACC(static_cast(nt) <= trkdata.metadata().size()); + + float const* __restrict__ zt = ws.zt(); + float const* __restrict__ ezt2 = ws.ezt2(); + uint8_t* __restrict__ izt = ws.izt(); + int32_t* __restrict__ iv = ws.iv(); + int32_t* __restrict__ nn = trkdata.ndof(); + ALPAKA_ASSERT_ACC(zt); + ALPAKA_ASSERT_ACC(ezt2); + ALPAKA_ASSERT_ACC(izt); + ALPAKA_ASSERT_ACC(iv); + ALPAKA_ASSERT_ACC(nn); + + using Hist = cms::alpakatools::HistoContainer; + auto& hist = alpaka::declareSharedVar(acc); + auto& hws = alpaka::declareSharedVar(acc); + + for (auto j : cms::alpakatools::uniform_elements(acc, Hist::totbins())) { + hist.off[j] = 0; + } + for (auto j : cms::alpakatools::uniform_elements(acc, 32)) { + hws[j] = 0; // used by prefix scan in hist.finalize() + } + alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_ACC(static_cast(nt) <= hist.capacity()); - - // fill hist (bin shall be wider than "eps") - for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - ALPAKA_ASSERT_ACC(i < ::zVertex::MAXTRACKS); - int iz = int(zt[i] * 10.); // valid if eps<=0.1 - iz = std::clamp(iz, INT8_MIN, INT8_MAX); - izt[i] = iz - INT8_MIN; - ALPAKA_ASSERT_ACC(iz - INT8_MIN >= 0); - ALPAKA_ASSERT_ACC(iz 
- INT8_MIN < 256); - hist.count(acc, izt[i]); - iv[i] = i; - nn[i] = 0; - } - alpaka::syncBlockThreads(acc); + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("booked hist with %d bins, size %d for %d tracks\n", hist.nbins(), hist.capacity(), nt); + } - if (threadIdxLocal < 32) - hws[threadIdxLocal] = 0; // used by prefix scan... - alpaka::syncBlockThreads(acc); + ALPAKA_ASSERT_ACC(static_cast(nt) <= std::numeric_limits::max()); + ALPAKA_ASSERT_ACC(static_cast(nt) <= hist.capacity()); + ALPAKA_ASSERT_ACC(eps <= 0.1f); // see below + + // fill hist (bin shall be wider than "eps") + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { + int iz = static_cast(zt[i] * 10.f); // valid if eps <= 0.1 + iz = std::clamp(iz, INT8_MIN, INT8_MAX); + izt[i] = iz - INT8_MIN; + ALPAKA_ASSERT_ACC(iz - INT8_MIN >= 0); + ALPAKA_ASSERT_ACC(iz - INT8_MIN < 256); + hist.count(acc, izt[i]); + iv[i] = i; + nn[i] = 0; + } + alpaka::syncBlockThreads(acc); - hist.finalize(acc, hws); - alpaka::syncBlockThreads(acc); + hist.finalize(acc, hws); + alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_ACC(hist.size() == nt); - for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - hist.fill(acc, izt[i], uint16_t(i)); - } - alpaka::syncBlockThreads(acc); - - // count neighbours - for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - if (ezt2[i] > er2mx) - continue; - auto loop = [&](uint32_t j) { - if (i == j) - return; - auto dist = std::abs(zt[i] - zt[j]); - if (dist > eps) - return; - if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) - return; - nn[i]++; - }; - - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); - } + ALPAKA_ASSERT_ACC(hist.size() == nt); + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { + hist.fill(acc, izt[i], uint16_t(i)); + } + alpaka::syncBlockThreads(acc); + + // count neighbours + const auto errmax2 = errmax * errmax; + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { + if (ezt2[i] > errmax2) 
+ continue; + cms::alpakatools::forEachInBins(hist, izt[i], 1, [&](uint32_t j) { + if (i == j) + return; + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; + nn[i]++; + }); + } - auto& nloops = alpaka::declareSharedVar(acc); - nloops = 0; - - alpaka::syncBlockThreads(acc); - - // cluster seeds only - bool more = true; - while (alpaka::syncBlockThreadsPredicate(acc, more)) { - if (1 == nloops % 2) { - for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - auto m = iv[i]; - while (m != iv[m]) - m = iv[m]; - iv[i] = m; - } - } else { - more = false; - for (auto k : cms::alpakatools::uniform_elements(acc, hist.size())) { - auto p = hist.begin() + k; - auto i = (*p); - auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1)); - if (nn[i] < minT) - continue; // DBSCAN core rule - auto loop = [&](uint32_t j) { - ALPAKA_ASSERT_ACC(i != j); - if (nn[j] < minT) - return; // DBSCAN core rule - auto dist = std::abs(zt[i] - zt[j]); - if (dist > eps) - return; - if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) - return; - auto old = alpaka::atomicMin(acc, &iv[j], iv[i], alpaka::hierarchy::Blocks{}); - if (old != iv[i]) { - // end the loop only if no changes were applied - more = true; - } - alpaka::atomicMin(acc, &iv[i], old, alpaka::hierarchy::Blocks{}); - }; - ++p; - for (; p < hist.end(be); ++p) - loop(*p); - } // for i + auto& nloops = alpaka::declareSharedVar(acc); + nloops = 0; + alpaka::syncBlockThreads(acc); + + // cluster seeds only + bool more = true; + while (alpaka::syncBlockThreadsPredicate(acc, more)) { + if (1 == nloops % 2) { + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { + auto m = iv[i]; + while (m != iv[m]) + m = iv[m]; + iv[i] = m; } - if (threadIdxLocal == 0) - ++nloops; - } // while - - // collect edges (assign to closest cluster of closest point??? 
here to closest point) - for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule - if (nn[i] >= minT) - continue; // DBSCAN edge rule - float mdist = eps; - auto loop = [&](int j) { - if (nn[j] < minT) - return; // DBSCAN core rule - auto dist = std::abs(zt[i] - zt[j]); - if (dist > mdist) - return; - if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) - return; // needed? - mdist = dist; - iv[i] = iv[j]; // assign to cluster (better be unique??) - }; - cms::alpakatools::forEachInBins(hist, izt[i], 1, loop); + } else { + more = false; + for (auto k : cms::alpakatools::uniform_elements(acc, hist.size())) { + auto p = hist.begin() + k; + auto i = *p; + ++p; + if (nn[i] < minT) + continue; // DBSCAN core rule + auto be = std::min(Hist::bin(izt[i]) + 1, int(hist.nbins() - 1)); + for (; p < hist.end(be); ++p) { + uint32_t j = *p; + ALPAKA_ASSERT_ACC(i != j); + if (nn[j] < minT) + continue; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > eps) + continue; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + continue; + auto old = alpaka::atomicMin(acc, &iv[j], iv[i], alpaka::hierarchy::Threads{}); + if (old != iv[i]) { + // end the loop only if no changes were applied + more = true; + } + alpaka::atomicMin(acc, &iv[i], old, alpaka::hierarchy::Threads{}); + } // inner loop on p (and j) + } // outer loop on k (and i) } + if (cms::alpakatools::once_per_block(acc)) + ++nloops; + } // while + + // collect edges (assign to closest cluster of closest point??? 
here to closest point) + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { + // if (nn[i]==0 || nn[i]>=minT) continue; // DBSCAN edge rule + if (nn[i] >= minT) + continue; // DBSCAN edge rule + float mdist = eps; + cms::alpakatools::forEachInBins(hist, izt[i], 1, [&](int j) { + if (nn[j] < minT) + return; // DBSCAN core rule + auto dist = std::abs(zt[i] - zt[j]); + if (dist > mdist) + return; + if (dist * dist > chi2max * (ezt2[i] + ezt2[j])) + return; // needed? + mdist = dist; + iv[i] = iv[j]; // assign to cluster (better be unique??) + }); + } - auto& foundClusters = alpaka::declareSharedVar(acc); - foundClusters = 0; - alpaka::syncBlockThreads(acc); - - // find the number of different clusters, identified by a tracks with clus[i] == i; - // mark these tracks with a negative id. - for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - if (iv[i] == int(i)) { - if (nn[i] >= minT) { - auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); - iv[i] = -(old + 1); - } else { // noise - iv[i] = -9998; - } + auto& foundClusters = alpaka::declareSharedVar(acc); + foundClusters = 0; + alpaka::syncBlockThreads(acc); + + // find the number of different clusters, identified by a tracks with clus[i] == i; + // mark these tracks with a negative id. + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { + if (iv[i] == int(i)) { + if (nn[i] >= minT) { + auto old = alpaka::atomicInc(acc, &foundClusters, 0xffffffff, alpaka::hierarchy::Threads{}); + iv[i] = -(old + 1); + } else { // noise + iv[i] = -9998; } } - alpaka::syncBlockThreads(acc); + } + alpaka::syncBlockThreads(acc); - ALPAKA_ASSERT_ACC(foundClusters < ::zVertex::MAXVTX); + ALPAKA_ASSERT_ACC(static_cast(foundClusters) < data.metadata().size()); - // propagate the negative id to all the tracks in the cluster. 
- for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - if (iv[i] >= 0) { - // mark each track in a cluster with the same id as the first one - iv[i] = iv[iv[i]]; - } + // propagate the negative id to all the tracks in the cluster. + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { + if (iv[i] >= 0) { + // mark each track in a cluster with the same id as the first one + iv[i] = iv[iv[i]]; } - alpaka::syncBlockThreads(acc); + } + alpaka::syncBlockThreads(acc); - // adjust the cluster id to be a positive value starting from 0 - for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - iv[i] = -iv[i] - 1; - } + // adjust the cluster id to be a positive value starting from 0 + for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { + iv[i] = -iv[i] - 1; + } - nvIntermediate = nvFinal = foundClusters; + ws.nvIntermediate() = foundClusters; + data.nvFinal() = foundClusters; - if constexpr (verbose) { - if (cms::alpakatools::once_per_block(acc)) - printf("found %d proto vertices\n", foundClusters); - } + if constexpr (verbose) { + if (cms::alpakatools::once_per_block(acc)) + printf("found %d proto vertices\n", foundClusters); } - }; - } // namespace vertexFinder -} // namespace ALPAKA_ACCELERATOR_NAMESPACE -#endif // RecoVertex_PixelVertexFinding_plugins_clusterTracksIterativeAlpaka_h + } + }; + +} // namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder + +#endif // RecoTracker_PixelVertexFinding_plugins_clusterTracksIterativeAlpaka_h diff --git a/RecoVertex/PixelVertexFinding/plugins/alpaka/fitVertices.h b/RecoVertex/PixelVertexFinding/plugins/alpaka/fitVertices.h index 371eab61bb599..28fa923bcf82e 100644 --- a/RecoVertex/PixelVertexFinding/plugins/alpaka/fitVertices.h +++ b/RecoVertex/PixelVertexFinding/plugins/alpaka/fitVertices.h @@ -15,15 +15,16 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void fitVertices(const TAcc& acc, + ALPAKA_FN_ACC 
ALPAKA_FN_INLINE __attribute__((always_inline)) void fitVertices(Acc1D const& acc, VtxSoAView& pdata, + TrkSoAView& ptrkdata, WsSoAView& pws, float chi2Max // for outlier rejection ) { constexpr bool verbose = false; // in principle the compiler should optmize out if false auto& __restrict__ data = pdata; + auto& __restrict__ trkdata = ptrkdata; auto& __restrict__ ws = pws; auto nt = ws.ntrks(); float const* __restrict__ zt = ws.zt(); @@ -34,7 +35,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { uint32_t& nvFinal = data.nvFinal(); uint32_t& nvIntermediate = ws.nvIntermediate(); - int32_t* __restrict__ nn = data.ndof(); + int32_t* __restrict__ nn = trkdata.ndof(); int32_t* __restrict__ iv = ws.iv(); ALPAKA_ASSERT_ACC(nvFinal <= nvIntermediate); @@ -123,13 +124,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { class FitVerticesKernel { public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, VtxSoAView pdata, + TrkSoAView ptrkdata, WsSoAView pws, float chi2Max // for outlier rejection ) const { - fitVertices(acc, pdata, pws, chi2Max); + fitVertices(acc, pdata, ptrkdata, pws, chi2Max); } }; diff --git a/RecoVertex/PixelVertexFinding/plugins/alpaka/sortByPt2.h b/RecoVertex/PixelVertexFinding/plugins/alpaka/sortByPt2.h index eae01e2061117..6b715c1f5579b 100644 --- a/RecoVertex/PixelVertexFinding/plugins/alpaka/sortByPt2.h +++ b/RecoVertex/PixelVertexFinding/plugins/alpaka/sortByPt2.h @@ -20,10 +20,10 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { using VtxSoAView = ::reco::ZVertexSoAView; + using TrkSoAView = ::reco::ZVertexTracksSoAView; using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE void sortByPt2(const TAcc& acc, VtxSoAView& data, WsSoAView& ws) { + ALPAKA_FN_ACC ALPAKA_FN_INLINE void sortByPt2(Acc1D const& acc, VtxSoAView& data, TrkSoAView& trkdata, WsSoAView& ws) { auto nt = ws.ntrks(); float const* __restrict__ 
ptt2 = ws.ptt2(); uint32_t const& nvFinal = data.nvFinal(); @@ -37,7 +37,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { // fill indexing for (auto i : cms::alpakatools::uniform_elements(acc, nt)) { - data.idv()[ws.itrk()[i]] = iv[i]; + trkdata.idv()[ws.itrk()[i]] = iv[i]; }; // can be done asynchronously at the end of previous event @@ -60,7 +60,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { return; } - if constexpr (not cms::alpakatools::requires_single_thread_per_block_v) { + if constexpr (not cms::alpakatools::requires_single_thread_per_block_v) { auto& sws = alpaka::declareSharedVar(acc); // sort using only 16 bits cms::alpakatools::radixSort(acc, ptv2, sortInd, sws, nvFinal); @@ -73,9 +73,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { class SortByPt2Kernel { public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws) const { - sortByPt2(acc, pdata, pws); + ALPAKA_FN_ACC void operator()(Acc1D const& acc, VtxSoAView pdata, TrkSoAView ptrkdata, WsSoAView pws) const { + sortByPt2(acc, pdata, ptrkdata, pws); } }; diff --git a/RecoVertex/PixelVertexFinding/plugins/alpaka/splitVertices.h b/RecoVertex/PixelVertexFinding/plugins/alpaka/splitVertices.h index a286b454b0716..6517421e62193 100644 --- a/RecoVertex/PixelVertexFinding/plugins/alpaka/splitVertices.h +++ b/RecoVertex/PixelVertexFinding/plugins/alpaka/splitVertices.h @@ -16,12 +16,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { using VtxSoAView = ::reco::ZVertexSoAView; + using TrkSoAView = ::reco::ZVertexTracksSoAView; using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; - template - ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void splitVertices(const TAcc& acc, - VtxSoAView& data, - WsSoAView& ws, - float maxChi2) { + + ALPAKA_FN_ACC ALPAKA_FN_INLINE __attribute__((always_inline)) void splitVertices( + Acc1D const& acc, VtxSoAView& data, TrkSoAView& trkdata, WsSoAView& ws, float maxChi2) { 
constexpr bool verbose = false; // in principle the compiler should optmize out if false constexpr uint32_t MAXTK = 512; @@ -33,7 +32,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { // one vertex per block for (auto kv : cms::alpakatools::independent_groups(acc, data.nvFinal())) { - int32_t ndof = data[kv].ndof(); + int32_t ndof = trkdata[kv].ndof(); if (ndof < 4) continue; if (data[kv].chi2() < maxChi2 * float(ndof)) @@ -140,9 +139,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { class SplitVerticesKernel { public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView data, WsSoAView ws, float maxChi2) const { - splitVertices(acc, data, ws, maxChi2); + ALPAKA_FN_ACC void operator()( + Acc1D const& acc, VtxSoAView data, TrkSoAView trkdata, WsSoAView ws, float maxChi2) const { + splitVertices(acc, data, trkdata, ws, maxChi2); } }; diff --git a/RecoVertex/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc b/RecoVertex/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc index 2a0172262ad3a..2350f73a76cbb 100644 --- a/RecoVertex/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc +++ b/RecoVertex/PixelVertexFinding/plugins/alpaka/vertexFinder.dev.cc @@ -28,11 +28,11 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { template class LoadTracks { public: - template >> - ALPAKA_FN_ACC void operator()(const TAcc& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, reco::TrackSoAConstView tracks_view, - VtxSoAView soa, - WsSoAView pws, + VtxSoAView data, + TrkSoAView trkdata, + WsSoAView ws, float ptMin, float ptMax) const { auto const* quality = tracks_view.quality(); @@ -42,103 +42,109 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { [[maybe_unused]] auto nHits = helper::nHits(tracks_view, idx); ALPAKA_ASSERT_ACC(nHits >= 3); - // initialize soa... 
- soa[idx].idv() = -1; + // initialize the track data + trkdata[idx].idv() = -1; + // do not use triplets if (reco::isTriplet(tracks_view, idx)) - continue; // no triplets + continue; + + // use only "high purity" track if (quality[idx] < ::pixelTrack::Quality::highPurity) continue; auto pt = tracks_view[idx].pt(); - + // pT min cut if (pt < ptMin) continue; - // clamp pt + // clamp pT to the pTmax pt = std::min(pt, ptMax); - auto& data = pws; - auto it = alpaka::atomicAdd(acc, &data.ntrks(), 1u, alpaka::hierarchy::Blocks{}); - data[it].itrk() = idx; - data[it].zt() = reco::zip(tracks_view, idx); - data[it].ezt2() = tracks_view[idx].covariance()(14); - data[it].ptt2() = pt * pt; + // load the track data into the workspace + auto it = alpaka::atomicAdd(acc, &ws.ntrks(), 1u, alpaka::hierarchy::Blocks{}); + ws[it].itrk() = idx; + ws[it].zt() = reco::zip(tracks_view, idx); + ws[it].ezt2() = tracks_view[idx].covariance()(14); + ws[it].ptt2() = pt * pt; } } }; + // #define THREE_KERNELS #ifndef THREE_KERNELS class VertexFinderOneKernel { public: - template >> - ALPAKA_FN_ACC void operator()(const TAcc& acc, - VtxSoAView pdata, - WsSoAView pws, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, + VtxSoAView data, + TrkSoAView trkdata, + WsSoAView ws, bool doSplit, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) const { - clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max); + clusterTracksByDensity(acc, data, trkdata, ws, minT, eps, errmax, chi2max); alpaka::syncBlockThreads(acc); - fitVertices(acc, pdata, pws, maxChi2ForFirstFit); + fitVertices(acc, data, trkdata, ws, maxChi2ForFirstFit); alpaka::syncBlockThreads(acc); if (doSplit) { - splitVertices(acc, pdata, pws, maxChi2ForSplit); + splitVertices(acc, data, trkdata, ws, maxChi2ForSplit); alpaka::syncBlockThreads(acc); - fitVertices(acc, pdata, pws, 
maxChi2ForFinalFit); + fitVertices(acc, data, trkdata, ws, maxChi2ForFinalFit); alpaka::syncBlockThreads(acc); } - sortByPt2(acc, pdata, pws); + sortByPt2(acc, data, trkdata, ws); } }; #else class VertexFinderKernel1 { public: - template >> - ALPAKA_FN_ACC void operator()(const TAcc& acc, - VtxSoAView pdata, - WsSoAView pws, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, + VtxSoAView data, + WsSoAView ws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) const { - clusterTracksByDensity(pdata, pws, minT, eps, errmax, chi2max); + clusterTracksByDensity(acc, data, ws, minT, eps, errmax, chi2max); alpaka::syncBlockThreads(acc); - fitVertices(pdata, pws, maxChi2ForFirstFit); + fitVertices(acc, data, ws, maxChi2ForFirstFit); } }; + class VertexFinderKernel2 { public: - template >> - ALPAKA_FN_ACC void operator()(const TAcc& acc, VtxSoAView pdata, WsSoAView pws) const { - fitVertices(pdata, pws, maxChi2ForFinalFit); + ALPAKA_FN_ACC void operator()(Acc1D const& acc, VtxSoAView data, WsSoAView ws) const { + fitVertices(acc, data, ws, maxChi2ForFinalFit); alpaka::syncBlockThreads(acc); - sortByPt2(pdata, pws); + sortByPt2(data, ws); } }; #endif template ZVertexSoACollection Producer::makeAsync(Queue& queue, - const reco::TrackSoAConstView& tracks_view, + reco::TrackSoAConstView const& tracks_view, + int maxVertices, float ptMin, float ptMax) const { #ifdef PIXVERTEX_DEBUG_PRODUCE std::cout << "producing Vertices on GPU" << std::endl; #endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexSoACollection vertices(queue); - - auto soa = vertices.view(); + const auto maxTracks = tracks_view.metadata().size(); + ZVertexSoACollection vertices({{maxVertices, maxTracks}}, queue); + auto data = vertices.view(); + auto trkdata = vertices.view(); - auto ws_d = PixelVertexWorkSpaceSoADevice(::zVertex::MAXTRACKS, queue); + PixelVertexWorkSpaceSoADevice 
workspace(maxTracks, queue); + auto ws = workspace.view(); // Initialize const auto initWorkDiv = cms::alpakatools::make_workdiv(1, 1); - alpaka::exec(queue, initWorkDiv, Init{}, soa, ws_d.view()); + alpaka::exec(queue, initWorkDiv, Init{}, data, ws); // Load Tracks const uint32_t blockSize = 128; @@ -146,7 +152,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { cms::alpakatools::divide_up_by(tracks_view.metadata().size() + blockSize - 1, blockSize); const auto loadTracksWorkDiv = cms::alpakatools::make_workdiv(numberOfBlocks, blockSize); alpaka::exec( - queue, loadTracksWorkDiv, LoadTracks{}, tracks_view, soa, ws_d.view(), ptMin, ptMax); + queue, loadTracksWorkDiv, LoadTracks{}, tracks_view, data, trkdata, ws, ptMin, ptMax); // Running too many thread lead to problems when printf is enabled. const auto finderSorterWorkDiv = cms::alpakatools::make_workdiv(1, 1024 - 128); @@ -158,8 +164,9 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { alpaka::exec(queue, finderSorterWorkDiv, VertexFinderOneKernel{}, - soa, - ws_d.view(), + data, + trkdata, + ws, doSplitting_, minT, eps, @@ -167,34 +174,34 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { chi2max); #else alpaka::exec( - queue, finderSorterWorkDiv, VertexFinderOneKernel{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + queue, finderSorterWorkDiv, VertexFinderOneKernel{}, data, trkdata, ws, minT, eps, errmax, chi2max); // one block per vertex... 
if (doSplitting_) - alpaka::exec(queue, splitterFitterWorkDiv, SplitVerticesKernel{}, soa, ws_d.view(), maxChi2ForSplit); - alpaka::exec(queue, finderSorterWorkDiv{}, soa, ws_d.view()); + alpaka::exec(queue, splitterFitterWorkDiv, SplitVerticesKernel{}, data, trkdata, ws, maxChi2ForSplit); + alpaka::exec(queue, finderSorterWorkDiv{}, data, ws); #endif } else { // five kernels if (useDensity_) { alpaka::exec( - queue, finderSorterWorkDiv, ClusterTracksByDensityKernel{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + queue, finderSorterWorkDiv, ClusterTracksByDensityKernel{}, data, trkdata, ws, minT, eps, errmax, chi2max); } else if (useDBSCAN_) { alpaka::exec( - queue, finderSorterWorkDiv, ClusterTracksDBSCAN{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + queue, finderSorterWorkDiv, ClusterTracksDBSCAN{}, data, trkdata, ws, minT, eps, errmax, chi2max); } else if (useIterative_) { alpaka::exec( - queue, finderSorterWorkDiv, ClusterTracksIterative{}, soa, ws_d.view(), minT, eps, errmax, chi2max); + queue, finderSorterWorkDiv, ClusterTracksIterative{}, data, trkdata, ws, minT, eps, errmax, chi2max); } - alpaka::exec(queue, finderSorterWorkDiv, FitVerticesKernel{}, soa, ws_d.view(), maxChi2ForFirstFit); + alpaka::exec(queue, finderSorterWorkDiv, FitVerticesKernel{}, data, trkdata, ws, maxChi2ForFirstFit); // one block per vertex... 
if (doSplitting_) { - alpaka::exec(queue, splitterFitterWorkDiv, SplitVerticesKernel{}, soa, ws_d.view(), maxChi2ForSplit); + alpaka::exec(queue, splitterFitterWorkDiv, SplitVerticesKernel{}, data, trkdata, ws, maxChi2ForSplit); - alpaka::exec(queue, finderSorterWorkDiv, FitVerticesKernel{}, soa, ws_d.view(), maxChi2ForFinalFit); + alpaka::exec(queue, finderSorterWorkDiv, FitVerticesKernel{}, data, trkdata, ws, maxChi2ForFinalFit); } - alpaka::exec(queue, finderSorterWorkDiv, SortByPt2Kernel{}, soa, ws_d.view()); + alpaka::exec(queue, finderSorterWorkDiv, SortByPt2Kernel{}, data, trkdata, ws); } return vertices; diff --git a/RecoVertex/PixelVertexFinding/plugins/alpaka/vertexFinder.h b/RecoVertex/PixelVertexFinding/plugins/alpaka/vertexFinder.h index f01ec7e445c8a..82745b1de3a30 100644 --- a/RecoVertex/PixelVertexFinding/plugins/alpaka/vertexFinder.h +++ b/RecoVertex/PixelVertexFinding/plugins/alpaka/vertexFinder.h @@ -19,14 +19,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { using namespace cms::alpakatools; using VtxSoAView = ::reco::ZVertexSoAView; + using TrkSoAView = ::reco::ZVertexTracksSoAView; using WsSoAView = ::vertexFinder::PixelVertexWorkSpaceSoAView; class Init { public: - template >> - ALPAKA_FN_ACC void operator()(const TAcc &acc, VtxSoAView pdata, WsSoAView pws) const { - pdata.nvFinal() = 0; // initialization - ::vertexFinder::init(pws); + ALPAKA_FN_ACC void operator()(Acc1D const &acc, VtxSoAView data, WsSoAView ws) const { + data.nvFinal() = 0; // initialization + ::vertexFinder::init(ws); } }; @@ -57,7 +57,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE::vertexFinder { ~Producer() = default; - ZVertexSoACollection makeAsync(Queue &queue, const TkSoAConstView &tracks_view, float ptMin, float ptMax) const; + ZVertexSoACollection makeAsync( + Queue &queue, TkSoAConstView const &tracks_view, int maxVertices, float ptMin, float ptMax) const; private: const bool oneKernel_; // run everything (cluster,fit,split,sort) in one kernel. 
Uses only density clusterizer diff --git a/RecoVertex/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc b/RecoVertex/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc index 6e5c50f0dfb36..4e2e500745bff 100644 --- a/RecoVertex/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc +++ b/RecoVertex/PixelVertexFinding/test/alpaka/VertexFinder_t.dev.cc @@ -84,24 +84,24 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { #ifdef ONE_KERNEL class VertexFinderOneKernel { public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, vertexFinder::VtxSoAView pdata, + vertexFinder::TrkSoAView ptrkdata, vertexFinder::WsSoAView pws, int minT, // min number of neighbours to be "seed" float eps, // max absolute distance to cluster float errmax, // max error to be "seed" float chi2max // max normalized distance to cluster, ) const { - vertexFinder::clusterTracksByDensity(acc, pdata, pws, minT, eps, errmax, chi2max); + vertexFinder::clusterTracksByDensity(acc, pdata, ptrkdata, pws, minT, eps, errmax, chi2max); alpaka::syncBlockThreads(acc); - vertexFinder::fitVertices(acc, pdata, pws, 50.); + vertexFinder::fitVertices(acc, pdata, ptrkdata, pws, 50.); alpaka::syncBlockThreads(acc); - vertexFinder::splitVertices(acc, pdata, pws, 9.f); + vertexFinder::splitVertices(acc, pdata, ptrkdata, pws, 9.f); alpaka::syncBlockThreads(acc); - vertexFinder::fitVertices(acc, pdata, pws, 5000.); + vertexFinder::fitVertices(acc, pdata, ptrkdata, pws, 5000.); alpaka::syncBlockThreads(acc); - vertexFinder::sortByPt2(acc, pdata, pws); + vertexFinder::sortByPt2(acc, pdata, ptrkdata, pws); alpaka::syncBlockThreads(acc); } }; @@ -109,8 +109,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { class Kernel_print { public: - template - ALPAKA_FN_ACC void operator()(const TAcc& acc, + ALPAKA_FN_ACC void operator()(Acc1D const& acc, vertexFinder::VtxSoAView pdata, vertexFinder::WsSoAView pws) const { printf("nt,nv %d %d,%d\n", pws.ntrks(), pdata.nvFinal(), 
pws.nvIntermediate()); @@ -118,10 +117,14 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { }; void runKernels(Queue& queue) { - vertexFinder::PixelVertexWorkSpaceSoADevice ws_d(zVertex::MAXTRACKS, queue); - vertexFinder::PixelVertexWorkSpaceSoAHost ws_h(zVertex::MAXTRACKS, queue); - ZVertexHost vertices_h(queue); - ZVertexSoACollection vertices_d(queue); + // Run 3 values, used for testing + constexpr uint32_t maxTracks = 32 * 1024; + constexpr uint32_t maxVertices = 1024; + + vertexFinder::PixelVertexWorkSpaceSoADevice ws_d(maxTracks, queue); + vertexFinder::PixelVertexWorkSpaceSoAHost ws_h(maxTracks, queue); + ZVertexHost vertices_h({{maxVertices, maxTracks}}, queue); + ZVertexSoACollection vertices_d({{maxVertices, maxTracks}}, queue); float eps = 0.1f; std::array par{{eps, 0.01f, 9.0f}}; @@ -157,14 +160,23 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { workDivClusterizer, VertexFinderOneKernel{}, vertices_d.view(), + vertices_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); #else - alpaka::exec( - queue, workDivClusterizer, CLUSTERIZE{}, vertices_d.view(), ws_d.view(), kk, par[0], par[1], par[2]); + alpaka::exec(queue, + workDivClusterizer, + CLUSTERIZE{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view(), + kk, + par[0], + par[1], + par[2]); #endif alpaka::wait(queue); alpaka::exec(queue, workDiv1D, Kernel_print{}, vertices_d.view(), ws_d.view()); @@ -172,8 +184,13 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto workDivFitter = make_workdiv(1, 1024 - 256); - alpaka::exec( - queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 50.f); + alpaka::exec(queue, + workDivFitter, + vertexFinder::FitVerticesKernel{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view(), + 50.f); alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer()); alpaka::wait(queue); @@ -184,8 +201,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } for (auto j = 0U; j < vertices_h.view().nvFinal(); ++j) - if (vertices_h.view().ndof()[j] > 0) - 
vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); + if (vertices_h.view().ndof()[j] > 0) + vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); { auto mx = std::minmax_element(vertices_h.view().chi2(), vertices_h.view().chi2() + vertices_h.view().nvFinal()); @@ -193,14 +210,19 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { << *mx.second << std::endl; } - alpaka::exec( - queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 50.f); + alpaka::exec(queue, + workDivFitter, + vertexFinder::FitVerticesKernel{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view(), + 50.f); alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer()); alpaka::wait(queue); for (auto j = 0U; j < vertices_h.view().nvFinal(); ++j) - if (vertices_h.view().ndof()[j] > 0) - vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); + if (vertices_h.view().ndof()[j] > 0) + vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); { auto mx = std::minmax_element(vertices_h.view().chi2(), vertices_h.view().chi2() + vertices_h.view().nvFinal()); @@ -211,17 +233,32 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { auto workDivSplitter = make_workdiv(1024, 64); // one vertex per block!!! 
- alpaka::exec( - queue, workDivSplitter, vertexFinder::SplitVerticesKernel{}, vertices_d.view(), ws_d.view(), 9.f); + alpaka::exec(queue, + workDivSplitter, + vertexFinder::SplitVerticesKernel{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view(), + 9.f); alpaka::memcpy(queue, ws_h.buffer(), ws_d.buffer()); alpaka::wait(queue); std::cout << "after split " << ws_h.view().nvIntermediate() << std::endl; - alpaka::exec( - queue, workDivFitter, vertexFinder::FitVerticesKernel{}, vertices_d.view(), ws_d.view(), 5000.f); + alpaka::exec(queue, + workDivFitter, + vertexFinder::FitVerticesKernel{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view(), + 5000.f); auto workDivSorter = make_workdiv(1, 256); - alpaka::exec(queue, workDivSorter, vertexFinder::SortByPt2Kernel{}, vertices_d.view(), ws_d.view()); + alpaka::exec(queue, + workDivSorter, + vertexFinder::SortByPt2Kernel{}, + vertices_d.view(), + vertices_d.view(), + ws_d.view()); alpaka::memcpy(queue, vertices_h.buffer(), vertices_d.buffer()); alpaka::wait(queue); @@ -231,8 +268,8 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { } for (auto j = 0U; j < vertices_h.view().nvFinal(); ++j) - if (vertices_h.view().ndof()[j] > 0) - vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); + if (vertices_h.view().ndof()[j] > 0) + vertices_h.view().chi2()[j] /= float(vertices_h.view().ndof()[j]); { auto mx = std::minmax_element(vertices_h.view().chi2(), vertices_h.view().chi2() + vertices_h.view().nvFinal()); @@ -279,7 +316,7 @@ namespace ALPAKA_ACCELERATOR_NAMESPACE { std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl; } // loop on events - } // lopp on ave vert + } // loop on ave vert } } // namespace vertexfinder_t } // namespace ALPAKA_ACCELERATOR_NAMESPACE