Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[12_3_X] Backport of 37617 + 37798 #37933

Merged
merged 2 commits into from
May 17, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
142 changes: 119 additions & 23 deletions CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,27 @@
template <typename Traits>
class TrackingRecHit2DHeterogeneous {
public:
// Index of each 32-bit array packed into m_store32.
// The array for enumerator E starts at m_store32 + E * nHits.
enum class Storage32 : int {
  kXLocal = 0,      // view->m_xl
  kYLocal = 1,      // view->m_yl
  kXerror = 2,      // view->m_xerr
  kYerror = 3,      // view->m_yerr
  kCharge = 4,      // view->m_chargeAndStatus (read back as uint32_t)
  kXGlobal = 5,     // view->m_xg
  kYGlobal = 6,     // view->m_yg
  kZGlobal = 7,     // view->m_zg
  kRGlobal = 8,     // view->m_rg
  kPhiStorage = 9,  // view->m_phiBinnerStorage
  kLayers = 10      // offset used for m_hitsLayerStart; static_assert'ed equal to n32
};

// Index of each 16-bit array packed into m_store16.
// The array for enumerator E starts at m_store16 + E * nHits.
enum class Storage16 : int {
  kDetId = 0,  // view->m_detInd
  kPhi = 1,    // view->m_iphi (read back as int16_t)
  kXSize = 2,  // view->m_xsize (read back as int16_t)
  kYSize = 3   // view->m_ysize (read back as int16_t)
};

template <typename T>
using unique_ptr = typename Traits::template unique_ptr<T>;

Expand All @@ -24,6 +45,8 @@ class TrackingRecHit2DHeterogeneous {
cudaStream_t stream,
TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);

explicit TrackingRecHit2DHeterogeneous(
float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream = nullptr);
~TrackingRecHit2DHeterogeneous() = default;

TrackingRecHit2DHeterogeneous(const TrackingRecHit2DHeterogeneous&) = delete;
Expand All @@ -44,18 +67,21 @@ class TrackingRecHit2DHeterogeneous {
auto phiBinnerStorage() { return m_phiBinnerStorage; }
auto iphi() { return m_iphi; }

// only the local coord and detector index
cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;

cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;

cms::cuda::host::unique_ptr<uint16_t[]> store16ToHostAsync(cudaStream_t stream) const;
cms::cuda::host::unique_ptr<float[]> store32ToHostAsync(cudaStream_t stream) const;

// needs specialization for Host
void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);

private:
static constexpr uint32_t n16 = 4; // number of per-hit 16-bit arrays in m_store16 (the raw-store constructor allocates nHits * n16 entries)
static constexpr uint32_t n32 = 10; // number of per-hit 32-bit arrays in m_store32 (the raw-store constructor allocates nHits * n32 entries)
static_assert(sizeof(uint32_t) == sizeof(float)); // parts of the float store are reinterpreted as uint32_t, so the sizes must match

// Storage32::kLayers is the slot right after the n32 per-hit arrays; keep the enum and the count in sync
static_assert(n32 == static_cast<uint32_t>(Storage32::kLayers));
unique_ptr<uint16_t[]> m_store16; //!
unique_ptr<float[]> m_store32; //!

Expand Down Expand Up @@ -108,7 +134,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(

// if empty do not bother
if (0 == nHits) {
if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
} else {
m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version
Expand All @@ -123,7 +149,7 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
// so unless proven VERY inefficient we keep it ordered as generated

// host copy is "reduced" (to be reviewed at some point)
if constexpr (std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
if constexpr (std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
// it has to compile for ALL cases
copyFromGPU(input, stream);
} else {
Expand All @@ -139,43 +165,113 @@ TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));

auto get32 = [&](int i) { return m_store32.get() + i * nHits; };
auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };

// copy all the pointers
m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_phiBinnerStorage = view->m_phiBinnerStorage =
reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(9));
reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));

view->m_xl = get32(0);
view->m_yl = get32(1);
view->m_xerr = get32(2);
view->m_yerr = get32(3);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(4));
view->m_xl = get32(Storage32::kXLocal);
view->m_yl = get32(Storage32::kYLocal);
view->m_xerr = get32(Storage32::kXerror);
view->m_yerr = get32(Storage32::kYerror);
view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(get32(Storage32::kCharge));

if constexpr (!std::is_same<Traits, cms::cudacompat::HostTraits>::value) {
if constexpr (!std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
assert(input == nullptr);
view->m_xg = get32(5);
view->m_yg = get32(6);
view->m_zg = get32(7);
view->m_rg = get32(8);
view->m_xg = get32(Storage32::kXGlobal);
view->m_yg = get32(Storage32::kYGlobal);
view->m_zg = get32(Storage32::kZGlobal);
view->m_rg = get32(Storage32::kRGlobal);

auto get16 = [&](int i) { return m_store16.get() + i * nHits; };
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(1));
auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));

view->m_xsize = reinterpret_cast<int16_t*>(get16(2));
view->m_ysize = reinterpret_cast<int16_t*>(get16(3));
view->m_detInd = get16(0);
view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
view->m_detInd = get16(Storage16::kDetId);

m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(n32));
m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
}

// transfer view
if constexpr (std::is_same<Traits, cms::cudacompat::GPUTraits>::value) {
if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
cms::cuda::copyAsync(m_view, view, stream);
} else {
m_view.reset(view.release()); // NOLINT: std::move() breaks CUDA version
}
}

// Constructs the SoA from already-filled flat buffers:
//   store32 - n32 consecutive float arrays of nHits entries each
//   store16 - n16 consecutive uint16_t arrays of nHits entries each
//   modules - module-start table; only the pointer is kept, the table is not copied
// Intended for the CPU SoA, but it is harmless to have it compile for every Traits.
// NOTE(review): m_hitsLayerStart is not set by this constructor - confirm callers do not need it.
template <typename Traits>
TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
    float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream)
    : m_nHits(nHits), m_hitsModuleStart(modules) {
  // selects the cms::cuda::copyAsync code paths below instead of plain host copies/moves
  constexpr bool isGPUTraits = std::is_same_v<Traits, cms::cudacompat::GPUTraits>;

  // staging view filled here, then handed over to m_view at the end
  auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
  m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream);
  view->m_nHits = nHits;

  // nothing to store for an empty collection: publish the (almost empty) view and stop
  if (nHits == 0) {
    if constexpr (isGPUTraits) {
      cms::cuda::copyAsync(m_view, view, stream);
    } else {
      m_view = std::move(view);
    }
    return;
  }

  // backing storage owned by this object
  m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
  m_store32 = Traits::template make_unique<float[]>(nHits * n32, stream);
  m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
  m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);

  view->m_hitsModuleStart = m_hitsModuleStart;
  view->m_averageGeometry = m_AverageGeometryStore.get();

  // bring the caller's buffers into the owned storage
  if constexpr (isGPUTraits) {
    cms::cuda::copyAsync(m_store16, store16, stream);
    cms::cuda::copyAsync(m_store32, store32, stream);
  } else {
    std::copy(store32, store32 + nHits * n32, m_store32.get());  // want to copy it
    std::copy(store16, store16 + nHits * n16, m_store16.get());
  }

  // start of the array selected by `col` inside the respective store
  auto column32 = [&](Storage32 col) { return m_store32.get() + static_cast<int>(col) * nHits; };
  auto column16 = [&](Storage16 col) { return m_store16.get() + static_cast<int>(col) * nHits; };

  // 16-bit columns
  view->m_detInd = column16(Storage16::kDetId);
  view->m_xsize = reinterpret_cast<int16_t*>(column16(Storage16::kXSize));
  view->m_ysize = reinterpret_cast<int16_t*>(column16(Storage16::kYSize));
  m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(column16(Storage16::kPhi));

  // 32-bit columns: local position and errors, charge/status, global position
  view->m_xl = column32(Storage32::kXLocal);
  view->m_yl = column32(Storage32::kYLocal);
  view->m_xerr = column32(Storage32::kXerror);
  view->m_yerr = column32(Storage32::kYerror);
  view->m_chargeAndStatus = reinterpret_cast<uint32_t*>(column32(Storage32::kCharge));
  view->m_xg = column32(Storage32::kXGlobal);
  view->m_yg = column32(Storage32::kYGlobal);
  view->m_zg = column32(Storage32::kZGlobal);
  view->m_rg = column32(Storage32::kRGlobal);

  // phi binner object and the 32-bit column it uses as index storage
  m_phiBinnerStorage = view->m_phiBinnerStorage =
      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(column32(Storage32::kPhiStorage));
  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();

  // hand the fully populated view over to m_view
  if constexpr (isGPUTraits) {
    cms::cuda::copyAsync(m_view, view, stream);
  } else {
    m_view = std::move(view);
  }
}

#endif // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,20 @@ cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::localCoordToHostAsync(
return ret;
}

// Returns a newly allocated host buffer receiving the complete 32-bit store
// (n32 arrays of nHits() entries). The copy is issued through cms::cuda::copyAsync on
// `stream`; presumably the caller must wait for the stream before reading - confirm.
template <>
cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
  auto const nElements = static_cast<int>(n32) * nHits();
  auto hostStore = cms::cuda::make_host_unique<float[]>(nElements, stream);
  cms::cuda::copyAsync(hostStore, m_store32, nElements, stream);
  return hostStore;
}

// Returns a newly allocated host buffer receiving the complete 16-bit store
// (n16 arrays of nHits() entries). The copy is issued through cms::cuda::copyAsync on
// `stream`; presumably the caller must wait for the stream before reading - confirm.
template <>
cms::cuda::host::unique_ptr<uint16_t[]> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
  auto const nElements = static_cast<int>(n16) * nHits();
  auto hostStore = cms::cuda::make_host_unique<uint16_t[]>(nElements, stream);
  cms::cuda::copyAsync(hostStore, m_store16, nElements, stream);
  return hostStore;
}

template <>
cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
auto ret = cms::cuda::make_host_unique<uint32_t[]>(nMaxModules() + 1, stream);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,13 +3,11 @@
from DQM.SiPixelPhase1Heterogeneous.siPixelPhase1MonitorVertexSoA_cfi import *
from DQM.SiPixelPhase1Heterogeneous.siPixelPhase1MonitorRecHitsSoA_cfi import *

from Configuration.ProcessModifiers.gpu_cff import gpu
gpu.toModify(siPixelPhase1MonitorRecHitsSoA, pixelHitsSrc = "siPixelRecHitsPreSplittingSoA")

from Configuration.ProcessModifiers.pixelNtupletFit_cff import pixelNtupletFit
pixelNtupletFit.toModify(siPixelPhase1MonitorRecHitsSoA, pixelHitsSrc = "siPixelRecHitsPreSplittingSoA")

monitorpixelSoASource = cms.Sequence(siPixelPhase1MonitorRecHitsSoA * siPixelPhase1MonitorTrackSoA * siPixelPhase1MonitorVertexSoA)


#Define the sequence for GPU vs CPU validation
#This should run:- individual monitor for the 2 collections + comparison module
from DQM.SiPixelPhase1Heterogeneous.siPixelPhase1CompareTrackSoA_cfi import *
Expand All @@ -35,8 +33,19 @@
topFolderName = 'SiPixelHeterogeneous/PixelVertexSoAGPU',
)

# Clone of the RecHit SoA monitor reading the "@cpu" variant of siPixelRecHitsPreSplittingSoA.
# TopFolderName presumably selects the DQM folder the histograms are booked in - confirm.
siPixelPhase1MonitorRecHitsSoACPU = siPixelPhase1MonitorRecHitsSoA.clone(
pixelHitsSrc = "siPixelRecHitsPreSplittingSoA@cpu",
TopFolderName = "SiPixelHeterogeneous/PixelRecHitsSoACPU"
)

# Clone of the RecHit SoA monitor reading the "@cuda" variant of siPixelRecHitsPreSplittingSoA.
# TopFolderName presumably selects the DQM folder the histograms are booked in - confirm.
siPixelPhase1MonitorRecHitsSoAGPU = siPixelPhase1MonitorRecHitsSoA.clone(
pixelHitsSrc = "siPixelRecHitsPreSplittingSoA@cuda",
TopFolderName = "SiPixelHeterogeneous/PixelRecHitsSoAGPU"
)

monitorpixelSoACompareSource = cms.Sequence(siPixelPhase1MonitorTrackSoAGPU *
monitorpixelSoACompareSource = cms.Sequence(siPixelPhase1MonitorRecHitsSoACPU *
siPixelPhase1MonitorRecHitsSoAGPU *
siPixelPhase1MonitorTrackSoAGPU *
siPixelPhase1MonitorTrackSoACPU *
siPixelPhase1CompareTrackSoA *
siPixelPhase1MonitorVertexSoACPU *
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#include <cuda_runtime.h>

#include <fmt/printf.h>

#include "CUDADataFormats/Common/interface/HostProduct.h"
#include "CUDADataFormats/Common/interface/Product.h"
#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
#include "DataFormats/Common/interface/DetSetVectorNew.h"
#include "DataFormats/Common/interface/Handle.h"
#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
#include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h"
#include "FWCore/Framework/interface/Event.h"
#include "FWCore/Framework/interface/EventSetup.h"
#include "FWCore/Framework/interface/MakerMacros.h"
#include "FWCore/Framework/interface/stream/EDProducer.h"
#include "FWCore/MessageLogger/interface/MessageLogger.h"
#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
#include "FWCore/ParameterSet/interface/ParameterSet.h"
#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
#include "FWCore/Utilities/interface/InputTag.h"
#include "Geometry/CommonDetUnit/interface/PixelGeomDetUnit.h"
#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h"
#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h"
#include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"

// Stream producer (ExternalWork) that turns the CUDA pixel RecHit product into host products:
//  - acquire() reads the TrackingRecHit2DGPU and requests asynchronous copies of its
//    32-bit store, 16-bit store and module-start table into host buffers;
//  - produce() puts a TrackingRecHit2DCPU built from those buffers and a HostProduct
//    holding a copy of the module-start table into the event.
class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
public:
  explicit SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig);
  ~SiPixelRecHitSoAFromCUDA() override = default;

  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
  using HMSstorage = HostProduct<uint32_t[]>;

private:
  void acquire(edm::Event const& iEvent,
               edm::EventSetup const& iSetup,
               edm::WaitingTaskWithArenaHolder waitingTaskHolder) override;
  void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override;

  const edm::EDGetTokenT<cms::cuda::Product<TrackingRecHit2DGPU>> hitsTokenGPU_;  // CUDA hits
  const edm::EDPutTokenT<TrackingRecHit2DCPU> hitsPutTokenCPU_;
  const edm::EDPutTokenT<HMSstorage> hostPutToken_;

  // Per-event state handed from acquire() to produce().
  // Zero-initialised so produce() never reads an indeterminate value, even if
  // acquire() returned early on the very first event.
  uint32_t nHits_ = 0;
  uint32_t nMaxModules_ = 0;

  // Host buffers filled by the asynchronous copies requested in acquire().
  cms::cuda::host::unique_ptr<float[]> store32_;
  cms::cuda::host::unique_ptr<uint16_t[]> store16_;
  cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStart_;
};

// Registers the consumed CUDA RecHit product (input tag taken from the "pixelRecHitSrc"
// parameter) and the two produced host products: the TrackingRecHit2DCPU and the
// HMSstorage (HostProduct<uint32_t[]>).
SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig)
: hitsTokenGPU_(
consumes<cms::cuda::Product<TrackingRecHit2DGPU>>(iConfig.getParameter<edm::InputTag>("pixelRecHitSrc"))),
hitsPutTokenCPU_(produces<TrackingRecHit2DCPU>()),
hostPutToken_(produces<HMSstorage>()) {}

// Declares the module's single parameter, "pixelRecHitSrc", with its default input tag,
// and registers the description under the default label.
void SiPixelRecHitSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
  const edm::InputTag defaultHitSource("siPixelRecHitsPreSplittingCUDA");

  edm::ParameterSetDescription psd;
  psd.add<edm::InputTag>("pixelRecHitSrc", defaultHitSource);

  descriptions.addWithDefaultLabel(psd);
}

// Reads the CUDA RecHit product and requests the asynchronous device-to-host copies of
// its 32-bit store, 16-bit store and module-start table on the context's stream.
// For an empty collection no copy is requested; only nHits_ and nMaxModules_ are updated.
void SiPixelRecHitSoAFromCUDA::acquire(edm::Event const& iEvent,
                                       edm::EventSetup const& iSetup,
                                       edm::WaitingTaskWithArenaHolder waitingTaskHolder) {
  cms::cuda::Product<TrackingRecHit2DGPU> const& inputDataWrapped = iEvent.get(hitsTokenGPU_);
  cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)};
  auto const& inputData = ctx.get(inputDataWrapped);

  nHits_ = inputData.nHits();
  // Recorded before the empty-event early return: produce() sizes its module-start
  // output from nMaxModules_ on every event, including events without hits.
  nMaxModules_ = inputData.nMaxModules();
  LogDebug("SiPixelRecHitSoAFromCUDA") << "copying to cpu SoA" << inputData.nHits() << " Hits";

  if (0 == nHits_)
    return;

  store32_ = inputData.store32ToHostAsync(ctx.stream());
  store16_ = inputData.store16ToHostAsync(ctx.stream());
  hitsModuleStart_ = inputData.hitsModuleStartToHostAsync(ctx.stream());
}

// Publishes the host products: a HostProduct holding a private copy of the module-start
// table, and a TrackingRecHit2DCPU built from the host buffers filled in acquire().
void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) {
  // make_unique<T[]> value-initialises, so the table is all zeros unless overwritten below
  auto hmsp = std::make_unique<uint32_t[]>(nMaxModules_ + 1);

  // acquire() requests no device-to-host copy for an empty collection, in which case
  // hitsModuleStart_ is either null (first event) or left over from an earlier event:
  // only copy from it when this event actually transferred data.
  if (nHits_ > 0)
    std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get());

  iEvent.emplace(hostPutToken_, std::move(hmsp));
  // NOTE(review): the TrackingRecHit2DCPU constructor keeps the raw `modules` pointer,
  // while the buffer stays owned by hitsModuleStart_ in this module - confirm the
  // product cannot outlive or observe a later reassignment of that buffer.
  iEvent.emplace(hitsPutTokenCPU_, store32_.get(), store16_.get(), hitsModuleStart_.get(), nHits_);
}

DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA);
Loading