diff --git a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
index 7b04fac67b9f1..7faa6224959e7 100644
--- a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
+++ b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
@@ -4,15 +4,19 @@
 #include <cuda_runtime.h>
 
 #include "DataFormats/BeamSpot/interface/BeamSpotPOD.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
 
 class BeamSpotCUDA {
 public:
+  using Buffer = memoryPool::Buffer<BeamSpotPOD>;
+
   // default constructor, required by cms::cuda::Product<BeamSpotCUDA>
   BeamSpotCUDA() = default;
 
   // constructor that allocates cached device memory on the given CUDA stream
-  BeamSpotCUDA(cudaStream_t stream) { data_d_ = cms::cuda::make_device_unique<BeamSpotPOD>(stream); }
+  BeamSpotCUDA(cudaStream_t stream) {
+    data_d_ = memoryPool::cuda::makeBuffer<BeamSpotPOD>(1, stream, memoryPool::onDevice);
+  }
 
   // movable, non-copiable
   BeamSpotCUDA(BeamSpotCUDA const&) = delete;
@@ -23,11 +27,11 @@ class BeamSpotCUDA {
   BeamSpotPOD* data() { return data_d_.get(); }
   BeamSpotPOD const* data() const { return data_d_.get(); }
 
-  cms::cuda::device::unique_ptr<BeamSpotPOD>& ptr() { return data_d_; }
-  cms::cuda::device::unique_ptr<BeamSpotPOD> const& ptr() const { return data_d_; }
+  Buffer& ptr() { return data_d_; }
+  Buffer const& ptr() const { return data_d_; }
 
 private:
-  cms::cuda::device::unique_ptr<BeamSpotPOD> data_d_;
+  Buffer data_d_;
 };
 
 #endif  // CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h
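For orientation, this is the pattern the whole patch converges on: a memoryPool::Buffer<T> replaces the cached device/host unique_ptr, and the allocation names the element count, the stream and the destination explicitly. A minimal sketch, using only the API introduced later in this patch:

#include <cuda_runtime.h>
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

void sketch(cudaStream_t stream) {
  // 256 floats from the device pool, tied to `stream`
  auto buf = memoryPool::cuda::makeBuffer<float>(256, stream, memoryPool::onDevice);
  cudaMemsetAsync(buf.get(), 0, 256 * sizeof(float), stream);
}  // ~Buffer: the slot is queued for reuse once the work on `stream` completes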
diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
deleted file mode 100644
index 3f2a551bc320f..0000000000000
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ /dev/null
@@ -1,189 +0,0 @@
-#ifndef CUDADataFormatsCommonHeterogeneousSoA_H
-#define CUDADataFormatsCommonHeterogeneousSoA_H
-
-#include <cassert>
-
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-// a heterogeneous unique pointer...
-template <typename T>
-class HeterogeneousSoA {
-public:
-  using Product = T;
-
-  HeterogeneousSoA() = default;  // make root happy
-  ~HeterogeneousSoA() = default;
-  HeterogeneousSoA(HeterogeneousSoA &&) = default;
-  HeterogeneousSoA &operator=(HeterogeneousSoA &&) = default;
-
-  explicit HeterogeneousSoA(cms::cuda::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
-  explicit HeterogeneousSoA(cms::cuda::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
-  explicit HeterogeneousSoA(std::unique_ptr<T> &&p) : std_ptr(std::move(p)) {}
-
-  auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
-
-  auto const &operator*() const { return *get(); }
-
-  auto const *operator->() const { return get(); }
-
-  auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
-
-  auto &operator*() { return *get(); }
-
-  auto *operator->() { return get(); }
-
-  // in reality valid only for GPU version...
-  cms::cuda::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const {
-    assert(dm_ptr);
-    auto ret = cms::cuda::make_host_unique<T>(stream);
-    cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream));
-    return ret;
-  }
-
-private:
-  // a union won't do it, a variant will not be more efficient
-  cms::cuda::device::unique_ptr<T> dm_ptr;  //!
-  cms::cuda::host::unique_ptr<T> hm_ptr;    //!
-  std::unique_ptr<T> std_ptr;               //!
-};
-
-namespace cms {
-  namespace cudacompat {
-
-    struct GPUTraits {
-      template <typename T>
-      using unique_ptr = cms::cuda::device::unique_ptr<T>;
-
-      template <typename T>
-      static auto make_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
-      }
-
-      template <typename T>
-      static auto make_host_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
-      }
-    };
-
-    struct HostTraits {
-      template <typename T>
-      using unique_ptr = cms::cuda::host::unique_ptr<T>;
-
-      template <typename T>
-      static auto make_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_host_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
-      }
-    };
-
-    struct CPUTraits {
-      template <typename T>
-      using unique_ptr = std::unique_ptr<T>;
-
-      template <typename T>
-      static auto make_unique(cudaStream_t) {
-        return std::make_unique<T>();
-      }
-
-      template <typename T>
-      static auto make_unique(size_t size, cudaStream_t) {
-        return std::make_unique<T>(size);
-      }
-
-      template <typename T>
-      static auto make_host_unique(cudaStream_t) {
-        return std::make_unique<T>();
-      }
-
-      template <typename T>
-      static auto make_device_unique(cudaStream_t) {
-        return std::make_unique<T>();
-      }
-
-      template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t) {
-        return std::make_unique<T>(size);
-      }
-    };
-
-  }  // namespace cudacompat
-}  // namespace cms
-
-// a heterogeneous unique pointer (of a different sort) ...
-template <typename T, typename Traits>
-class HeterogeneousSoAImpl {
-public:
-  template <typename V>
-  using unique_ptr = typename Traits::template unique_ptr<V>;
-
-  HeterogeneousSoAImpl() = default;  // make root happy
-  ~HeterogeneousSoAImpl() = default;
-  HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default;
-  HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default;
-
-  explicit HeterogeneousSoAImpl(unique_ptr<T> &&p) : m_ptr(std::move(p)) {}
-  explicit HeterogeneousSoAImpl(cudaStream_t stream);
-
-  T const *get() const { return m_ptr.get(); }
-
-  T *get() { return m_ptr.get(); }
-
-  cms::cuda::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const;
-
-private:
-  unique_ptr<T> m_ptr;  //!
-};
-
-template <typename T, typename Traits>
-HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cudaStream_t stream) {
-  m_ptr = Traits::template make_unique<T>(stream);
-}
-
-// in reality valid only for GPU version...
-template <typename T, typename Traits>
-cms::cuda::host::unique_ptr<T> HeterogeneousSoAImpl<T, Traits>::toHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<T>(stream);
-  cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream));
-  return ret;
-}
-
-template <typename T>
-using HeterogeneousSoAGPU = HeterogeneousSoAImpl<T, cms::cudacompat::GPUTraits>;
-template <typename T>
-using HeterogeneousSoACPU = HeterogeneousSoAImpl<T, cms::cudacompat::CPUTraits>;
-template <typename T>
-using HeterogeneousSoAHost = HeterogeneousSoAImpl<T, cms::cudacompat::HostTraits>;
-
-#endif
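The compile-time Traits triplet deleted above is replaced throughout this patch by a runtime destination, the memoryPool::Where enum introduced below (GPUTraits roughly maps to onDevice, HostTraits to onHost, CPUTraits to onCPU). Schematically:

// before (compile-time traits):
//   m_store32 = Traits::template make_unique<float[]>(n, stream);
// after (runtime destination):
//   m_store32 = memoryPool::cuda::makeBuffer<float>(n, stream, where);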
diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h
index 63a152298e42b..5622f3e038b5c 100644
--- a/CUDADataFormats/Common/interface/HostProduct.h
+++ b/CUDADataFormats/Common/interface/HostProduct.h
@@ -1,7 +1,7 @@
 #ifndef CUDADataFormatsCommonHostProduct_H
 #define CUDADataFormatsCommonHostProduct_H
 
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include <memory>
 
 // a heterogeneous unique pointer...
 template <typename T>
@@ -12,18 +12,16 @@ class HostProduct {
   HostProduct(HostProduct&&) = default;
   HostProduct& operator=(HostProduct&&) = default;
 
-  explicit HostProduct(cms::cuda::host::unique_ptr<T>&& p) : hm_ptr(std::move(p)) {}
   explicit HostProduct(std::unique_ptr<T>&& p) : std_ptr(std::move(p)) {}
 
-  auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); }
+  auto const* get() const { return std_ptr.get(); }
 
   auto const& operator*() const { return *get(); }
 
   auto const* operator->() const { return get(); }
 
 private:
-  cms::cuda::host::unique_ptr<T> hm_ptr;  //!
-  std::unique_ptr<T> std_ptr;             //!
+  std::unique_ptr<T> std_ptr;  //!
 };
 
 #endif
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h
index eff550feeb22e..a486a97668163 100644
--- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h
+++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h
@@ -5,16 +5,15 @@
 
 #include "DataFormats/SiPixelRawData/interface/SiPixelErrorCompact.h"
 #include "DataFormats/SiPixelRawData/interface/SiPixelFormatterErrors.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 class SiPixelDigiErrorsCUDA {
 public:
   using SiPixelErrorCompactVector = cms::cuda::SimpleVector<SiPixelErrorCompact>;
 
   SiPixelDigiErrorsCUDA() = default;
-  explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream);
+  SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream);
   ~SiPixelDigiErrorsCUDA() = default;
 
   SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete;
@@ -27,16 +26,16 @@ class SiPixelDigiErrorsCUDA {
   SiPixelErrorCompactVector* error() { return error_d.get(); }
   SiPixelErrorCompactVector const* error() const { return error_d.get(); }
 
-  using HostDataError = std::pair<SiPixelErrorCompactVector, cms::cuda::host::unique_ptr<SiPixelErrorCompact[]>>;
+  using HostDataError = std::pair<SiPixelErrorCompactVector, memoryPool::Buffer<SiPixelErrorCompact>>;
   HostDataError dataErrorToHostAsync(cudaStream_t stream) const;
 
   void copyErrorToHostAsync(cudaStream_t stream);
 
   int nErrorWords() const { return nErrorWords_; }
 
 private:
-  cms::cuda::device::unique_ptr<SiPixelErrorCompact[]> data_d;
-  cms::cuda::device::unique_ptr<SiPixelErrorCompactVector> error_d;
-  cms::cuda::host::unique_ptr<SiPixelErrorCompactVector> error_h;
+  memoryPool::Buffer<SiPixelErrorCompact> data_d;
+  memoryPool::Buffer<SiPixelErrorCompactVector> error_d;
+  memoryPool::Buffer<SiPixelErrorCompactVector> error_h;
   SiPixelFormatterErrors formatterErrors_h;
   int nErrorWords_ = 0;
 };
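The HostDataError pair couples the SimpleVector (which describes the data) with the host Buffer that owns its storage. A sketch of the intended consumer, assuming the interfaces as reconstructed above; the stream must be synchronized before the host copy is read:

void readErrors(SiPixelDigiErrorsCUDA const& errors, cudaStream_t stream) {
  auto [vec, data] = errors.dataErrorToHostAsync(stream);  // vec points into `data`
  cudaCheck(cudaStreamSynchronize(stream));
  for (int i = 0; i < vec.size(); ++i) {
    SiPixelErrorCompact const& e = vec[i];
    // ... convert or stash e; `data` keeps the storage alive ...
  }
}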
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
index cf6b51687982f..f4bdc09ee4e8c 100644
--- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
+++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
@@ -1,18 +1,16 @@
 #ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
 #define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
 
-#include <cuda_runtime.h>
+#include <cstdint>
 
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h"
 
 class SiPixelDigisCUDA {
 public:
   using StoreType = uint16_t;
 
   SiPixelDigisCUDA() = default;
-  explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream);
+  SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream);
   ~SiPixelDigisCUDA() = default;
 
   SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete;
@@ -28,14 +26,14 @@ class SiPixelDigisCUDA {
   uint32_t nModules() const { return nModules_h; }
   uint32_t nDigis() const { return nDigis_h; }
 
-  cms::cuda::host::unique_ptr<StoreType[]> copyAllToHostAsync(cudaStream_t stream) const;
+  memoryPool::Buffer<StoreType> copyAllToHostAsync(cudaStream_t stream) const;
 
   SiPixelDigisCUDASOAView view() { return m_view; }
   SiPixelDigisCUDASOAView const view() const { return m_view; }
 
 private:
   // These are consumed by downstream device code
-  cms::cuda::device::unique_ptr<StoreType[]> m_store;
+  memoryPool::Buffer<StoreType> m_store;
 
   SiPixelDigisCUDASOAView m_view;
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
index 70d00ae584279..f1efeaad2e2f3 100644
--- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
+++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
@@ -3,8 +3,6 @@
 
 #include <cuda_runtime.h>
 
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
 
 #include <cstdint>
 
@@ -96,11 +94,11 @@ class SiPixelDigisCUDASOAView {
   uint32_t* rawIdArr_;
 
   template <typename ReturnType, typename LocationType, typename StoreType>
-  ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) {
+  static ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) {
     return reinterpret_cast<ReturnType*>(store.get() + static_cast<int>(column) * roundFor128ByteAlignment(size));
   }
 
-  static int roundFor128ByteAlignment(int size) {
+  static constexpr int roundFor128ByteAlignment(int size) {
     constexpr int mul = 128 / sizeof(uint16_t);
     return ((size + mul - 1) / mul) * mul;
   };
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
index e81b1b2b592af..f2459801dba55 100644
--- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
+++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
@@ -1,40 +1,42 @@
-#include <cassert>
-
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream)
-    : data_d(cms::cuda::make_device_unique<SiPixelErrorCompact[]>(maxFedWords, stream)),
-      error_d(cms::cuda::make_device_unique<SiPixelErrorCompactVector>(stream)),
-      error_h(cms::cuda::make_host_unique<SiPixelErrorCompactVector>(stream)),
-      formatterErrors_h(std::move(errors)),
-      nErrorWords_(maxFedWords) {
+    : formatterErrors_h(std::move(errors)), nErrorWords_(maxFedWords) {
   assert(maxFedWords != 0);
+
+  memoryPool::Deleter deleter =
+      memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onDevice));
+  assert(deleter.pool());
+
+  data_d = memoryPool::cuda::makeBuffer<SiPixelErrorCompact>(maxFedWords, deleter);
+  error_d = memoryPool::cuda::makeBuffer<SiPixelErrorCompactVector>(1, deleter);
+  error_h = memoryPool::cuda::makeBuffer<SiPixelErrorCompactVector>(1, stream, memoryPool::onHost);
+
+  cudaMemsetAsync(data_d.get(), 0x00, maxFedWords * sizeof(SiPixelErrorCompact), stream);
 
   cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get());
   assert(error_h->empty());
   assert(error_h->capacity() == static_cast<int>(maxFedWords));
 
-  cms::cuda::copyAsync(error_d, error_h, stream);
+  cudaCheck(memoryPool::cuda::copy(error_d, error_h, 1, stream));
 }
 
 void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) {
-  cms::cuda::copyAsync(error_h, error_d, stream);
+  cudaCheck(memoryPool::cuda::copy(error_h, error_d, 1, stream));
 }
 
 SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const {
   // On one hand size() could be sufficient. On the other hand, if
   // someone copies the SimpleVector<>, (s)he might expect the data
-  // buffer to actually have space for capacity() elements.
-  auto data = cms::cuda::make_host_unique<SiPixelErrorCompact[]>(error_h->capacity(), stream);
+  // Buffer to actually have space for capacity() elements.
+  auto data = memoryPool::cuda::makeBuffer<SiPixelErrorCompact>(error_h->capacity(), stream, memoryPool::onHost);
 
   // but transfer only the required amount
   if (not error_h->empty()) {
-    cms::cuda::copyAsync(data, data_d, error_h->size(), stream);
+    cudaCheck(memoryPool::cuda::copy(data, data_d, error_h->size(), stream));
  }
   auto err = *error_h;
   err.set_data(data.get());
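The constructor above threads one Deleter through several makeBuffer calls. With the BundleDelete reconstructed here (see cudaMemoryPool.h below), the buckets are collected and handed back to the pool together, in stream order, once the last owning Buffer is gone. The same pattern in isolation:

memoryPool::Deleter del(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onDevice));
auto a = memoryPool::cuda::makeBuffer<float>(1024, del);  // same pool, same stream
auto b = memoryPool::cuda::makeBuffer<int>(512, del);
// both buckets are queued for reuse together once a, b and del are all destroyed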
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
index 9a7f8ae8bdad5..db464afb492f4 100644
--- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
+++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
@@ -1,24 +1,24 @@
 #include <cassert>
 
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream)
-    : m_store(cms::cuda::make_device_unique<SiPixelDigisCUDA::StoreType[]>(
+    : m_store(memoryPool::cuda::makeBuffer<SiPixelDigisCUDA::StoreType>(
           SiPixelDigisCUDASOAView::roundFor128ByteAlignment(maxFedWords) *
               static_cast<int>(SiPixelDigisCUDASOAView::StorageLocation::kMAX),
-          stream)),
+          stream,
+          memoryPool::onDevice)),
      m_view(m_store, maxFedWords, SiPixelDigisCUDASOAView::StorageLocation::kMAX) {
   assert(maxFedWords != 0);
 }
 
-cms::cuda::host::unique_ptr<SiPixelDigisCUDA::StoreType[]> SiPixelDigisCUDA::copyAllToHostAsync(
-    cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<SiPixelDigisCUDA::StoreType[]>(
+memoryPool::Buffer<SiPixelDigisCUDA::StoreType> SiPixelDigisCUDA::copyAllToHostAsync(cudaStream_t stream) const {
+  auto ret = memoryPool::cuda::makeBuffer<SiPixelDigisCUDA::StoreType>(
       m_view.roundFor128ByteAlignment(nDigis()) * static_cast<int>(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX),
-      stream);
+      stream,
+      memoryPool::onHost);
   cudaCheck(cudaMemcpyAsync(ret.get(),
                             m_view.clus(),
                             m_view.roundFor128ByteAlignment(nDigis()) * sizeof(SiPixelDigisCUDA::StoreType) *
diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
index 3ee5af80353dd..f791050aa6552 100644
--- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
+++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
@@ -1,9 +1,9 @@
 #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
 #define CUDADataFormats_Track_PixelTrackHeterogeneous_h
 
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
 
-using PixelTrackHeterogeneous = HeterogeneousSoA<pixelTrack::TrackSoA>;
+using PixelTrackHeterogeneous = memoryPool::Buffer<pixelTrack::TrackSoA>;
 
-#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
\ No newline at end of file
+#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
index 356ea3eddeb7f..be73e2650b417 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
@@ -8,8 +8,6 @@
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
-
 namespace pixelTrack {
 
   enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality };
   constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)};
diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h
index 97c116f6c88d3..a2b57f2e3e9e1 100644
--- a/CUDADataFormats/Track/src/classes.h
+++ b/CUDADataFormats/Track/src/classes.h
@@ -2,7 +2,7 @@
 #define CUDADataFormats_Track_src_classes_h
 
 #include "CUDADataFormats/Common/interface/Product.h"
-#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
 #include "DataFormats/Common/interface/Wrapper.h"
diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml
index 9c80ae91baf29..c95568b18f1f3 100644
--- a/CUDADataFormats/Track/src/classes_def.xml
+++ b/CUDADataFormats/Track/src/classes_def.xml
@@ -1,6 +1,7 @@
-
-
-
-
+
+
+
+
+
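(The XML dictionary entries above were lost in extraction and are left blank rather than guessed.) Because memoryPool::Buffer keeps the familiar smart-pointer surface (get(), operator*, operator->), the alias switch is source compatible for most consumers. A minimal sketch, with an illustrative function name:

#include <cassert>
#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"

// hypothetical consumer: PixelTrackHeterogeneous is now memoryPool::Buffer<pixelTrack::TrackSoA>
void useTracks(PixelTrackHeterogeneous const& tracks) {
  pixelTrack::TrackSoA const* soa = tracks.get();  // same call sites as with HeterogeneousSoA
  assert(soa != nullptr);
}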
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
index 8ce37f280ac6c..2a413ecb2a370 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
@@ -2,10 +2,10 @@
 #define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h
 
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
 
-template <typename Traits>
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
+
 class TrackingRecHit2DHeterogeneous {
 public:
   enum class Storage32 {
@@ -30,23 +30,28 @@ class TrackingRecHit2DHeterogeneous {
   };
 
   template <typename T>
-  using unique_ptr = typename Traits::template unique_ptr<T>;
+  using Buffer = memoryPool::Buffer<T>;
 
   using PhiBinner = TrackingRecHit2DSOAView::PhiBinner;
 
   TrackingRecHit2DHeterogeneous() = default;
 
-  explicit TrackingRecHit2DHeterogeneous(
-      uint32_t nHits,
-      bool isPhase2,
-      int32_t offsetBPIX2,
-      pixelCPEforGPU::ParamsOnGPU const* cpeParams,
-      uint32_t const* hitsModuleStart,
-      cudaStream_t stream,
-      TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);
-
-  explicit TrackingRecHit2DHeterogeneous(
-      float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream = nullptr);
+  TrackingRecHit2DHeterogeneous(uint32_t nHits,
+                                bool isPhase2,
+                                int32_t offsetBPIX2,
+                                pixelCPEforGPU::ParamsOnGPU const* cpeParams,
+                                uint32_t const* hitsModuleStart,
+                                memoryPool::Where where,
+                                cudaStream_t stream,
+                                TrackingRecHit2DHeterogeneous const* input = nullptr);
+
+  // used on CPU only
+  TrackingRecHit2DHeterogeneous(float* store32,
+                                uint16_t* store16,
+                                uint32_t* modules,
+                                int nHits,
+                                memoryPool::Where where = memoryPool::onCPU,
+                                cudaStream_t stream = nullptr);
 
   ~TrackingRecHit2DHeterogeneous() = default;
 
   TrackingRecHit2DHeterogeneous(const TrackingRecHit2DHeterogeneous&) = delete;
@@ -67,28 +72,28 @@ class TrackingRecHit2DHeterogeneous {
   auto phiBinnerStorage() { return m_phiBinnerStorage; }
   auto iphi() { return m_iphi; }
 
-  cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;
+  Buffer<float> localCoordToHostAsync(cudaStream_t stream) const;
 
-  cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;
+  Buffer<uint32_t> hitsModuleStartToHostAsync(cudaStream_t stream) const;
 
-  cms::cuda::host::unique_ptr<uint16_t[]> store16ToHostAsync(cudaStream_t stream) const;
-  cms::cuda::host::unique_ptr<float[]> store32ToHostAsync(cudaStream_t stream) const;
+  Buffer<uint16_t> store16ToHostAsync(cudaStream_t stream) const;
+  Buffer<float> store32ToHostAsync(cudaStream_t stream) const;
 
-  // needs specialization for Host
-  void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);
+  // needed for Host
+  void copyFromGPU(TrackingRecHit2DHeterogeneous const* input, cudaStream_t stream);
 
 private:
   static constexpr uint32_t n16 = 4;   // number of elements in m_store16
   static constexpr uint32_t n32 = 10;  // number of elements in m_store32
   static_assert(sizeof(uint32_t) == sizeof(float));  // just stating the obvious
   static_assert(n32 == static_cast<uint32_t>(Storage32::kLayers));
 
-  unique_ptr<uint16_t[]> m_store16;  //!
-  unique_ptr<float[]> m_store32;     //!
+  Buffer<uint16_t> m_store16;  //!
+  Buffer<float> m_store32;     //!
 
-  unique_ptr<TrackingRecHit2DSOAView::PhiBinner> m_PhiBinnerStore;              //!
-  unique_ptr<TrackingRecHit2DSOAView::AverageGeometry> m_AverageGeometryStore;  //!
+  Buffer<TrackingRecHit2DSOAView::PhiBinner> m_PhiBinnerStore;              //!
+  Buffer<TrackingRecHit2DSOAView::AverageGeometry> m_AverageGeometryStore;  //!
 
-  unique_ptr<TrackingRecHit2DSOAView> m_view;  //!
+  Buffer<TrackingRecHit2DSOAView> m_view;  //!
 
   uint32_t m_nHits;
   int32_t m_offsetBPIX2;
@@ -103,175 +108,8 @@ class TrackingRecHit2DHeterogeneous {
   int16_t* m_iphi;
 };
 
-using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
-using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::CPUTraits>;
-using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::HostTraits>;
-
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-
-template <typename Traits>
-TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
-    uint32_t nHits,
-    bool isPhase2,
-    int32_t offsetBPIX2,
-    pixelCPEforGPU::ParamsOnGPU const* cpeParams,
-    uint32_t const* hitsModuleStart,
-    cudaStream_t stream,
-    TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input)
-    : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {
-  auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
-
-  m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
-
-  view->m_nHits = nHits;
-  view->m_nMaxModules = m_nMaxModules;
-  m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream);  // leave it on host and pass it by value?
-  m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
-  view->m_averageGeometry = m_AverageGeometryStore.get();
-  view->m_cpeParams = cpeParams;
-  view->m_hitsModuleStart = hitsModuleStart;
-
-  // if empty do not bother
-  if (0 == nHits) {
-    if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
-      cms::cuda::copyAsync(m_view, view, stream);
-    } else {
-      m_view.reset(view.release());  // NOLINT: std::move() breaks CUDA version
-    }
-    return;
-  }
-
-  // the single arrays are not 128 bit aligned...
-  // the hits are actually accessed in order only in building
-  // if ordering is relevant they may have to be stored phi-ordered by layer or so
-  // this will break 1to1 correspondence with cluster and module locality
-  // so unless proven VERY inefficient we keep it ordered as generated
-
-  // host copy is "reduced" (to be reviewed at some point)
-  if constexpr (std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
-    // it has to compile for ALL cases
-    copyFromGPU(input, stream);
-  } else {
-    assert(input == nullptr);
-
-    auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;
-
-    m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
-    m_store32 = Traits::template make_unique<float[]>(nHits * n32 + nL + 1, stream);
-    m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
-  }
-
-  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
-  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));
-
-  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
-
-  // copy all the pointers
-  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
-  m_phiBinnerStorage = view->m_phiBinnerStorage =
-      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
-
-  view->m_xl = get32(Storage32::kXLocal);
-  view->m_yl = get32(Storage32::kYLocal);
-  view->m_xerr = get32(Storage32::kXerror);
-  view->m_yerr = get32(Storage32::kYerror);
-  view->m_chargeAndStatus = reinterpret_cast<SiPixelHitStatusAndCharge*>(get32(Storage32::kCharge));
-
-  if constexpr (!std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
-    assert(input == nullptr);
-    view->m_xg = get32(Storage32::kXGlobal);
-    view->m_yg = get32(Storage32::kYGlobal);
-    view->m_zg = get32(Storage32::kZGlobal);
-    view->m_rg = get32(Storage32::kRGlobal);
-
-    auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
-    m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
-
-    view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
-    view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
-    view->m_detInd = get16(Storage16::kDetId);
-
-    m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
-    m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
-  }
-
-  // transfer view
-  if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
-    cms::cuda::copyAsync(m_view, view, stream);
-  } else {
-    m_view.reset(view.release());  // NOLINT: std::move() breaks CUDA version
-  }
-}
-
-//this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases
-template <typename Traits>
-TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
-    float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream)
-    : m_nHits(nHits), m_hitsModuleStart(modules) {
-  auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
-
-  m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream);
-
-  view->m_nHits = nHits;
-
-  if (0 == nHits) {
-    if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
-      cms::cuda::copyAsync(m_view, view, stream);
-    } else {
-      m_view = std::move(view);
-    }
-    return;
-  }
-
-  m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
-  m_store32 = Traits::template make_unique<float[]>(nHits * n32, stream);
-  m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
-  m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
-
-  view->m_averageGeometry = m_AverageGeometryStore.get();
-  view->m_hitsModuleStart = m_hitsModuleStart;
-
-  //store transfer
-  if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
-    cms::cuda::copyAsync(m_store16, store16, stream);
-    cms::cuda::copyAsync(m_store32, store32, stream);
-  } else {
-    std::copy(store32, store32 + nHits * n32, m_store32.get());  // want to copy it
-    std::copy(store16, store16 + nHits * n16, m_store16.get());
-  }
-
-  //getters
-  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
-  auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
-
-  //Store 32
-  view->m_xl = get32(Storage32::kXLocal);
-  view->m_yl = get32(Storage32::kYLocal);
-  view->m_xerr = get32(Storage32::kXerror);
-  view->m_yerr = get32(Storage32::kYerror);
-  view->m_chargeAndStatus = reinterpret_cast<SiPixelHitStatusAndCharge*>(get32(Storage32::kCharge));
-  view->m_xg = get32(Storage32::kXGlobal);
-  view->m_yg = get32(Storage32::kYGlobal);
-  view->m_zg = get32(Storage32::kZGlobal);
-  view->m_rg = get32(Storage32::kRGlobal);
-
-  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
-  m_phiBinnerStorage = view->m_phiBinnerStorage =
-      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
-
-  //Store 16
-  view->m_detInd = get16(Storage16::kDetId);
-  m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
-  view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
-  view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
-
-  // transfer view
-  if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
-    cms::cuda::copyAsync(m_view, view, stream);
-  } else {
-    m_view = std::move(view);
-  }
-}
+using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous;
+using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous;
+using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous;
 
 #endif  // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h
index f252ca94d2296..ff56e845dd19c 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h
@@ -1,14 +1,14 @@
 #ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h
 #define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h
 
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
-#include "CUDADataFormats/Common/interface/HostProduct.h"
 
 // a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction
 class TrackingRecHit2DReduced {
 public:
-  using HLPstorage = HostProduct<float[]>;
-  using HIDstorage = HostProduct<uint16_t[]>;
+  using HLPstorage = memoryPool::Buffer<float>;
+  using HIDstorage = memoryPool::Buffer<uint16_t>;
 
   template <typename UP32, typename UP16>
   TrackingRecHit2DReduced(UP32&& istore32, UP16&& istore16, int nhits)
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
index 39ee136189955..748b8ec6ad593 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
@@ -4,6 +4,7 @@
 #include <cuda_runtime.h>
 
 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
@@ -25,7 +26,6 @@ class TrackingRecHit2DSOAView {
 
   using AverageGeometry = pixelTopology::AverageGeometry;
 
-  template <typename Traits>
   friend class TrackingRecHit2DHeterogeneous;
   friend class TrackingRecHit2DReduced;
diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
index fc6a05ba9ed3e..632157b9d8e93 100644
--- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
+++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
@@ -1,40 +1,215 @@
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+
+TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nHits,
+                                                             bool isPhase2,
+                                                             int32_t offsetBPIX2,
+                                                             pixelCPEforGPU::ParamsOnGPU const* cpeParams,
+                                                             uint32_t const* hitsModuleStart,
+                                                             memoryPool::Where where,
+                                                             cudaStream_t stream,
+                                                             TrackingRecHit2DHeterogeneous const* input)
+    : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {
+  using namespace memoryPool::cuda;
+
+  memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<BundleDelete>(stream, where));
+  assert(deleter.pool());
+  auto view = makeBuffer<TrackingRecHit2DSOAView>(
+      1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);
+  assert(view.get());
+  m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
+  assert(view.get());
+  view->m_nHits = nHits;
+  view->m_nMaxModules = m_nMaxModules;
+  m_view = makeBuffer<TrackingRecHit2DSOAView>(
+      1, deleter);  // stream, where); // deleter); // leave it on host and pass it by value?
+  assert(m_view.get());
+  m_AverageGeometryStore = makeBuffer<TrackingRecHit2DSOAView::AverageGeometry>(1, deleter);
+  view->m_averageGeometry = m_AverageGeometryStore.get();
+  view->m_cpeParams = cpeParams;
+  view->m_hitsModuleStart = hitsModuleStart;
+
+  // if empty do not bother
+  if (0 == nHits) {
+    if (memoryPool::onDevice == where) {
+      cudaCheck(memoryPool::cuda::copy(m_view, view, 1, stream));
+    } else {
+      m_view.reset(view.release());
+    }
+    return;
+  }
+
+  // the single arrays are not 128 bit aligned...
+  // the hits are actually accessed in order only in building
+  // if ordering is relevant they may have to be stored phi-ordered by layer or so
+  // this will break 1to1 correspondence with cluster and module locality
+  // so unless proven VERY inefficient we keep it ordered as generated
+
+  // host copy is "reduced" (to be reviewed at some point)
+  if (memoryPool::onHost == where) {
+    // it has to compile for ALL cases
+    copyFromGPU(input, stream);
+  } else {
+    assert(input == nullptr);
+
+    auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;
+
+    m_store16 = makeBuffer<uint16_t>(nHits * n16, deleter);
+    m_store32 = makeBuffer<float>(nHits * n32 + nL + 1, deleter);
+    m_PhiBinnerStore = makeBuffer<TrackingRecHit2DSOAView::PhiBinner>(1, deleter);
+  }
+
+  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
+  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));
+
+  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
+
+  // copy all the pointers
+  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
+  m_phiBinnerStorage = view->m_phiBinnerStorage =
+      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
+
+  view->m_xl = get32(Storage32::kXLocal);
+  view->m_yl = get32(Storage32::kYLocal);
+  view->m_xerr = get32(Storage32::kXerror);
+  view->m_yerr = get32(Storage32::kYerror);
+  view->m_chargeAndStatus = reinterpret_cast<SiPixelHitStatusAndCharge*>(get32(Storage32::kCharge));
+
+  if (memoryPool::onHost != where) {
+    assert(input == nullptr);
+    view->m_xg = get32(Storage32::kXGlobal);
+    view->m_yg = get32(Storage32::kYGlobal);
+    view->m_zg = get32(Storage32::kZGlobal);
+    view->m_rg = get32(Storage32::kRGlobal);
+
+    auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
+    m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
+
+    view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
+    view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
+    view->m_detInd = get16(Storage16::kDetId);
+
+    m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
+    m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
+  }
+
+  // transfer view
+  if (memoryPool::onDevice == where) {
+    cudaCheck(
+        cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
+    // cudaCheck(memoryPool::cuda::copy(m_view, view, 1, stream));
+  } else {
+    m_view.reset(view.release());
+  }
+}
+
+//this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases
+TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(
+    float* store32, uint16_t* store16, uint32_t* modules, int nHits, memoryPool::Where where, cudaStream_t stream)
+    : m_nHits(nHits), m_hitsModuleStart(modules) {
+  using namespace memoryPool::cuda;
+  auto view = makeBuffer<TrackingRecHit2DSOAView>(
+      1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);
+
+  m_view = makeBuffer<TrackingRecHit2DSOAView>(1, stream, where);
+
+  view->m_nHits = nHits;
+
+  if (0 == nHits) {
+    if (memoryPool::onDevice == where) {
+      cudaCheck(memoryPool::cuda::copy(m_view, view, 1, stream));
+    } else {
+      m_view = std::move(view);
+    }
+    return;
+  }
+
+  m_store16 = makeBuffer<uint16_t>(nHits * n16, stream, where);
+  m_store32 = makeBuffer<float>(nHits * n32, stream, where);
+  m_PhiBinnerStore = makeBuffer<TrackingRecHit2DSOAView::PhiBinner>(1, stream, where);
+  m_AverageGeometryStore = makeBuffer<TrackingRecHit2DSOAView::AverageGeometry>(1, stream, where);
+
+  view->m_averageGeometry = m_AverageGeometryStore.get();
+  view->m_hitsModuleStart = m_hitsModuleStart;
+
+  //store transfer
+  if (memoryPool::onDevice == where) {
+    cudaCheck(cudaMemcpyAsync(m_store32.get(), store32, nHits * n32 * sizeof(float), cudaMemcpyHostToDevice, stream));
+    cudaCheck(
+        cudaMemcpyAsync(m_store16.get(), store16, nHits * n16 * sizeof(uint16_t), cudaMemcpyHostToDevice, stream));
+  } else {
+    std::copy(store32, store32 + nHits * n32, m_store32.get());  // want to copy it
+    std::copy(store16, store16 + nHits * n16, m_store16.get());
+  }
+
+  //getters
+  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
+  auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
+
+  //Store 32
+  view->m_xl = get32(Storage32::kXLocal);
+  view->m_yl = get32(Storage32::kYLocal);
+  view->m_xerr = get32(Storage32::kXerror);
+  view->m_yerr = get32(Storage32::kYerror);
+  view->m_chargeAndStatus = reinterpret_cast<SiPixelHitStatusAndCharge*>(get32(Storage32::kCharge));
+  view->m_xg = get32(Storage32::kXGlobal);
+  view->m_yg = get32(Storage32::kYGlobal);
+  view->m_zg = get32(Storage32::kZGlobal);
+  view->m_rg = get32(Storage32::kRGlobal);
+
+  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
+  m_phiBinnerStorage = view->m_phiBinnerStorage =
+      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
+
+  //Store 16
+  view->m_detInd = get16(Storage16::kDetId);
+  m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
+  view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
+  view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
+
+  // transfer view
+  if (memoryPool::onDevice == where) {
+    cudaCheck(memoryPool::cuda::copy(m_view, view, 1, stream));
+  } else {
+    m_view = std::move(view);
+  }
+}
+
+using namespace memoryPool::cuda;
 
-template <>
-cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream);
-  cms::cuda::copyAsync(ret, m_store32, 5 * nHits(), stream);
+memoryPool::Buffer<float> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
+  auto ret = makeBuffer<float>(5 * nHits(), stream, memoryPool::onHost);
+  cudaCheck(cudaMemcpyAsync(ret.get(), m_store32.get(), 5 * sizeof(float) * nHits(), cudaMemcpyDeviceToHost, stream));
   return ret;
 }
 
-template <>
-cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<float[]>(static_cast<int>(n32) * nHits(), stream);
-  cms::cuda::copyAsync(ret, m_store32, static_cast<int>(n32) * nHits(), stream);
+memoryPool::Buffer<float> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
+  auto ret = makeBuffer<float>(static_cast<int>(n32) * nHits(), stream, memoryPool::onHost);
+  cudaCheck(cudaMemcpyAsync(
+      ret.get(), m_store32.get(), static_cast<int>(n32) * sizeof(float) * nHits(), cudaMemcpyDeviceToHost, stream));
   return ret;
 }
 
-template <>
-cms::cuda::host::unique_ptr<uint16_t[]> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<uint16_t[]>(static_cast<int>(n16) * nHits(), stream);
-  cms::cuda::copyAsync(ret, m_store16, static_cast<int>(n16) * nHits(), stream);
+memoryPool::Buffer<uint16_t> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
+  auto ret = makeBuffer<uint16_t>(static_cast<int>(n16) * nHits(), stream, memoryPool::onHost);
+  cudaCheck(cudaMemcpyAsync(
+      ret.get(), m_store16.get(), static_cast<int>(n16) * sizeof(uint16_t) * nHits(), cudaMemcpyDeviceToHost, stream));
   return ret;
 }
 
-template <>
-cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<uint32_t[]>(nMaxModules() + 1, stream);
-  cudaCheck(
-      cudaMemcpyAsync(ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDefault, stream));
+memoryPool::Buffer<uint32_t> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
+  auto ret = makeBuffer<uint32_t>(nMaxModules() + 1, stream, memoryPool::onHost);
+  if (m_hitsModuleStart)
+    cudaCheck(cudaMemcpyAsync(
+        ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDeviceToHost, stream));
   return ret;
 }
 
-// the only specialization needed
-template <>
 void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) {
   assert(input);
   m_store32 = input->localCoordToHostAsync(stream);
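Note the size convention used in the view transfers above: memoryPool::cuda::copy (defined in cudaMemoryPool.h below) multiplies its size argument by sizeof(T), so it takes an element count, not a byte count. Passing sizeof(TrackingRecHit2DSOAView) as the count would copy sizeof squared bytes; single-struct transfers therefore pass 1:

cudaCheck(memoryPool::cuda::copy(m_view, view, 1, stream));                // one TrackingRecHit2DSOAView
cudaCheck(memoryPool::cuda::copy(data, data_d, error_h->size(), stream));  // size() elements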
diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp
index 8aca68e294469..e85878c05e2a6 100644
--- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp
+++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp
@@ -1,9 +1,11 @@
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 
+#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+
 namespace testTrackingRecHit2D {
 
   void runKernels(TrackingRecHit2DSOAView* hits);
@@ -12,6 +14,7 @@ namespace testTrackingRecHit2D {
 
 int main() {
   cms::cudatest::requireDevices();
+  memoryPool::cuda::init(false);
 
   cudaStream_t stream;
   cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
@@ -19,26 +22,54 @@ int main() {
   auto nHits = 200;
   // inner scope to deallocate memory before destroying the stream
   {
-    TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, stream);
+    TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, memoryPool::onDevice, stream);
     testTrackingRecHit2D::runKernels(tkhit.view());
 
-    TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, stream);
+    TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, memoryPool::onDevice, stream);
     testTrackingRecHit2D::runKernels(tkhitPhase2.view());
 
-    TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, stream, &tkhit);
+    memoryPool::cuda::dumpStat();
+
+    TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, memoryPool::onHost, stream, &tkhit);
     cudaStreamSynchronize(stream);
+    memoryPool::cuda::dumpStat();
+
     assert(tkhitH.view());
     assert(tkhitH.view()->nHits() == unsigned(nHits));
     assert(tkhitH.view()->nMaxModules() == phase1PixelTopology::numberOfModules);
 
-    TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, stream, &tkhit);
+    TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, memoryPool::onHost, stream, &tkhitPhase2);
     cudaStreamSynchronize(stream);
     assert(tkhitHPhase2.view());
     assert(tkhitHPhase2.view()->nHits() == unsigned(nHits));
     assert(tkhitHPhase2.view()->nMaxModules() == phase2PixelTopology::numberOfModules);
+
+    memoryPool::cuda::dumpStat();
   }
 
+  cudaCheck(cudaStreamSynchronize(stream));
+  memoryPool::cuda::dumpStat();
+
+  std::cout << "on CPU" << std::endl;
+  ((SimplePoolAllocatorImpl<PosixAlloc>*)memoryPool::cuda::getPool(memoryPool::onCPU))->dumpStat();
+
   cudaCheck(cudaStreamDestroy(stream));
 
+  memoryPool::cuda::dumpStat();
+
+  {
+    TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, memoryPool::onCPU, nullptr);
+    assert(tkhit.view());
+    assert(tkhit.view()->nHits() == unsigned(nHits));
+    assert(tkhit.view()->nMaxModules() == phase1PixelTopology::numberOfModules);
+    std::cout << "on CPU" << std::endl;
+    ((SimplePoolAllocatorImpl<PosixAlloc>*)memoryPool::cuda::getPool(memoryPool::onCPU))->dumpStat();
+  }
+  std::cout << "on CPU" << std::endl;
+  ((SimplePoolAllocatorImpl<PosixAlloc>*)memoryPool::cuda::getPool(memoryPool::onCPU))->dumpStat();
+
+  memoryPool::cuda::shutdown();
+
   return 0;
 }
diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
index 417a960951fb1..0faeabf609da1 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
@@ -2,9 +2,9 @@
 #define CUDADataFormatsVertexZVertexHeterogeneous_H
 
 #include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 
-using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
+using ZVertexHeterogeneous = memoryPool::Buffer<ZVertexSoA>;
 #ifndef __CUDACC__
 #include "CUDADataFormats/Common/interface/Product.h"
 using ZVertexCUDAProduct = cms::cuda::Product<ZVertexHeterogeneous>;
diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml
index ea633080af9af..a0886b0cc0797 100644
--- a/CUDADataFormats/Vertex/src/classes_def.xml
+++ b/CUDADataFormats/Vertex/src/classes_def.xml
@@ -2,5 +2,6 @@
+
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc
index 4037b4d50612c..ba23465e6f753 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc
@@ -9,7 +9,6 @@
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -27,7 +26,7 @@ class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
   edm::EDGetTokenT<cms::cuda::Product<SiPixelDigiErrorsCUDA>> digiErrorGetToken_;
   edm::EDPutTokenT<SiPixelErrorsSoA> digiErrorPutToken_;
 
-  cms::cuda::host::unique_ptr<SiPixelErrorCompact[]> data_;
+  memoryPool::Buffer<SiPixelErrorCompact> data_;
   cms::cuda::SimpleVector<SiPixelErrorCompact> error_;
   const SiPixelFormatterErrors* formatterErrors_ = nullptr;
 };
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc
index 0702bc4830c7c..4cc99064e43cf 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc
@@ -9,7 +9,6 @@
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -27,7 +26,7 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
   edm::EDGetTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiGetToken_;
   edm::EDPutTokenT<SiPixelDigisSoA> digiPutToken_;
 
-  cms::cuda::host::unique_ptr<uint16_t[]> store_;
+  memoryPool::Buffer<uint16_t> store_;
 
   int nDigis_;
 };
diff --git a/HeterogeneousCore/CUDACore/src/ScopedContext.cc b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
index ccf7995a20061..0b547c4baecce 100644
--- a/HeterogeneousCore/CUDACore/src/ScopedContext.cc
+++ b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
@@ -27,6 +27,8 @@ namespace {
     try {
       auto error = cudaGetErrorName(status);
       auto message = cudaGetErrorString(status);
+      std::cout << "Callback of CUDA stream " << streamId << " in device " << device << " error " << error << ": "
+                << message << std::endl;
       throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device
                                         << " error " << error << ": " << message;
     } catch (cms::Exception&) {
diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
index 346c81267ec49..fa8d115ed1fd2 100644
--- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -16,6 +16,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
@@ -135,6 +136,8 @@ CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
   bool configEnabled = config.getUntrackedParameter<bool>("enabled");
   if (not configEnabled) {
     edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
+    // enable cpu memory pool
+    memoryPool::cuda::init(true);
     return;
   }
@@ -361,6 +364,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
   cms::cuda::getEventCache().clear();
   cms::cuda::getStreamCache().clear();
 
+  // enable memory pool
+  memoryPool::cuda::init(false);
+
   if (verbose_) {
     log << '\n' << "CUDAService fully initialized";
   }
@@ -381,6 +387,9 @@ CUDAService::~CUDAService() {
   cms::cuda::getEventCache().clear();
   cms::cuda::getStreamCache().clear();
 
+  // destroy cpu memory pool
+  memoryPool::cuda::shutdown();
+
   for (int i = 0; i < numberOfDevices_; ++i) {
     cudaCheck(cudaSetDevice(i));
     cudaCheck(cudaDeviceSynchronize());
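The lifecycle wired into CUDAService above is the same one the unit test exercises by hand; in standalone code it looks like this (a sketch; in CMSSW jobs CUDAService does the init/shutdown):

#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

int main() {
  memoryPool::cuda::init(false);  // false: CPU, host and device pools; true: CPU pool only
  {
    // ... create Buffers on streams, run kernels; Buffers must not outlive the pools ...
  }
  memoryPool::cuda::dumpStat();  // optional accounting printout
  memoryPool::cuda::shutdown();  // destroys the pools
  return 0;
}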
diff --git a/HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h b/HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h
new file mode 100644
index 0000000000000..c80af3c60be98
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h
@@ -0,0 +1,155 @@
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+// #define MEMORY_POOL_DEBUG
+
+namespace poolDetails {
+
+  constexpr int bucket(uint64_t s) { return 64 - __builtin_clzl(s - 1); }
+  constexpr uint64_t bucketSize(int b) { return 1LL << b; }
+
+};  // namespace poolDetails
+
+class SimplePoolAllocator;
+
+namespace memoryPool {
+  struct Payload {
+    SimplePoolAllocator *pool;
+    std::vector<int> buckets;
+  };
+}  // namespace memoryPool
+
+class SimplePoolAllocator {
+public:
+  using Pointer = void *;
+
+  virtual ~SimplePoolAllocator() = default;
+
+  virtual Pointer doAlloc(size_t size) = 0;
+  virtual void doFree(Pointer ptr) = 0;
+  virtual void scheduleFree(memoryPool::Payload *payload, void *stream) = 0;
+
+  SimplePoolAllocator(int maxSlots) : m_maxSlots(maxSlots) {
+    for (auto &p : m_used)
+      p.v = true;
+  }
+
+  int size() const { return m_size; }
+
+  Pointer pointer(int i) const { return m_slots[i]; }
+
+  void dumpStat() const;
+
+  void free(int i) {
+#ifdef MEMORY_POOL_DEBUG
+    m_last[i] = -1;
+#endif
+    m_used[i].v = false;
+  }
+
+  int alloc(uint64_t s) {
+    auto i = allocImpl(s);
+
+    //test garbage
+    // if(totBytes>4507964512) garbageCollect();
+
+    if (i >= 0) {
+#ifdef MEMORY_POOL_DEBUG
+      assert(m_used[i].v);
+      if (nullptr == m_slots[i])
+        std::cout << "race ??? " << i << ' ' << m_bucket[i] << ' ' << m_last[i] << std::endl;
+      assert(m_slots[i]);
+#endif
+      return i;
+    }
+    garbageCollect();
+    i = allocImpl(s);
+    if (i >= 0) {
+      assert(m_used[i].v);
+      assert(m_slots[i]);
+#ifdef MEMORY_POOL_DEBUG
+      assert(m_last[i] >= 0);
+#endif
+    }
+    return i;
+  }
+
+protected:
+  int allocImpl(uint64_t s);
+  int createAt(int ls, int b);
+  void garbageCollect();
+  int useOld(int b);
+
+private:
+  const int m_maxSlots;
+
+#ifdef MEMORY_POOL_DEBUG
+  std::vector<int> m_last = std::vector<int>(m_maxSlots, -2);
+#endif
+
+  std::vector<int> m_bucket = std::vector<int>(m_maxSlots, -1);
+  std::vector<Pointer> m_slots = std::vector<Pointer>(m_maxSlots, nullptr);
+  struct alBool {
+    alignas(64) std::atomic<bool> v;
+  };
+  std::vector<alBool> m_used = std::vector<alBool>(m_maxSlots);
+  std::atomic<int> m_size = 0;
+
+  std::atomic<uint64_t> totBytes = 0;
+  std::atomic<uint64_t> nAlloc = 0;
+  std::atomic<uint64_t> nFree = 0;
+};
+
+namespace poolDetails {
+  // free callback
+  inline void freeAsync(memoryPool::Payload *payload) {
+    auto &pool = *(payload->pool);
+    auto const &buckets = payload->buckets;
+    for (auto i : buckets) {
+      pool.free(i);
+    }
+    delete payload;
+  }
+}  // namespace poolDetails
+
+template <typename T>
+struct SimplePoolAllocatorImpl final : public SimplePoolAllocator {
+  using Traits = T;
+
+  using SimplePoolAllocator::SimplePoolAllocator;
+
+  ~SimplePoolAllocatorImpl() override {
+    garbageCollect();
+#ifdef MEMORY_POOL_DEBUG
+    dumpStat();
+#endif
+  }
+
+  Pointer doAlloc(size_t size) override { return Traits::alloc(size); }
+  void doFree(Pointer ptr) override { Traits::free(ptr); }
+
+  void scheduleFree(memoryPool::Payload *payload, void *stream) override {
+    assert(payload->pool == this);
+    Traits::scheduleFree(payload, stream);
+  }
+};
+
+#include <cstdlib>
+struct PosixAlloc {
+  using Pointer = void *;
+
+  static Pointer alloc(size_t size) { return ::malloc(size); }
+  static void free(Pointer ptr) { ::free(ptr); }
+
+  static void scheduleFree(memoryPool::Payload *payload, void *) { poolDetails::freeAsync(payload); }
+};
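A worked example of the power-of-two bucket rounding (valid on GCC/Clang, where __builtin_clzl folds in constant expressions; note that an argument of 1 would reach __builtin_clzl(0), which is undefined, so callers are expected to request at least 2 bytes):

#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"

static_assert(poolDetails::bucket(1000) == 10);      // 1000 B -> 1024 B slot
static_assert(poolDetails::bucket(1024) == 10);      // exact powers of two fit
static_assert(poolDetails::bucket(1025) == 11);      // next size up: 2048 B
static_assert(poolDetails::bucketSize(10) == 1024);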
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h b/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h
new file mode 100644
index 0000000000000..b99d425eca25c
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h
@@ -0,0 +1,87 @@
+#pragma once
+#include "memoryPool.h"
+#include <cassert>
+
+// only for cudaStream_t
+#include <cuda_runtime.h>
+
+#include <vector>
+
+namespace memoryPool {
+  namespace cuda {
+
+    void init(bool onlyCPU = false);
+    void shutdown();
+
+    void dumpStat();
+
+    SimplePoolAllocator *getPool(Where where);
+
+    // allocate either on current device or on host
+    std::pair<void *, int> alloc(uint64_t size, SimplePoolAllocator &pool);
+
+    // schedule free
+    void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool);
+
+    template <typename T>
+    auto copy(Buffer<T> &dst, Buffer<T> const &src, uint64_t size, cudaStream_t stream) {
+      assert(dst.get());
+      assert(src.get());
+      assert(size > 0);
+      return cudaMemcpyAsync(dst.get(), src.get(), sizeof(T) * size, cudaMemcpyDefault, stream);
+    }
+
+    struct CudaDeleterBase : public DeleterBase {
+      CudaDeleterBase(cudaStream_t const &stream, Where where) : DeleterBase(getPool(where)), m_stream(stream) {}
+
+      CudaDeleterBase(cudaStream_t const &stream, SimplePoolAllocator *pool) : DeleterBase(pool), m_stream(stream) {}
+
+      ~CudaDeleterBase() override = default;
+
+      cudaStream_t m_stream;
+    };
+
+    struct DeleteOne final : public CudaDeleterBase {
+      using CudaDeleterBase::CudaDeleterBase;
+
+      ~DeleteOne() override = default;
+      void operator()(int bucket) override { free(m_stream, std::vector<int>(1, bucket), *pool()); }
+    };
+
+    struct BundleDelete final : public CudaDeleterBase {
+      BundleDelete(cudaStream_t const &stream, Where where) : CudaDeleterBase(stream, where) { m_buckets.reserve(8); }
+
+      ~BundleDelete() override { free(m_stream, std::move(m_buckets), *pool()); }
+
+      void operator()(int bucket) override { m_buckets.push_back(bucket); }
+
+      std::vector<int> m_buckets;
+    };
+
+    template <typename T>
+    Buffer<T> makeBuffer(uint64_t size, Deleter const &del) {
+      auto ret = alloc(sizeof(T) * size, *del.pool());
+      if (ret.second < 0) {
+        std::cout << "could not allocate " << size << ' ' << typeid(T).name() << " of size " << sizeof(T) << std::endl;
+        throw std::bad_alloc();
+      }
+      return Buffer<T>((T *)(ret.first), ret.second, del);
+    }
+
+    template <typename T>
+    Buffer<T> makeBuffer(uint64_t size, Deleter &&del) {
+      auto ret = alloc(sizeof(T) * size, *del.pool());
+      if (ret.second < 0) {
+        std::cout << "could not allocate " << size << ' ' << typeid(T).name() << " of size " << sizeof(T) << std::endl;
+        throw std::bad_alloc();
+      }
+      return Buffer<T>((T *)(ret.first), ret.second, std::move(del));
+    }
+
+    template <typename T>
+    Buffer<T> makeBuffer(uint64_t size, cudaStream_t const &stream, Where where) {
+      return makeBuffer<T>(size, Deleter(std::make_shared<DeleteOne>(stream, where)));
+    }
+
+  }  // namespace cuda
+}  // namespace memoryPool
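A sketch of the raw contract that makeBuffer wraps: alloc hands back a {pointer, bucket index} pair, a negative bucket signals exhaustion (which makeBuffer turns into std::bad_alloc), and free is stream-ordered via the pool's scheduleFree:

auto* pool = memoryPool::cuda::getPool(memoryPool::onDevice);
auto [ptr, bucket] = memoryPool::cuda::alloc(1024, *pool);  // 1024 bytes -> one bucket
if (bucket >= 0) {
  // ... enqueue work on `stream` that uses ptr ...
  memoryPool::cuda::free(stream, std::vector<int>{bucket}, *pool);  // recycled after stream work
}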
implementation!!!" << std::endl; + throw std::bad_alloc(); + } + if (bucket < 0) + std::cout << "delete with negative bucket!!!" << std::endl; + (*me)(bucket); + } + + SimplePoolAllocator* pool() const { return me->pool(); } + + private: + std::shared_ptr me; //! + }; + + template + class Buffer { + public: + typedef T value_type; + typedef T* pointer; + typedef T& reference; + typedef T const* const_pointer; + typedef T const& const_reference; + + Buffer() = default; + Buffer(T* p, int bucket) : m_p(p), m_bucket(bucket) {} + Buffer(T* p, int bucket, Deleter const& del) : m_deleter(del), m_p(p), m_bucket(bucket) {} + Buffer(T* p, int bucket, Deleter&& del) : m_deleter(del), m_p(p), m_bucket(bucket) {} + Buffer(std::pair const& rh, Deleter const& del) : m_deleter(del), m_p(rh.first), m_bucket(rh.second) {} + Buffer(Buffer const&) = delete; + Buffer& operator=(Buffer const&) = delete; + + template + Buffer(Buffer&& rh) : Buffer(rh.release(), rh.deleter()) {} + template + Buffer& operator=(Buffer&& rh) { + reset(rh.release()); + m_deleter = rh.deleter(); + return *this; + } + + ~Buffer() { + // assert(m_p == pool()->pointer(m_bucket)); + if (m_p) + m_deleter(m_bucket); + } + + pointer get() { return m_p; } + const_pointer get() const { return m_p; } + reference operator*() { return *m_p; } + const_reference operator*() const { return *m_p; } + pointer operator->() { return get(); } + const_pointer operator->() const { return get(); } + reference operator[](int i) { return m_p[i]; } + const_reference operator[](int i) const { return m_p[i]; } + + Deleter& deleter() { return m_deleter; } + Deleter const& deleter() const { return m_deleter; } + SimplePoolAllocator* pool() const { return deleter().pool(); } + + int bucket() const { return m_bucket; } + + std::pair release() { + auto ret = std::make_pair(m_p, m_bucket); + m_p = nullptr; + m_bucket = -1; + return ret; + } + void reset() { + if (m_p) + m_deleter(m_bucket); + m_p = nullptr; + m_bucket = -1; + } + void reset(std::pair const& rh) { + if (m_p) + m_deleter(m_bucket); + m_p = rh.first; + m_bucket = rh.second; + } + + private: + Deleter m_deleter; //! + pointer m_p = nullptr; //! + int m_bucket = -1; //! 
+  };
+
+}  // namespace memoryPool
diff --git a/HeterogeneousCore/CUDAUtilities/src/SimplePoolAllocator.cc b/HeterogeneousCore/CUDAUtilities/src/SimplePoolAllocator.cc
new file mode 100644
index 0000000000000..629d31c1f26d1
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/src/SimplePoolAllocator.cc
@@ -0,0 +1,136 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"
+
+int SimplePoolAllocator::allocImpl(uint64_t s) {
+  auto b = poolDetails::bucket(s);
+  assert(s <= poolDetails::bucketSize(b));
+  int ls = size();
+  // look for an existing slot
+  for (int i = 0; i < ls; ++i) {
+    if (b != m_bucket[i])
+      continue;
+    if (m_used[i].v)
+      continue;
+    bool exp = false;
+    if (m_used[i].v.compare_exchange_strong(exp, true)) {
+      // verify whether the garbage collector operated in the meantime
+      if (nullptr == m_slots[i]) {
+        assert(m_bucket[i] < 0);
+        m_used[i].v = false;
+        continue;
+      }
+#ifdef MEMORY_POOL_DEBUG
+      m_last[i] = 0;
+#endif
+      return i;
+    }
+  }
+
+  // try to create in an existing slot (if garbage has been collected)
+  ls = useOld(b);
+  if (ls >= 0)
+    return ls;
+
+  // try to allocate a new slot
+  if (m_size >= m_maxSlots)
+    return -1;
+  ls = m_size++;
+  if (ls >= m_maxSlots)
+    return -1;
+#ifdef MEMORY_POOL_DEBUG
+  m_last[ls] = 2;
+#endif
+  return createAt(ls, b);
+}
+
+int SimplePoolAllocator::createAt(int ls, int b) {
+  assert(m_used[ls].v);
+#ifdef MEMORY_POOL_DEBUG
+  assert(m_last[ls] > 0);
+#endif
+  m_bucket[ls] = b;
+  auto as = poolDetails::bucketSize(b);
+  assert(nullptr == m_slots[ls]);
+  m_slots[ls] = doAlloc(as);
+  if (nullptr == m_slots[ls]) {
+    m_bucket[ls] = -1;
+    m_used[ls].v = false;
+    return -1;
+  }
+  totBytes += as;
+  nAlloc++;
+  return ls;
+}
+
+void SimplePoolAllocator::garbageCollect() {
+  int ls = size();
+  for (int i = 0; i < ls; ++i) {
+    if (m_used[i].v)
+      continue;
+    if (m_bucket[i] < 0)
+      continue;
+    bool exp = false;
+    if (!m_used[i].v.compare_exchange_strong(exp, true))
+      continue;
+    assert(m_used[i].v);
+    if (nullptr != m_slots[i]) {
+      assert(m_bucket[i] >= 0);
+      doFree(m_slots[i]);
+      nFree++;
+      totBytes -= poolDetails::bucketSize(m_bucket[i]);
+    }
+    m_slots[i] = nullptr;
+    m_bucket[i] = -1;
+#ifdef MEMORY_POOL_DEBUG
+    m_last[i] = -3;
+#endif
+    m_used[i].v = false;  // this store acts as a memory fence as well
+  }
+}
+
+int SimplePoolAllocator::useOld(int b) {
+  int ls = size();
+  for (int i = 0; i < ls; ++i) {
+    if (m_bucket[i] >= 0)
+      continue;
+    if (m_used[i].v)
+      continue;
+    bool exp = false;
+    if (!m_used[i].v.compare_exchange_strong(exp, true))
+      continue;
+    if (nullptr != m_slots[i]) {  // oops: allocated and freed in the meantime
+      assert(m_bucket[i] >= 0);
+#ifdef MEMORY_POOL_DEBUG
+      assert(m_last[i] == -1);
+#endif
+      m_used[i].v = false;
+      continue;
+    }
+    assert(m_used[i].v);
+#ifdef MEMORY_POOL_DEBUG
+    m_last[i] = 1;
+#endif
+    return createAt(i, b);
+  }
+  return -1;
+}
+
+void SimplePoolAllocator::dumpStat() const {
+  uint64_t fn = 0;
+  uint64_t fs = 0;
+  int ls = size();
+  for (int i = 0; i < ls; ++i) {
+    if (m_used[i].v) {
+      auto b = m_bucket[i];
+      if (b < 0)
+        continue;
+      fn++;
+      fs += (1LL << b);
+    }
+  }
+  std::cout << "# slots " << size() << '\n'
+            << "# bytes " << totBytes << '\n'
+            << "# alloc " << nAlloc << '\n'
+            << "# free " << nFree << '\n'
+            << "# used " << fn << ' ' << fs << '\n'
+            << std::endl;
+}
diff --git a/HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc b/HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc
new file mode 100644
index 0000000000000..6ecfa47865c3e
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc
@@ -0,0 +1,148 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include
+#include
+#include
+
+#include
+
+namespace {
+
+  // free callback
+  void CUDART_CB freeCallback(cudaStream_t streamId, cudaError_t status, void *p) {
+    //void CUDART_CB freeCallback(void *p) {
+    if (status != cudaSuccess) {
+      std::cout << "Error in free callback in stream " << streamId << std::endl;
+      auto error = cudaGetErrorName(status);
+      auto message = cudaGetErrorString(status);
+      std::cout << " error " << error << ": " << message << std::endl;
+    }
+    // std::cout << "free callback for stream " << streamId << std::endl;
+    auto payload = (memoryPool::Payload *)(p);
+    poolDetails::freeAsync(payload);
+  }
+
+}  // namespace
+
+struct CudaAlloc {
+  static void scheduleFree(memoryPool::Payload *payload, void *stream) {
+    // std::cout << "schedule free for stream " << stream <> cpuPool;
+
+  std::unique_ptr> hostPool;
+
+  using DevicePool = SimplePoolAllocatorImpl;
+  std::vector> devicePools;
+
+  void initDevicePools(int size) {
+    int devices = 0;
+    auto status = cudaGetDeviceCount(&devices);
+    if (status == cudaSuccess && devices > 0) {
+      devicePools.reserve(devices);
+      for (int i = 0; i < devices; ++i)
+        devicePools.emplace_back(new DevicePool(size));
+    }
+  }
+
+  DevicePool *getDevicePool() {
+    int dev = -1;
+    cudaGetDevice(&dev);
+    return devicePools[dev].get();
+  }
+
+}  // namespace
+
+namespace memoryPool {
+  namespace cuda {
+
+    void init(bool onlyCPU) {
+      constexpr int poolSize = 128 * 1024;
+      cpuPool = std::make_unique>(poolSize);
+      if (onlyCPU)
+        return;
+      initDevicePools(poolSize);
+      hostPool = std::make_unique>(poolSize);
+    }
+
+    void shutdown() {
+      cpuPool.reset();
+      devicePools.clear();
+      hostPool.reset();
+    }
+
+    void dumpStat() {
+      std::cout << "device pool" << std::endl;
+      getDevicePool()->dumpStat();
+      std::cout << "host pool" << std::endl;
+      hostPool->dumpStat();
+    }
+
+    SimplePoolAllocator *getPool(Where where) {
+      return onCPU == where ? (SimplePoolAllocator *)(cpuPool.get())
+                            : (onDevice == where ?
(SimplePoolAllocator *)(getDevicePool()) + : (SimplePoolAllocator *)(hostPool.get())); + } + + // allocate either on current device or on host (actually anywhere, not cuda specific) + std::pair alloc(uint64_t size, SimplePoolAllocator &pool) { + int i = pool.alloc(size); + void *p = pool.pointer(i); + return std::pair(p, i); + } + + // schedule free + void free(cudaStream_t stream, std::vector buckets, SimplePoolAllocator &pool) { + auto payload = new Payload{&pool, std::move(buckets)}; + pool.scheduleFree(payload, stream); + } + + } // namespace cuda +} // namespace memoryPool diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 53d41efcf4236..9a01d343e5d53 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -129,4 +129,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/testPoolUI.cu b/HeterogeneousCore/CUDAUtilities/test/testPoolUI.cu new file mode 100644 index 0000000000000..417f42e606c42 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/testPoolUI.cu @@ -0,0 +1,101 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include + +template +struct dataProducer { + auto operator()(cudaStream_t stream) { return memoryPool::cuda::makeBuffer(20, stream, where); } +}; + +int main() { + { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess || 0 == devices) + return 0; + std::cout << "found " << devices << " cuda devices" << std::endl; + } + const int NUMTHREADS = 1; + + printf("Using CUDA %d\n", CUDART_VERSION); + int cuda_device = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, cuda_device); + printf("CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor); + + cudaStream_t streams[NUMTHREADS]; + + for (int i = 0; i < NUMTHREADS; i++) { + cudaStreamCreate(&(streams[i])); + } + + memoryPool::cuda::init(false); + memoryPool::cuda::dumpStat(); + + auto& stream = streams[0]; + + { + auto pd = memoryPool::cuda::makeBuffer(20, stream, memoryPool::onDevice); + auto ph = memoryPool::cuda::makeBuffer(20, stream, memoryPool::onHost); + auto pc = memoryPool::cuda::makeBuffer(20, nullptr, memoryPool::onCPU); + + auto dp = dataProducer()(stream); + + cudaCheck(memoryPool::cuda::copy(ph, pd, 20, stream)); + cudaCheck(memoryPool::cuda::copy(pd, ph, 20, stream)); + std::cout << "expect 2a 2u 1a 1u" << std::endl; + memoryPool::cuda::dumpStat(); + + { + auto ph = memoryPool::cuda::makeBuffer(20, stream, memoryPool::onHost); + cudaCheck(memoryPool::cuda::copy(pd, ph, 20, stream)); + } + cudaStreamSynchronize(stream); + std::cout << "expect 2a 2u 2a 1u " << std::endl; + memoryPool::cuda::dumpStat(); + { + auto ph = memoryPool::cuda::makeBuffer(20, stream, memoryPool::onHost); + cudaCheck(memoryPool::cuda::copy(pd, ph, 20, stream)); + } + std::cout << "expect 2a 2u 2a 1u " << std::endl; + cudaStreamSynchronize(stream); + memoryPool::cuda::dumpStat(); + } + std::cout << "expect 2a 0u 2a 0u " << std::endl; + cudaStreamSynchronize(stream); + memoryPool::cuda::dumpStat(); + + { + memoryPool::Deleter devDeleter(std::make_shared(stream, memoryPool::onDevice)); + memoryPool::Deleter hosDeleter(std::make_shared(stream, memoryPool::onHost)); + + auto p0 = memoryPool::cuda::makeBuffer(20, devDeleter); + auto p1 = memoryPool::cuda::makeBuffer(20, devDeleter); 
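+    // note: p0..p3 share the same BundleDelete deleter, so their buckets are
+    // collected and released together in a single scheduled free on the
+    // stream, once the last Buffer holding the shared deleter is destroyed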
+ auto p2 = memoryPool::cuda::makeBuffer(20, devDeleter); + auto p3 = memoryPool::cuda::makeBuffer(20, devDeleter); + + { + auto pd = memoryPool::cuda::makeBuffer(40, stream, memoryPool::onDevice); + p0.reset(pd.release()); + memoryPool::cuda::dumpStat(); + } + cudaStreamSynchronize(stream); + + auto hp0 = memoryPool::cuda::makeBuffer(40, hosDeleter); + auto hp1 = memoryPool::cuda::makeBuffer(20, hosDeleter); + auto hp2 = memoryPool::cuda::makeBuffer(20, hosDeleter); + auto hp3 = memoryPool::cuda::makeBuffer(20, hosDeleter); + + cudaCheck(memoryPool::cuda::copy(hp3, p3, 20, stream)); + cudaCheck(memoryPool::cuda::copy(p0, hp0, 40, stream)); + ; + + memoryPool::cuda::dumpStat(); + } + + cudaStreamSynchronize(stream); + memoryPool::cuda::dumpStat(); + memoryPool::cuda::shutdown(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/testPoolUImt.cu b/HeterogeneousCore/CUDAUtilities/test/testPoolUImt.cu new file mode 100644 index 0000000000000..e9b3e0b261a9b --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/testPoolUImt.cu @@ -0,0 +1,249 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" + +#include +#include + +#include +#include + +#include +#include +#include + +typedef std::thread Thread; +typedef std::vector ThreadGroup; +typedef std::mutex Mutex; +typedef std::lock_guard Lock; + +struct Node { + int it = -1; + int i = -1; + void *p = nullptr; +#ifdef __CUDACC__ + int c = 0; +#else + std::atomic c = 0; +#endif +}; + +#ifdef __CUDACC__ + +// generic callback +template +void CUDART_CB myCallback(void *fun) { + (*(F *)(fun))(); +} + +__global__ void kernel_set(int s, Node **p, int me) { + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = first; i < s; i += gridDim.x * blockDim.x) { + assert(p[i]); + auto n = p[i]; + n->it = me; + n->i = i; + n->p = p[i]; + n->c = 1; + } +} + +__global__ void kernel_test(int s, Node **p, int me) { + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = first; i < s; i += gridDim.x * blockDim.x) { + assert(p[i]); + auto n = p[i]; + atomicSub(&(n->c), 1); + assert(n->it == me); + assert(n->i == i); + assert(n->p == p[i]); + assert(0 == n->c); + } +} +#endif + +template +void go() { + auto start = std::chrono::high_resolution_clock::now(); + + const int NUMTHREADS = 24; + +#ifdef __CUDACC__ + printf("Using CUDA %d\n", CUDART_VERSION); + int cuda_device = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, cuda_device); + printf("CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor); + + cudaStream_t streams[NUMTHREADS]; + + for (int i = 0; i < NUMTHREADS; i++) { + cudaStreamCreate(&(streams[i])); + } + +#endif + + memoryPool::cuda::init(false); + + bool stop = false; + bool bin24 = false; + Thread monitor([&] { + int n = 10; + while (n--) { + sleep(5); + memoryPool::cuda::dumpStat(); + if (5 == n) + bin24 = true; + } + std::cout << "\nstop\n" << std::endl; + stop = true; + }); + + int s = 40; + { + std::cout << "try to allocate " << s << std::endl; + auto stream = streams[0]; + { + auto pd = memoryPool::cuda::makeBuffer(s, stream, where); + assert(pd.get()); + memoryPool::cuda::dumpStat(); + pd = memoryPool::cuda::makeBuffer(s, stream, where); + memoryPool::cuda::dumpStat(); + } + cudaStreamSynchronize(stream); + memoryPool::cuda::dumpStat(); + } + std::atomic nt = 0; + + auto test = [&] { + int const me = nt++; + auto delta = std::chrono::high_resolution_clock::now() - start; + + std::mt19937 eng(me + std::chrono::duration_cast(delta).count()); + 
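+    // stress pattern: each thread draws how many buffers to allocate per
+    // iteration (1..100) and a random bucket size (2^3..2^20, later up to
+    // 2^24, with an occasional 2 GB request) to exercise concurrent
+    // allocation, scheduled frees and the garbage collector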
std::uniform_int_distribution rgen1(1, 100); + std::uniform_int_distribution rgen20(3, 20); + std::uniform_int_distribution rgen24(3, 24); + std::cout << "first RN " << rgen1(eng) << " at " + << std::chrono::duration_cast(delta).count() << " in " << me << std::endl; + +#ifdef __CUDACC__ + Node **dp = nullptr; + Node **hp = nullptr; + cudaMalloc(&dp, 100 * sizeof(void *)); + assert(dp); + cudaMallocHost(&hp, 100 * sizeof(void *)); + assert(hp); +#endif + auto &stream = streams[me]; + + int iter = 0; + while (true) { + if (stop) + break; + iter++; + + memoryPool::Deleter devDeleter(std::make_shared(stream, where)); + auto n = rgen1(eng); + bool large = 0 == (iter % (128 + me)); + for (int k = 0; k < n; ++k) { + int b = bin24 ? rgen24(eng) : rgen20(eng); + // once in while let's allocate 2GB + if (large) { + b = 31; + large = false; + } + uint64_t s = 1LL << b; + assert(s > 0); + try { + auto p0 = memoryPool::cuda::makeBuffer(s / sizeof(Node) + sizeof(Node), devDeleter); + auto p = p0.get(); + if (nullptr == p) { + std::cout << "error not detected??? " << b << ' ' << std::endl; + memoryPool::cuda::dumpStat(); + } + assert(p); + hp[k] = p; + } catch (...) { + std::cout << "\n\n!!!Failed " << me << " at " << iter << std::endl; + cudaStreamSynchronize(stream); + memoryPool::cuda::dumpStat(); + return; + } + } +#ifdef __CUDACC__ + assert(n <= 100); + // do something??? + cudaMemcpyAsync(dp, hp, n * sizeof(void *), cudaMemcpyHostToDevice, stream); + kernel_set<<<1, 128, 0, stream>>>(n, dp, me); + kernel_test<<<1, 128, 0, stream>>>(n, dp, me); + + // better sync each "event" + // cudaStreamSynchronize(stream); +#else + // do something??? + for (int k = 0; k < n; ++k) { + auto p = hp[k]; + assert(p); + auto n = p; + n->it = me; + n->i = i; + n->p = p; + n->c = 1; + } + for (int k = 0; k < n; ++k) { + auto p = hp[k]; + assert(p); + auto n = p; + n->c--; + assert(n->it == me); + assert(n->i == i); + assert(n->p == p); + assert(0 == n->c); + } +#endif + } + cudaStreamSynchronize(stream); + }; + + ThreadGroup threads; + threads.reserve(NUMTHREADS); + + for (int i = 0; i < NUMTHREADS; ++i) { + threads.emplace_back(test); + } + + for (auto &t : threads) + t.join(); + + threads.clear(); + monitor.join(); + cudaDeviceSynchronize(); + std::cout << "\nfinished\n" << std::endl; + memoryPool::cuda::dumpStat(); + std::cout << "\nshutdown\n" << std::endl; + memoryPool::cuda::shutdown(); +} + +#ifdef __CUDACC__ +#include +#include + +#endif + +int main() { +#ifdef __CUDACC__ + { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess || 0 == devices) + return 0; + std::cout << "found " << devices << " cuda devices" << std::endl; + } + + std::cout << "\ntesting cuda device" << std::endl; + go(); +#else + std::cout << "testing posix" << std::endl; + go(); +#endif + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cpp b/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cpp new file mode 100644 index 0000000000000..1e57ad7312056 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cpp @@ -0,0 +1,304 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h" + +#include +#include + +#include +#include + +#include +#include +#include + +typedef std::thread Thread; +typedef std::vector ThreadGroup; +typedef std::mutex Mutex; +typedef std::lock_guard Lock; + +struct Node { + int it = -1; + int i = -1; + void *p = nullptr; +#ifdef __CUDACC__ + int c = 0; +#else + std::atomic c = 0; +#endif 
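+  // Node is the probe written into each allocated slot: the kernels record
+  // the writing thread and slot index, and c counts outstanding writers
+  // (a plain int on the device, a std::atomic when running on the host)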
+}; + +#ifdef __CUDACC__ + +// generic callback +template +void CUDART_CB myCallback(void *fun) { + (*(F *)(fun))(); +} + +__global__ void kernel_set(int s, void **p, int me) { + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = first; i < s; i += gridDim.x * blockDim.x) { + assert(p[i]); + auto n = (Node *)(p[i]); + n->it = me; + n->i = i; + n->p = p[i]; + n->c = 1; + } +} + +__global__ void kernel_test(int s, void **p, int me) { + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = first; i < s; i += gridDim.x * blockDim.x) { + assert(p + i); + auto n = (Node *)(p[i]); + atomicSub(&(n->c), 1); + assert(n->it == me); + assert(n->i == i); + assert(n->p == p[i]); + assert(0 == n->c); + } +} +#endif + +template +void go() { + auto start = std::chrono::high_resolution_clock::now(); + + const int NUMTHREADS = 24; + +#ifdef __CUDACC__ + printf("Using CUDA %d\n", CUDART_VERSION); + int cuda_device = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, cuda_device); + printf("CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor); + + cudaStream_t streams[NUMTHREADS]; + + for (int i = 0; i < NUMTHREADS; i++) { + cudaStreamCreate(&(streams[i])); + } + +#endif + + SimplePoolAllocatorImpl pool(128 * 1024); + assert(0 == pool.size()); + + bool stop = false; + bool bin24 = false; + Thread monitor([&] { + int n = 10; + while (n--) { + sleep(5); + pool.dumpStat(); + if (5 == n) + bin24 = true; + } + std::cout << "\nstop\n" << std::endl; + stop = true; + }); + + int s = 40; + + std::cout << "try to allocate " << s << std::endl; + + int i0 = pool.alloc(s); + assert(1 == pool.size()); + assert(i0 >= 0); + auto p0 = pool.pointer(i0); + assert(nullptr != p0); + + pool.free(i0); + assert(1 == pool.size()); + + int i1 = pool.alloc(s); + assert(1 == pool.size()); + assert(i1 == i0); + auto p1 = pool.pointer(i1); + assert(p1 == p0); + + std::atomic nt = 0; + + auto test = [&] { + int const me = nt++; + auto delta = std::chrono::high_resolution_clock::now() - start; + + std::mt19937 eng(me + std::chrono::duration_cast(delta).count()); + std::uniform_int_distribution rgen1(1, 100); + std::uniform_int_distribution rgen20(3, 20); + std::uniform_int_distribution rgen24(3, 24); + std::cout << "first RN " << rgen1(eng) << " at " + << std::chrono::duration_cast(delta).count() << " in " << me << std::endl; + +#ifdef __CUDACC__ + void **dp = nullptr; + void **hp = nullptr; + cudaMalloc(&dp, 100 * sizeof(void *)); + assert(dp); + cudaMallocHost(&hp, 100 * sizeof(void *)); + assert(hp); +#endif + + int iter = 0; + while (true) { + if (stop) + break; + iter++; + auto n = rgen1(eng); + int ind[n]; + bool large = 0 == (iter % (128 + me)); + for (auto &i : ind) { + int b = bin24 ? rgen24(eng) : rgen20(eng); + // once in while let's allocate 2GB + if (large) { + b = 31; + large = false; + } + uint64_t s = 1LL << b; + assert(s > 0); + i = pool.alloc(s + sizeof(Node)); + if (i < 0) { + std::cout << "\n\n!!!Failed " << me << " at " << iter << std::endl; + pool.dumpStat(); + return; + } + assert(i >= 0); + auto p = pool.pointer(i); + if (nullptr == p) { + std::cout << "error not detected??? " << b << ' ' << i << std::endl; + pool.dumpStat(); + } + assert(p); + } +#ifdef __CUDACC__ + assert(n <= 100); + auto &stream = streams[me]; + // do something??? 
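+      // copy the raw slot pointers to the device, let kernel_set/kernel_test
+      // write and verify a Node in each slot, and only then return the slots
+      // to the pool from a host callback, so they stay valid while the
+      // stream is still using them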
+ for (int k = 0; k < n; ++k) { + auto i = ind[k]; + hp[k] = pool.pointer(i); + } + cudaMemcpyAsync(dp, hp, n * sizeof(void *), cudaMemcpyHostToDevice, stream); + kernel_set<<<1, 128, 0, stream>>>(n, dp, me); + kernel_test<<<1, 128, 0, stream>>>(n, dp, me); + + // free + auto doFree = [&]() { + for (int k = 0; k < n; ++k) { + auto i = ind[k]; + pool.free(i); + } + }; + cudaLaunchHostFunc(stream, myCallback, &doFree); + + // better sync each "event" + cudaStreamSynchronize(stream); +#else + // do something??? + for (auto i : ind) { + auto p = pool.pointer(i); + assert(p); + auto n = (Node *)(p); + n->it = me; + n->i = i; + n->p = p; + n->c = 1; + } + for (auto i : ind) { + auto p = pool.pointer(i); + assert(p); + auto n = (Node *)(p); + n->c--; + assert(n->it == me); + assert(n->i == i); + assert(n->p == p); + assert(0 == n->c); + } + // free + for (auto i : ind) { + pool.free(i); + } +#endif + } + }; + + ThreadGroup threads; + threads.reserve(NUMTHREADS); + + for (int i = 0; i < NUMTHREADS; ++i) { + threads.emplace_back(test); + } + + for (auto &t : threads) + t.join(); + + threads.clear(); + monitor.join(); + std::cout << "\nfinished\n" << std::endl; + pool.dumpStat(); +} + +#ifdef __CUDACC__ +#include +#include + +// copy from implementation + +struct CudaAlloc { + // not called in this test (done inline) + static void scheduleFree(memoryPool::Payload *payload, void *stream) { + std::cout << "schedule free for stream " << stream << std::endl; + abort(); + } +}; + +struct CudaDeviceAlloc : public CudaAlloc { + using Pointer = void *; + + static Pointer alloc(size_t size) { + Pointer p = nullptr; + auto err = cudaMalloc(&p, size); + // std::cout << "alloc " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl; + return err == cudaSuccess ? p : nullptr; + } + static void free(Pointer ptr) { + auto err = cudaFree(ptr); + // std::cout << "free" << ((err == cudaSuccess) ? " ok" : " err") <(); +#else + std::cout << "testing posix" << std::endl; + go(); +#endif + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cu b/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cu new file mode 100644 index 0000000000000..92511a3ec0f53 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cu @@ -0,0 +1 @@ +#include "testSimplePoolAllocator.cpp" diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 135254fa6e9f2..2d6bf12e665df 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -13,6 +13,8 @@ #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" +// #define GPU_DEBUG + namespace { __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, pixelCPEforGPU::ParamsOnGPU const* cpeParams, @@ -42,10 +44,21 @@ namespace pixelgpudetails { cudaStream_t stream) const { auto nHits = clusters_d.nClusters(); - TrackingRecHit2DGPU hits_d( - nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + TrackingRecHit2DGPU hits_d(nHits, + isPhase2, + clusters_d.offsetBPIX2(), + cpeParams, + clusters_d.clusModuleStart(), + memoryPool::onDevice, + stream); + + assert(hits_d.view()); assert(hits_d.nMaxModules() == isPhase2 ? 
phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); +#endif int activeModulesWithDigis = digis_d.nModules(); // protect from empty events diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc index 7ff2da5552e6d..28c912f9da6c3 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc @@ -47,8 +47,8 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer uint32_t nHits_; uint32_t nMaxModules_; - cms::cuda::host::unique_ptr store32_; - cms::cuda::host::unique_ptr hitsModuleStart_; + memoryPool::Buffer store32_; + memoryPool::Buffer hitsModuleStart_; }; SiPixelRecHitFromCUDA::SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig) @@ -84,7 +84,7 @@ void SiPixelRecHitFromCUDA::acquire(edm::Event const& iEvent, } void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) { - // allocate a buffer for the indices of the clusters + // allocate a Buffer for the indices of the clusters auto hmsp = std::make_unique(nMaxModules_ + 1); SiPixelRecHitCollection output; @@ -98,7 +98,7 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e output.reserve(nMaxModules_, nHits_); std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); - // wrap the buffer in a HostProduct, and move it to the Event, without reallocating the buffer or affecting hitsModuleStart + // wrap the Buffer in a HostProduct, and move it to the Event, without reallocating the Buffer or affecting hitsModuleStart iEvent.emplace(hostPutToken_, std::move(hmsp)); auto xl = store32_.get(); @@ -124,8 +124,8 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e const PixelGeomDetUnit* pixDet = dynamic_cast(genericDet); assert(pixDet); SiPixelRecHitCollection::FastFiller recHitsOnDetUnit(output, detid); - auto fc = hitsModuleStart_[gind]; - auto lc = hitsModuleStart_[gind + 1]; + auto fc = hitsModuleStart_.get()[gind]; + auto lc = hitsModuleStart_.get()[gind + 1]; auto nhits = lc - fc; assert(lc > fc); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc index fda418320e70a..71e775056d150 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc @@ -45,9 +45,9 @@ class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer store32_; - cms::cuda::host::unique_ptr store16_; - cms::cuda::host::unique_ptr hitsModuleStart_; + memoryPool::Buffer store32_; + memoryPool::Buffer store16_; + memoryPool::Buffer hitsModuleStart_; }; SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig) @@ -89,3 +89,25 @@ void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const } DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA); + +// #define FINAL_POOL_DUMP +#ifdef FINAL_POOL_DUMP + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h" + +namespace { + + struct FinalPoolDump { + ~FinalPoolDump() { + std::cout << "Final Pool Dump\n==== ==== ====\n\n Posix Pool" << std::endl; + 
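+      // dumps the CPU (posix) pool first, then the device and host pools
+      // via memoryPool::cuda::dumpStat()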
((SimplePoolAllocatorImpl*)memoryPool::cuda::getPool(memoryPool::onCPU))->dumpStat(); + memoryPool::cuda::dumpStat(); + } + }; + + FinalPoolDump dump; + +} // namespace +#endif diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index d23ecec66fea0..9f9e067734ccc 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -157,7 +157,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1) auto output = std::make_unique( - numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); + numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, memoryPool::onCPU, nullptr); assert(output->nMaxModules() == uint32_t(nMaxModules)); if (0 == numberOfClusters) { diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index c5f8c0949dd08..21202f3d21b65 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -61,7 +61,7 @@ siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA( cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous")), + cms.PSet(type = cms.string("TrackingRecHit2DHeterogeneous")), cms.PSet(type = cms.string("uintAsHostProduct")) )), ) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index edbcd44eeeef5..473263b5add9d 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -1,7 +1,6 @@ #include #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" @@ -15,6 +14,7 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" // Switch on to enable checks and printout for found tracks // #define PIXEL_DEBUG_PRODUCE @@ -35,7 +35,7 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr soa_; + PixelTrackHeterogeneous soa_; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) @@ -56,7 +56,8 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - soa_ = inputData.toHostAsync(ctx.stream()); + soa_ = memoryPool::cuda::makeBuffer(1, ctx.stream(), memoryPool::onHost); + memoryPool::cuda::copy(soa_, inputData, 1, ctx.stream()); } void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { @@ -79,9 +80,9 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i 
#endif // DO NOT make a copy (actually TWO....) - iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(soa_))); + iEvent.emplace(tokenSOA_, std::move(soa_)); - assert(!soa_); + assert(!soa_.get()); } DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index d99a96b705451..e7d20fb0d8158 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,5 +1,6 @@ + #include "BrokenLineFitOnGPU.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, uint32_t hitsInFit, @@ -11,13 +12,15 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto tkidGPU = cms::cuda::make_device_unique(maxNumberOfConcurrentFits_, stream); - auto hitsGPU = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), stream); - auto hits_geGPU = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float), stream); - auto fast_fit_resultsGPU = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); + memoryPool::Deleter deleter = + memoryPool::Deleter(std::make_shared(stream, memoryPool::onDevice)); + auto tkidGPU = memoryPool::cuda::makeBuffer(maxNumberOfConcurrentFits_, deleter); + auto hitsGPU = memoryPool::cuda::makeBuffer( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), deleter); + auto hits_geGPU = memoryPool::cuda::makeBuffer( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float), deleter); + auto fast_fit_resultsGPU = memoryPool::cuda::makeBuffer( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), deleter); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets @@ -127,6 +130,9 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); } - +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif } // loop on concurrent fits } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 66208debdc98d..d48af27a329be 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -22,18 +22,16 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // use "nhits" to heuristically dimension the workspace - - // no need to use the Traits allocations, since we know this is being compiled for the CPU - //device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); - device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); + memoryPool::Deleter const &deleter = device_storage_.deleter(); + device_isOuterHitOfCell_ = + memoryPool::cuda::makeBuffer(std::max(1U, nhits), deleter); assert(device_isOuterHitOfCell_.get()); isOuterHitOfCell_ = 
GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks); // no need to use the Traits allocations, since we know this is being compiled for the CPU - //cellStorage_ = Traits::template make_unique(cellStorageSize, stream); - cellStorage_ = std::make_unique(cellStorageSize); + cellStorage_ = memoryPool::cuda::makeBuffer(cellStorageSize, deleter); device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors)); @@ -45,9 +43,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_theCellTracks_.get(), device_theCellTracksContainer_); - // no need to use the Traits allocations, since we know this is being compiled for the CPU - //device_theCells_ = Traits::template make_unique(params_.maxNumberOfDoublets_, stream); - device_theCells_ = std::make_unique(params_.maxNumberOfDoublets_); + device_theCells_ = memoryPool::cuda::makeBuffer(params_.maxNumberOfDoublets_, deleter); if (0 == nhits) return; // protect against empty events diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 913b6d5a32d28..e29b15fc80e53 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -135,8 +135,6 @@ template <> void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { int32_t nhits = hh.nHits(); - isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; - #ifdef NTUPLE_DEBUG std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; #endif @@ -147,16 +145,17 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... 
- device_isOuterHitOfCell_ = cms::cuda::make_device_unique( - std::max(1, nhits - hh.offsetBPIX2()), stream); + memoryPool::Deleter const &deleter = device_storage_.deleter(); + device_isOuterHitOfCell_ = + memoryPool::cuda::makeBuffer(std::max(1, nhits), deleter); assert(device_isOuterHitOfCell_.get()); isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; - cellStorage_ = cms::cuda::make_device_unique( + cellStorage_ = memoryPool::cuda::makeBuffer( caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks), - stream); + deleter); device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors)); @@ -174,7 +173,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr cudaCheck(cudaGetLastError()); } - device_theCells_ = cms::cuda::make_device_unique(params_.maxNumberOfDoublets_, stream); + device_theCells_ = memoryPool::cuda::makeBuffer(params_.maxNumberOfDoublets_, deleter); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -325,6 +324,9 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG + //std::cout << "sync stream " << cudaStream << std::endl; + //cudaStreamSynchronize(cudaStream); + //std::cout << "sync stream done " << cudaStream << std::endl; cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 8af1176fe92c6..6c1abc5bb5b10 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -6,6 +6,8 @@ #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "GPUCACell.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" + // #define DUMP_GPU_TK_TUPLES namespace cAHitNtupletGenerator { @@ -153,21 +155,21 @@ namespace cAHitNtupletGenerator { } // namespace cAHitNtupletGenerator -template +template class CAHitNtupletGeneratorKernels { public: - using Traits = TTraits; + static constexpr memoryPool::Where where = w; using QualityCuts = cAHitNtupletGenerator::QualityCuts; using Params = cAHitNtupletGenerator::Params; using Counters = cAHitNtupletGenerator::Counters; template - using unique_ptr = typename Traits::template unique_ptr; + using Buffer = memoryPool::Buffer; using HitsView = TrackingRecHit2DSOAView; using HitsOnGPU = TrackingRecHit2DSOAView; - using HitsOnCPU = TrackingRecHit2DHeterogeneous; + using HitsOnCPU = TrackingRecHit2DHeterogeneous; using HitToTuple = caConstants::HitToTuple; using TupleMultiplicity = caConstants::TupleMultiplicity; @@ -197,28 +199,28 @@ class CAHitNtupletGeneratorKernels { Counters* counters_ = nullptr; // workspace - unique_ptr cellStorage_; - unique_ptr device_theCellNeighbors_; + Buffer cellStorage_; + Buffer device_theCellNeighbors_; caConstants::CellNeighbors* device_theCellNeighborsContainer_; - unique_ptr device_theCellTracks_; + Buffer device_theCellTracks_; caConstants::CellTracks* device_theCellTracksContainer_; - unique_ptr device_theCells_; - unique_ptr device_isOuterHitOfCell_; + Buffer device_theCells_; + Buffer 
device_isOuterHitOfCell_; GPUCACell::OuterHitOfCell isOuterHitOfCell_; uint32_t* device_nCells_ = nullptr; - unique_ptr device_hitToTuple_; - unique_ptr device_hitToTupleStorage_; + Buffer device_hitToTuple_; + Buffer device_hitToTupleStorage_; HitToTuple::View hitToTupleView_; cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr; cms::cuda::AtomicPairCounter* device_hitTuple_apc_ = nullptr; - unique_ptr device_tupleMultiplicity_; + Buffer device_tupleMultiplicity_; - unique_ptr device_storage_; + Buffer device_storage_; // params Params const& params_; /// Intermediate result avoiding repeated computations. @@ -236,7 +238,7 @@ class CAHitNtupletGeneratorKernels { } }; -using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; -using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc index 5978ef8851c73..cec96faab2c86 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc @@ -12,31 +12,34 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - device_theCellNeighbors_ = Traits::template make_unique(stream); - device_theCellTracks_ = Traits::template make_unique(stream); + memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared(stream, where)); + + device_theCellNeighbors_ = memoryPool::cuda::makeBuffer(1, deleter); + device_theCellTracks_ = memoryPool::cuda::makeBuffer(1, deleter); #ifdef GPU_DEBUG - std::cout << "Allocation for tuple building. N hits " << nHits << std::endl; + std::cout << "Allocation for tuple building. N hits " << nHits + << ((where == memoryPool::onDevice) ? 
" on GPU" : " on CPU") << std::endl; #endif nHits++; // storage requires one more counter; assert(nHits > 0); - device_hitToTuple_ = Traits::template make_unique(stream); - device_hitToTupleStorage_ = Traits::template make_unique(nHits, stream); + device_hitToTuple_ = memoryPool::cuda::makeBuffer(1, deleter); + device_hitToTupleStorage_ = memoryPool::cuda::makeBuffer(nHits, deleter); hitToTupleView_.assoc = device_hitToTuple_.get(); hitToTupleView_.offStorage = device_hitToTupleStorage_.get(); hitToTupleView_.offSize = nHits; - device_tupleMultiplicity_ = Traits::template make_unique(stream); + device_tupleMultiplicity_ = memoryPool::cuda::makeBuffer(1, deleter); - device_storage_ = Traits::template make_unique(3, stream); + device_storage_ = memoryPool::cuda::makeBuffer(3, deleter); device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get(); device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; device_nCells_ = (uint32_t*)(device_storage_.get() + 2); // FIXME: consider collapsing these 3 in one adhoc kernel - if constexpr (std::is_same::value) { + if constexpr (where == memoryPool::onDevice) { cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index a554d364ee42b..2d61797b47fcf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -185,7 +185,8 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); + auto where = memoryPool::onDevice; + PixelTrackHeterogeneous tracks = memoryPool::cuda::makeBuffer(1, stream, where); auto* soa = tracks.get(); assert(soa); @@ -216,7 +217,8 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH } PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(std::make_unique()); + auto where = memoryPool::onCPU; + PixelTrackHeterogeneous tracks = memoryPool::cuda::makeBuffer(1, nullptr, where); auto* soa = tracks.get(); assert(soa); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc index dc125878b1058..205c84bbf876a 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc @@ -16,6 +16,7 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" class PixelVertexSoAFromCUDA : public edm::stream::EDProducer { public: @@ -33,7 +34,7 @@ class PixelVertexSoAFromCUDA : public edm::stream::EDProducer edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr m_soa; + ZVertexHeterogeneous soa_; }; PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig) @@ -54,12 +55,13 @@ void 
PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - m_soa = inputData.toHostAsync(ctx.stream()); + soa_ = memoryPool::cuda::makeBuffer(1, ctx.stream(), memoryPool::onHost); + memoryPool::cuda::copy(soa_, inputData, 1, ctx.stream()); } void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // No copies.... - iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); + iEvent.emplace(tokenSOA_, std::move(soa_)); } DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 20b007d2d029f..f1c652b68c61e 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -1,4 +1,5 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" #include "gpuClusterTracksByDensity.h" #include "gpuClusterTracksDBSCAN.h" @@ -96,26 +97,28 @@ namespace gpuVertexFinder { #ifdef __CUDACC__ ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const { -#ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "producing Vertices on GPU" << std::endl; -#endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); + auto where = memoryPool::onDevice; #else ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin, float ptMax) const { + cudaStream_t stream = nullptr; + auto where = memoryPool::onCPU; +#endif #ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "producing Vertices on CPU" << std::endl; -#endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(std::make_unique()); +#ifdef __CUDACC__ + auto whereName = "GPU"; +#else + auto whereName = "CPU"; #endif + std::cout << "producing Vertices on " << whereName << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + + ZVertexHeterogeneous vertices = memoryPool::cuda::makeBuffer(1, stream, where); + assert(tksoa); auto* soa = vertices.get(); assert(soa); -#ifdef __CUDACC__ - auto ws_d = cms::cuda::make_device_unique(stream); -#else - auto ws_d = std::make_unique(); -#endif + auto ws_d = memoryPool::cuda::makeBuffer(1, stream, where); #ifdef __CUDACC__ init<<<1, 1, 0, stream>>>(soa, ws_d.get()); diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5f8a0646c726a..b3a2886e2447f 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -7,6 +7,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" #ifdef USE_DBSCAN #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h" #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN @@ -114,235 +115,248 @@ __global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder:: int main() { #ifdef __CUDACC__ cms::cudatest::requireDevices(); - - auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); - auto ws_d = 
cms::cuda::make_device_unique(1, nullptr); + auto where = memoryPool::onDevice; #else - auto onGPU_d = std::make_unique(); - auto ws_d = std::make_unique(); + auto where = memoryPool::onCPU; #endif - Event ev; + memoryPool::cuda::init(false); + + { + cudaStream_t stream = nullptr; // here works as nothing is supposed to be deleted before synch... + + auto onGPU_d = memoryPool::cuda::makeBuffer(1, stream, where); + auto ws_d = memoryPool::cuda::makeBuffer(1, stream, where); + + Event ev; - float eps = 0.1f; - std::array par{{eps, 0.01f, 9.0f}}; - for (int nav = 30; nav < 80; nav += 20) { - ClusterGenerator gen(nav, 10); + float eps = 0.1f; + std::array par{{eps, 0.01f, 9.0f}}; + for (int nav = 30; nav < 80; nav += 20) { + ClusterGenerator gen(nav, 10); - for (int i = 8; i < 20; ++i) { - auto kk = i / 4; // M param + for (int i = 8; i < 20; ++i) { + auto kk = i / 4; // M param - gen(ev); + gen(ev); #ifdef __CUDACC__ - init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); #else - onGPU_d->init(); - ws_d->init(); + onGPU_d->init(); + ws_d->init(); #endif - std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; - auto nt = ev.ztrack.size(); + std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; + auto nt = ev.ztrack.size(); #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); #else - ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); - ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); - ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); - ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); + ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); + ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); #endif - std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; + std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; - if ((i % 4) == 0) - par = {{eps, 0.02f, 12.0f}}; - if ((i % 4) == 1) - par = {{eps, 0.02f, 9.0f}}; - if ((i % 4) == 2) - par = {{eps, 0.01f, 9.0f}}; - if ((i % 4) == 3) - par = {{0.7f * eps, 0.01f, 9.0f}}; + if ((i % 4) == 0) + par = {{eps, 0.02f, 12.0f}}; + if ((i % 4) == 1) + par = {{eps, 0.02f, 9.0f}}; + if ((i % 4) == 2) + par = {{eps, 0.01f, 9.0f}}; + if ((i % 4) == 3) + par = {{0.7f * eps, 0.01f, 9.0f}}; - uint32_t nv = 0; + uint32_t nv = 0; #ifdef __CUDACC__ - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); - cudaCheck(cudaGetLastError()); - 
cudaDeviceSynchronize(); + print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); #ifdef ONE_KERNEL - cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); #else - cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); #endif - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); - cudaCheck(cudaGetLastError()); - cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); - cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - print(onGPU_d.get(), ws_d.get()); - CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); - print(onGPU_d.get(), ws_d.get()); - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; + print(onGPU_d.get(), ws_d.get()); + CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + print(onGPU_d.get(), ws_d.get()); + fitVertices(onGPU_d.get(), ws_d.get(), 50.f); + nv = onGPU_d->nvFinal; #endif - if (nv == 0) { - std::cout << "NO VERTICES???" << std::endl; - continue; - } + if (nv == 0) { + std::cout << "NO VERTICES???" << std::endl; + continue; + } - float* zv = nullptr; - float* wv = nullptr; - float* ptv2 = nullptr; - int32_t* nn = nullptr; - uint16_t* ind = nullptr; + float* zv = nullptr; + float* wv = nullptr; + float* ptv2 = nullptr; + int32_t* nn = nullptr; + uint16_t* ind = nullptr; - // keep chi2 separated... - float chi2[2 * nv]; // make space for splitting... + // keep chi2 separated... + float chi2[2 * nv]; // make space for splitting... 
#ifdef __CUDACC__ - float hzv[2 * nv]; - float hwv[2 * nv]; - float hptv2[2 * nv]; - int32_t hnn[2 * nv]; - uint16_t hind[2 * nv]; - - zv = hzv; - wv = hwv; - ptv2 = hptv2; - nn = hnn; - ind = hind; + float hzv[2 * nv]; + float hwv[2 * nv]; + float hptv2[2 * nv]; + int32_t hnn[2 * nv]; + uint16_t hind[2 * nv]; + + zv = hzv; + wv = hwv; + ptv2 = hptv2; + nn = hnn; + ind = hind; #else - zv = onGPU_d->zv; - wv = onGPU_d->wv; - ptv2 = onGPU_d->ptv2; - nn = onGPU_d->ndof; - ind = onGPU_d->sortInd; + zv = onGPU_d->zv; + wv = onGPU_d->wv; + ptv2 = onGPU_d->ptv2; + nn = onGPU_d->ndof; + ind = onGPU_d->sortInd; #endif #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif - for (auto j = 0U; j < nv; ++j) - if (nn[j] > 0) - chi2[j] /= float(nn[j]); - { - auto mx = std::minmax_element(chi2, chi2 + nv); - std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; - } + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + fitVertices(onGPU_d.get(), ws_d.get(), 50.f); + nv = onGPU_d->nvFinal; + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif - for (auto j = 0U; j < nv; ++j) - if (nn[j] > 0) - chi2[j] /= float(nn[j]); - { - auto mx = std::minmax_element(chi2, chi2 + nv); - std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; - } + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } #ifdef __CUDACC__ - // one vertex per block!!! - cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); - cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + // one vertex per block!!! 
+        cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+        cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
-      nv = ws_d->nvIntermediate;
+        splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
+        nv = ws_d->nvIntermediate;
 #endif

-      std::cout << "after split " << nv << std::endl;
+        std::cout << "after split " << nv << std::endl;

 #ifdef __CUDACC__
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
-      cudaCheck(cudaGetLastError());
+        cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+        cudaCheck(cudaGetLastError());

-      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
-      cudaCheck(cudaGetLastError());
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+        cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+        cudaCheck(cudaGetLastError());
+        cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
-      sortByPt2(onGPU_d.get(), ws_d.get());
-      nv = onGPU_d->nvFinal;
-      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+        fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
+        sortByPt2(onGPU_d.get(), ws_d.get());
+        nv = onGPU_d->nvFinal;
+        memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
 #endif

-      if (nv == 0) {
-        std::cout << "NO VERTICES???" << std::endl;
-        continue;
-      }
+        if (nv == 0) {
+          std::cout << "NO VERTICES???" << std::endl;
+          continue;
+        }

 #ifdef __CUDACC__
-      cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
 #endif

-      for (auto j = 0U; j < nv; ++j)
-        if (nn[j] > 0)
-          chi2[j] /= float(nn[j]);
-      {
-        auto mx = std::minmax_element(chi2, chi2 + nv);
-        std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
-      }
+        for (auto j = 0U; j < nv; ++j)
+          if (nn[j] > 0)
+            chi2[j] /= float(nn[j]);
+        {
+          auto mx = std::minmax_element(chi2, chi2 + nv);
+          std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+        }

-      {
-        auto mx = std::minmax_element(wv, wv + nv);
-        std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) << std::endl;
-      }
+        {
+          auto mx = std::minmax_element(wv, wv + nv);
+          std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) << std::endl;
+        }

-      {
-        auto mx = std::minmax_element(ptv2, ptv2 + nv);
-        std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl;
-        std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' '
-                  << ind[nv - 1] << std::endl;
-      }
+        {
+          auto mx = std::minmax_element(ptv2, ptv2 + nv);
+          std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl;
+          std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' '
+                    << ind[nv - 1] << std::endl;
+        }

-      float dd[nv];
-      for (auto kv = 0U; kv < nv; ++kv) {
-        auto zr = zv[kv];
-        auto md = 500.0f;
-        for (auto zs : ev.ztrack) {
-          auto d = std::abs(zr - zs);
-          md = std::min(d, md);
+        float dd[nv];
+        for (auto kv = 0U; kv < nv; ++kv) {
+          auto zr = zv[kv];
+          auto md = 500.0f;
+          for (auto zs : ev.ztrack) {
+            auto d = std::abs(zr - zs);
+            md = std::min(d, md);
+          }
+          dd[kv] = md;
         }
-        dd[kv] = md;
-      }
-      if (i == 6) {
+        if (i == 6) {
+          for (auto d : dd)
+            std::cout << d << ' ';
+          std::cout << std::endl;
+        }
+        auto mx = std::minmax_element(dd, dd + nv);
+        float rms = 0;
         for (auto d : dd)
-          std::cout << d << ' ';
-        std::cout << std::endl;
-      }
-      auto mx = std::minmax_element(dd, dd + nv);
-      float rms = 0;
-      for (auto d : dd)
-        rms += d * d;
-      rms = std::sqrt(rms) / (nv - 1);
-      std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
-
-    }  // loop on events
-  }    // lopp on ave vert
+          rms += d * d;
+        rms = std::sqrt(rms) / (nv - 1);
+        std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
+
+      }  // loop on events
+    }    // lopp on ave vert
+  }
+
+#ifdef __CUDACC__
+  memoryPool::cuda::dumpStat();
+#endif
+  memoryPool::cuda::shutdown();

   return 0;
 }
diff --git a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc
index 8b0de1c739076..3b0c42a78fe65 100644
--- a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc
+++ b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc
@@ -31,7 +31,7 @@ namespace {
     BeamSpotPOD* data() { return data_h_.get(); }
     BeamSpotPOD const* data() const { return data_h_.get(); }

-    cms::cuda::host::noncached::unique_ptr<BeamSpotPOD>& ptr() { return data_h_; }
+    auto& ptr() { return data_h_; }
     cms::cuda::host::noncached::unique_ptr<BeamSpotPOD> const& ptr() const { return data_h_; }

   private:
@@ -93,7 +93,7 @@ void BeamSpotToCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const e
   bsHost->betaStar = bs.betaStar();

   BeamSpotCUDA bsDevice(ctx.stream());
-  cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream());
+  cudaCheck(cudaMemcpyAsync(bsDevice.data(), bsHost.get(), sizeof(BeamSpotPOD), cudaMemcpyHostToDevice, ctx.stream()));

   ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice));
 }
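For context on the final hunk: since BeamSpotCUDA now holds a memoryPool::Buffer instead of a cms::cuda::device::unique_ptr, the generic cms::cuda::copyAsync helper (written against the unique_ptr types) presumably no longer matches, so the producer issues the host-to-device copy directly with cudaMemcpyAsync. The following is a minimal, self-contained sketch of that copy pattern, not the CMSSW code itself: BeamSpotPOD is reduced to a stand-in struct, check() stands in for the cudaCheck macro, and the stream is created locally rather than taken from ctx.stream().

// Sketch of the raw async host-to-device copy pattern adopted by the patch.
// Assumptions: simplified BeamSpotPOD; check() replaces cudaCheck.
#include <cassert>
#include <cuda_runtime.h>

struct BeamSpotPOD {  // stand-in: the real POD carries the full beam-spot data
  float x, y, z;
};

static void check(cudaError_t err) { assert(err == cudaSuccess); }

int main() {
  cudaStream_t stream;
  check(cudaStreamCreate(&stream));

  // Pinned (page-locked) host memory, as the producer uses, lets the
  // cudaMemcpyAsync below actually overlap with other work on the stream.
  BeamSpotPOD* bsHost = nullptr;
  check(cudaMallocHost(&bsHost, sizeof(BeamSpotPOD)));
  *bsHost = {0.1f, -0.2f, 1.5f};  // would be filled from the event in CMSSW

  BeamSpotPOD* bsDevice = nullptr;
  check(cudaMalloc(&bsDevice, sizeof(BeamSpotPOD)));

  // The pattern from the hunk: explicit async H2D copy on the given stream.
  check(cudaMemcpyAsync(bsDevice, bsHost, sizeof(BeamSpotPOD), cudaMemcpyHostToDevice, stream));
  check(cudaStreamSynchronize(stream));

  check(cudaFree(bsDevice));
  check(cudaFreeHost(bsHost));
  check(cudaStreamDestroy(stream));
  return 0;
}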