diff --git a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
index 7b04fac67b9f1..7faa6224959e7 100644
--- a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
+++ b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
@@ -4,15 +4,19 @@
 #include <cuda_runtime.h>
 
 #include "DataFormats/BeamSpot/interface/BeamSpotPOD.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
 
 class BeamSpotCUDA {
 public:
+  using Buffer = memoryPool::Buffer<BeamSpotPOD>;
+
   // default constructor, required by cms::cuda::Product<BeamSpotCUDA>
   BeamSpotCUDA() = default;
 
   // constructor that allocates cached device memory on the given CUDA stream
-  BeamSpotCUDA(cudaStream_t stream) { data_d_ = cms::cuda::make_device_unique<BeamSpotPOD>(stream); }
+  BeamSpotCUDA(cudaStream_t stream) {
+    data_d_ = memoryPool::cuda::makeBuffer<BeamSpotPOD>(1, stream, memoryPool::onDevice);
+  }
 
   // movable, non-copiable
   BeamSpotCUDA(BeamSpotCUDA const&) = delete;
@@ -23,11 +27,11 @@ class BeamSpotCUDA {
   BeamSpotPOD* data() { return data_d_.get(); }
   BeamSpotPOD const* data() const { return data_d_.get(); }
 
-  cms::cuda::device::unique_ptr<BeamSpotPOD>& ptr() { return data_d_; }
-  cms::cuda::device::unique_ptr<BeamSpotPOD> const& ptr() const { return data_d_; }
+  Buffer& ptr() { return data_d_; }
+  Buffer const& ptr() const { return data_d_; }
 
 private:
-  cms::cuda::device::unique_ptr<BeamSpotPOD> data_d_;
+  Buffer data_d_;
 };
 
 #endif  // CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h
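For orientation, this is the pattern the whole patch converges on: a memoryPool::Buffer<T> replaces the cached device/host unique_ptr, and the allocation names the element count, the stream and the destination explicitly. A minimal sketch, using only the API introduced later in this patch:

#include <cuda_runtime.h>
#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

void sketch(cudaStream_t stream) {
  // 256 floats from the device pool, tied to `stream`
  auto buf = memoryPool::cuda::makeBuffer<float>(256, stream, memoryPool::onDevice);
  cudaMemsetAsync(buf.get(), 0, 256 * sizeof(float), stream);
}  // ~Buffer: the slot is queued for reuse once the work on `stream` completes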
diff --git a/CUDADataFormats/Common/interface/HeterogeneousSoA.h b/CUDADataFormats/Common/interface/HeterogeneousSoA.h
deleted file mode 100644
index 3f2a551bc320f..0000000000000
--- a/CUDADataFormats/Common/interface/HeterogeneousSoA.h
+++ /dev/null
@@ -1,189 +0,0 @@
-#ifndef CUDADataFormatsCommonHeterogeneousSoA_H
-#define CUDADataFormatsCommonHeterogeneousSoA_H
-
-#include <cassert>
-
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-
-// a heterogeneous unique pointer...
-template <typename T>
-class HeterogeneousSoA {
-public:
-  using Product = T;
-
-  HeterogeneousSoA() = default;  // make root happy
-  ~HeterogeneousSoA() = default;
-  HeterogeneousSoA(HeterogeneousSoA &&) = default;
-  HeterogeneousSoA &operator=(HeterogeneousSoA &&) = default;
-
-  explicit HeterogeneousSoA(cms::cuda::device::unique_ptr<T> &&p) : dm_ptr(std::move(p)) {}
-  explicit HeterogeneousSoA(cms::cuda::host::unique_ptr<T> &&p) : hm_ptr(std::move(p)) {}
-  explicit HeterogeneousSoA(std::unique_ptr<T> &&p) : std_ptr(std::move(p)) {}
-
-  auto const *get() const { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
-
-  auto const &operator*() const { return *get(); }
-
-  auto const *operator->() const { return get(); }
-
-  auto *get() { return dm_ptr ? dm_ptr.get() : (hm_ptr ? hm_ptr.get() : std_ptr.get()); }
-
-  auto &operator*() { return *get(); }
-
-  auto *operator->() { return get(); }
-
-  // in reality valid only for GPU version...
-  cms::cuda::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const {
-    assert(dm_ptr);
-    auto ret = cms::cuda::make_host_unique<T>(stream);
-    cudaCheck(cudaMemcpyAsync(ret.get(), dm_ptr.get(), sizeof(T), cudaMemcpyDefault, stream));
-    return ret;
-  }
-
-private:
-  // a union won't do it, a variant will not be more efficient
-  cms::cuda::device::unique_ptr<T> dm_ptr;  //!
-  cms::cuda::host::unique_ptr<T> hm_ptr;    //!
-  std::unique_ptr<T> std_ptr;               //!
-};
-
-namespace cms {
-  namespace cudacompat {
-
-    struct GPUTraits {
-      template <typename T>
-      using unique_ptr = cms::cuda::device::unique_ptr<T>;
-
-      template <typename T>
-      static auto make_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
-      }
-
-      template <typename T>
-      static auto make_host_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
-      }
-    };
-
-    struct HostTraits {
-      template <typename T>
-      using unique_ptr = cms::cuda::host::unique_ptr<T>;
-
-      template <typename T>
-      static auto make_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_host_unique(cudaStream_t stream) {
-        return cms::cuda::make_host_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(stream);
-      }
-
-      template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t stream) {
-        return cms::cuda::make_device_unique<T>(size, stream);
-      }
-    };
-
-    struct CPUTraits {
-      template <typename T>
-      using unique_ptr = std::unique_ptr<T>;
-
-      template <typename T>
-      static auto make_unique(cudaStream_t) {
-        return std::make_unique<T>();
-      }
-
-      template <typename T>
-      static auto make_unique(size_t size, cudaStream_t) {
-        return std::make_unique<T>(size);
-      }
-
-      template <typename T>
-      static auto make_host_unique(cudaStream_t) {
-        return std::make_unique<T>();
-      }
-
-      template <typename T>
-      static auto make_device_unique(cudaStream_t) {
-        return std::make_unique<T>();
-      }
-
-      template <typename T>
-      static auto make_device_unique(size_t size, cudaStream_t) {
-        return std::make_unique<T>(size);
-      }
-    };
-
-  }  // namespace cudacompat
-}  // namespace cms
-
-// a heterogeneous unique pointer (of a different sort) ...
-template <typename T, typename Traits>
-class HeterogeneousSoAImpl {
-public:
-  template <typename V>
-  using unique_ptr = typename Traits::template unique_ptr<V>;
-
-  HeterogeneousSoAImpl() = default;  // make root happy
-  ~HeterogeneousSoAImpl() = default;
-  HeterogeneousSoAImpl(HeterogeneousSoAImpl &&) = default;
-  HeterogeneousSoAImpl &operator=(HeterogeneousSoAImpl &&) = default;
-
-  explicit HeterogeneousSoAImpl(unique_ptr<T> &&p) : m_ptr(std::move(p)) {}
-  explicit HeterogeneousSoAImpl(cudaStream_t stream);
-
-  T const *get() const { return m_ptr.get(); }
-
-  T *get() { return m_ptr.get(); }
-
-  cms::cuda::host::unique_ptr<T> toHostAsync(cudaStream_t stream) const;
-
-private:
-  unique_ptr<T> m_ptr;  //!
-};
-
-template <typename T, typename Traits>
-HeterogeneousSoAImpl<T, Traits>::HeterogeneousSoAImpl(cudaStream_t stream) {
-  m_ptr = Traits::template make_unique<T>(stream);
-}
-
-// in reality valid only for GPU version...
-template <typename T, typename Traits>
-cms::cuda::host::unique_ptr<T> HeterogeneousSoAImpl<T, Traits>::toHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<T>(stream);
-  cudaCheck(cudaMemcpyAsync(ret.get(), get(), sizeof(T), cudaMemcpyDefault, stream));
-  return ret;
-}
-
-template <typename T>
-using HeterogeneousSoAGPU = HeterogeneousSoAImpl<T, cms::cudacompat::GPUTraits>;
-template <typename T>
-using HeterogeneousSoACPU = HeterogeneousSoAImpl<T, cms::cudacompat::CPUTraits>;
-template <typename T>
-using HeterogeneousSoAHost = HeterogeneousSoAImpl<T, cms::cudacompat::HostTraits>;
-
-#endif
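The compile-time Traits triplet deleted above is replaced throughout this patch by a runtime destination, the memoryPool::Where enum introduced below (GPUTraits roughly maps to onDevice, HostTraits to onHost, CPUTraits to onCPU). Schematically:

// before (compile-time traits):
//   m_store32 = Traits::template make_unique<float[]>(n, stream);
// after (runtime destination):
//   m_store32 = memoryPool::cuda::makeBuffer<float>(n, stream, where);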
diff --git a/CUDADataFormats/Common/interface/HostProduct.h b/CUDADataFormats/Common/interface/HostProduct.h
index 63a152298e42b..5622f3e038b5c 100644
--- a/CUDADataFormats/Common/interface/HostProduct.h
+++ b/CUDADataFormats/Common/interface/HostProduct.h
@@ -1,7 +1,7 @@
 #ifndef CUDADataFormatsCommonHostProduct_H
 #define CUDADataFormatsCommonHostProduct_H
 
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include <memory>
 
 // a heterogeneous unique pointer...
 template <typename T>
@@ -12,18 +12,16 @@ class HostProduct {
   HostProduct(HostProduct&&) = default;
   HostProduct& operator=(HostProduct&&) = default;
 
-  explicit HostProduct(cms::cuda::host::unique_ptr<T>&& p) : hm_ptr(std::move(p)) {}
   explicit HostProduct(std::unique_ptr<T>&& p) : std_ptr(std::move(p)) {}
 
-  auto const* get() const { return hm_ptr ? hm_ptr.get() : std_ptr.get(); }
+  auto const* get() const { return std_ptr.get(); }
 
   auto const& operator*() const { return *get(); }
 
   auto const* operator->() const { return get(); }
 
 private:
-  cms::cuda::host::unique_ptr<T> hm_ptr;  //!
-  std::unique_ptr<T> std_ptr;             //!
+  std::unique_ptr<T> std_ptr;  //!
 };
 
 #endif
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h
index eff550feeb22e..a486a97668163 100644
--- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h
+++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h
@@ -5,16 +5,15 @@
 
 #include "DataFormats/SiPixelRawData/interface/SiPixelErrorCompact.h"
 #include "DataFormats/SiPixelRawData/interface/SiPixelFormatterErrors.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/SimpleVector.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 class SiPixelDigiErrorsCUDA {
 public:
   using SiPixelErrorCompactVector = cms::cuda::SimpleVector<SiPixelErrorCompact>;
 
   SiPixelDigiErrorsCUDA() = default;
-  explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream);
+  SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream);
   ~SiPixelDigiErrorsCUDA() = default;
 
   SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete;
@@ -27,16 +26,16 @@ class SiPixelDigiErrorsCUDA {
   SiPixelErrorCompactVector* error() { return error_d.get(); }
   SiPixelErrorCompactVector const* error() const { return error_d.get(); }
 
-  using HostDataError = std::pair<SiPixelErrorCompactVector, cms::cuda::host::unique_ptr<SiPixelErrorCompact[]>>;
+  using HostDataError = std::pair<SiPixelErrorCompactVector, memoryPool::Buffer<SiPixelErrorCompact>>;
   HostDataError dataErrorToHostAsync(cudaStream_t stream) const;
 
   void copyErrorToHostAsync(cudaStream_t stream);
 
   int nErrorWords() const { return nErrorWords_; }
 
 private:
-  cms::cuda::device::unique_ptr<SiPixelErrorCompact[]> data_d;
-  cms::cuda::device::unique_ptr<SiPixelErrorCompactVector> error_d;
-  cms::cuda::host::unique_ptr<SiPixelErrorCompactVector> error_h;
+  memoryPool::Buffer<SiPixelErrorCompact> data_d;
+  memoryPool::Buffer<SiPixelErrorCompactVector> error_d;
+  memoryPool::Buffer<SiPixelErrorCompactVector> error_h;
   SiPixelFormatterErrors formatterErrors_h;
   int nErrorWords_ = 0;
 };
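The HostDataError pair couples the SimpleVector (which describes the data) with the host Buffer that owns its storage. A sketch of the intended consumer, assuming the interfaces as reconstructed above; the stream must be synchronized before the host copy is read:

void readErrors(SiPixelDigiErrorsCUDA const& errors, cudaStream_t stream) {
  auto [vec, data] = errors.dataErrorToHostAsync(stream);  // vec points into `data`
  cudaCheck(cudaStreamSynchronize(stream));
  for (int i = 0; i < vec.size(); ++i) {
    SiPixelErrorCompact const& e = vec[i];
    // ... convert or stash e; `data` keeps the storage alive ...
  }
}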
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
index cf6b51687982f..f4bdc09ee4e8c 100644
--- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
+++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h
@@ -1,18 +1,16 @@
 #ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
 #define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h
 
-#include <cuda_runtime.h>
+#include <cstdint>
 
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h"
 
 class SiPixelDigisCUDA {
 public:
   using StoreType = uint16_t;
 
   SiPixelDigisCUDA() = default;
-  explicit SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream);
+  SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream);
   ~SiPixelDigisCUDA() = default;
 
   SiPixelDigisCUDA(const SiPixelDigisCUDA &) = delete;
@@ -28,14 +26,14 @@ class SiPixelDigisCUDA {
   uint32_t nModules() const { return nModules_h; }
   uint32_t nDigis() const { return nDigis_h; }
 
-  cms::cuda::host::unique_ptr<StoreType[]> copyAllToHostAsync(cudaStream_t stream) const;
+  memoryPool::Buffer<StoreType> copyAllToHostAsync(cudaStream_t stream) const;
 
   SiPixelDigisCUDASOAView view() { return m_view; }
   SiPixelDigisCUDASOAView const view() const { return m_view; }
 
 private:
   // These are consumed by downstream device code
-  cms::cuda::device::unique_ptr<StoreType[]> m_store;
+  memoryPool::Buffer<StoreType> m_store;
 
   SiPixelDigisCUDASOAView m_view;
diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
index 70d00ae584279..f1efeaad2e2f3 100644
--- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
+++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDASOAView.h
@@ -3,8 +3,6 @@
 
 #include <cuda_runtime.h>
 
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
 
 #include <cstdint>
 
@@ -96,11 +94,11 @@ class SiPixelDigisCUDASOAView {
   uint32_t* rawIdArr_;
 
   template <typename ReturnType, typename LocationType, typename StoreType>
-  ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) {
+  static ReturnType* getColumnAddress(LocationType column, StoreType& store, int size) {
     return reinterpret_cast<ReturnType*>(store.get() + static_cast<int>(column) * roundFor128ByteAlignment(size));
   }
 
-  static int roundFor128ByteAlignment(int size) {
+  static constexpr int roundFor128ByteAlignment(int size) {
     constexpr int mul = 128 / sizeof(uint16_t);
     return ((size + mul - 1) / mul) * mul;
   };
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
index e81b1b2b592af..f2459801dba55 100644
--- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
+++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc
@@ -1,40 +1,42 @@
-#include <cassert>
-
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, SiPixelFormatterErrors errors, cudaStream_t stream)
-    : data_d(cms::cuda::make_device_unique<SiPixelErrorCompact[]>(maxFedWords, stream)),
-      error_d(cms::cuda::make_device_unique<SiPixelErrorCompactVector>(stream)),
-      error_h(cms::cuda::make_host_unique<SiPixelErrorCompactVector>(stream)),
-      formatterErrors_h(std::move(errors)),
-      nErrorWords_(maxFedWords) {
+    : formatterErrors_h(std::move(errors)), nErrorWords_(maxFedWords) {
   assert(maxFedWords != 0);
+
+  memoryPool::Deleter deleter =
+      memoryPool::Deleter(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onDevice));
+  assert(deleter.pool());
+
+  data_d = memoryPool::cuda::makeBuffer<SiPixelErrorCompact>(maxFedWords, deleter);
+  error_d = memoryPool::cuda::makeBuffer<SiPixelErrorCompactVector>(1, deleter);
+  error_h = memoryPool::cuda::makeBuffer<SiPixelErrorCompactVector>(1, stream, memoryPool::onHost);
+
+  cudaMemsetAsync(data_d.get(), 0x00, maxFedWords * sizeof(SiPixelErrorCompact), stream);
 
   cms::cuda::make_SimpleVector(error_h.get(), maxFedWords, data_d.get());
   assert(error_h->empty());
   assert(error_h->capacity() == static_cast<int>(maxFedWords));
 
-  cms::cuda::copyAsync(error_d, error_h, stream);
+  cudaCheck(memoryPool::cuda::copy(error_d, error_h, 1, stream));
 }
 
 void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cudaStream_t stream) {
-  cms::cuda::copyAsync(error_h, error_d, stream);
+  cudaCheck(memoryPool::cuda::copy(error_h, error_d, 1, stream));
 }
 
 SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cudaStream_t stream) const {
   // On one hand size() could be sufficient. On the other hand, if
   // someone copies the SimpleVector<>, (s)he might expect the data
-  // buffer to actually have space for capacity() elements.
-  auto data = cms::cuda::make_host_unique<SiPixelErrorCompact[]>(error_h->capacity(), stream);
+  // Buffer to actually have space for capacity() elements.
+  auto data = memoryPool::cuda::makeBuffer<SiPixelErrorCompact>(error_h->capacity(), stream, memoryPool::onHost);
 
   // but transfer only the required amount
   if (not error_h->empty()) {
-    cms::cuda::copyAsync(data, data_d, error_h->size(), stream);
+    cudaCheck(memoryPool::cuda::copy(data, data_d, error_h->size(), stream));
  }
   auto err = *error_h;
   err.set_data(data.get());
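The constructor above threads one Deleter through several makeBuffer calls. With the BundleDelete reconstructed here (see cudaMemoryPool.h below), the buckets are collected and handed back to the pool together, in stream order, once the last owning Buffer is gone. The same pattern in isolation:

memoryPool::Deleter del(std::make_shared<memoryPool::cuda::BundleDelete>(stream, memoryPool::onDevice));
auto a = memoryPool::cuda::makeBuffer<float>(1024, del);  // same pool, same stream
auto b = memoryPool::cuda::makeBuffer<int>(512, del);
// both buckets are queued for reuse together once a, b and del are all destroyed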
diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
index 9a7f8ae8bdad5..db464afb492f4 100644
--- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
+++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc
@@ -1,24 +1,24 @@
 #include <cassert>
 
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 
 SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cudaStream_t stream)
-    : m_store(cms::cuda::make_device_unique<SiPixelDigisCUDA::StoreType[]>(
+    : m_store(memoryPool::cuda::makeBuffer<SiPixelDigisCUDA::StoreType>(
           SiPixelDigisCUDASOAView::roundFor128ByteAlignment(maxFedWords) *
               static_cast<int>(SiPixelDigisCUDASOAView::StorageLocation::kMAX),
-          stream)),
+          stream,
+          memoryPool::onDevice)),
      m_view(m_store, maxFedWords, SiPixelDigisCUDASOAView::StorageLocation::kMAX) {
   assert(maxFedWords != 0);
 }
 
-cms::cuda::host::unique_ptr<SiPixelDigisCUDA::StoreType[]> SiPixelDigisCUDA::copyAllToHostAsync(
-    cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<SiPixelDigisCUDA::StoreType[]>(
+memoryPool::Buffer<SiPixelDigisCUDA::StoreType> SiPixelDigisCUDA::copyAllToHostAsync(cudaStream_t stream) const {
+  auto ret = memoryPool::cuda::makeBuffer<SiPixelDigisCUDA::StoreType>(
       m_view.roundFor128ByteAlignment(nDigis()) * static_cast<int>(SiPixelDigisCUDASOAView::StorageLocationHost::kMAX),
-      stream);
+      stream,
+      memoryPool::onHost);
   cudaCheck(cudaMemcpyAsync(ret.get(),
                             m_view.clus(),
                             m_view.roundFor128ByteAlignment(nDigis()) * sizeof(SiPixelDigisCUDA::StoreType) *
diff --git a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
index 3ee5af80353dd..f791050aa6552 100644
--- a/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
+++ b/CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h
@@ -1,9 +1,9 @@
 #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
 #define CUDADataFormats_Track_PixelTrackHeterogeneous_h
 
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
 
-using PixelTrackHeterogeneous = HeterogeneousSoA<pixelTrack::TrackSoA>;
+using PixelTrackHeterogeneous = memoryPool::Buffer<pixelTrack::TrackSoA>;
 
-#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
\ No newline at end of file
+#endif  // #ifndef CUDADataFormats_Track_PixelTrackHeterogeneous_h
diff --git a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
index 356ea3eddeb7f..be73e2650b417 100644
--- a/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
+++ b/CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h
@@ -8,8 +8,6 @@
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
-
 namespace pixelTrack {
 
   enum class Quality : uint8_t { bad = 0, edup, dup, loose, strict, tight, highPurity, notQuality };
   constexpr uint32_t qualitySize{uint8_t(Quality::notQuality)};
diff --git a/CUDADataFormats/Track/src/classes.h b/CUDADataFormats/Track/src/classes.h
index 97c116f6c88d3..a2b57f2e3e9e1 100644
--- a/CUDADataFormats/Track/src/classes.h
+++ b/CUDADataFormats/Track/src/classes.h
@@ -2,7 +2,7 @@
 #define CUDADataFormats_Track_src_classes_h
 
 #include "CUDADataFormats/Common/interface/Product.h"
-#include "CUDADataFormats/Common/interface/HostProduct.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "CUDADataFormats/Track/interface/TrackSoAHeterogeneousT.h"
 #include "DataFormats/Common/interface/Wrapper.h"
diff --git a/CUDADataFormats/Track/src/classes_def.xml b/CUDADataFormats/Track/src/classes_def.xml
index 9c80ae91baf29..c95568b18f1f3 100644
--- a/CUDADataFormats/Track/src/classes_def.xml
+++ b/CUDADataFormats/Track/src/classes_def.xml
@@ -1,6 +1,7 @@
-
-
-
-
+
+
+
+
+
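(The XML dictionary entries above were lost in extraction and are left blank rather than guessed.) Because memoryPool::Buffer keeps the familiar smart-pointer surface (get(), operator*, operator->), the alias switch is source compatible for most consumers. A minimal sketch, with an illustrative function name:

#include <cassert>
#include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h"

// hypothetical consumer: PixelTrackHeterogeneous is now memoryPool::Buffer<pixelTrack::TrackSoA>
void useTracks(PixelTrackHeterogeneous const& tracks) {
  pixelTrack::TrackSoA const* soa = tracks.get();  // same call sites as with HeterogeneousSoA
  assert(soa != nullptr);
}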
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
index 8ce37f280ac6c..2a413ecb2a370 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h
@@ -2,10 +2,10 @@
 #define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h
 
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
 
-template <typename Traits>
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
+
 class TrackingRecHit2DHeterogeneous {
 public:
   enum class Storage32 {
@@ -30,23 +30,28 @@ class TrackingRecHit2DHeterogeneous {
   };
 
   template <typename T>
-  using unique_ptr = typename Traits::template unique_ptr<T>;
+  using Buffer = memoryPool::Buffer<T>;
 
   using PhiBinner = TrackingRecHit2DSOAView::PhiBinner;
 
   TrackingRecHit2DHeterogeneous() = default;
 
-  explicit TrackingRecHit2DHeterogeneous(
-      uint32_t nHits,
-      bool isPhase2,
-      int32_t offsetBPIX2,
-      pixelCPEforGPU::ParamsOnGPU const* cpeParams,
-      uint32_t const* hitsModuleStart,
-      cudaStream_t stream,
-      TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input = nullptr);
-
-  explicit TrackingRecHit2DHeterogeneous(
-      float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream = nullptr);
+  TrackingRecHit2DHeterogeneous(uint32_t nHits,
+                                bool isPhase2,
+                                int32_t offsetBPIX2,
+                                pixelCPEforGPU::ParamsOnGPU const* cpeParams,
+                                uint32_t const* hitsModuleStart,
+                                memoryPool::Where where,
+                                cudaStream_t stream,
+                                TrackingRecHit2DHeterogeneous const* input = nullptr);
+
+  // used on CPU only
+  TrackingRecHit2DHeterogeneous(float* store32,
+                                uint16_t* store16,
+                                uint32_t* modules,
+                                int nHits,
+                                memoryPool::Where where = memoryPool::onCPU,
+                                cudaStream_t stream = nullptr);
 
   ~TrackingRecHit2DHeterogeneous() = default;
 
   TrackingRecHit2DHeterogeneous(const TrackingRecHit2DHeterogeneous&) = delete;
@@ -67,28 +72,28 @@ class TrackingRecHit2DHeterogeneous {
   auto phiBinnerStorage() { return m_phiBinnerStorage; }
   auto iphi() { return m_iphi; }
 
-  cms::cuda::host::unique_ptr<float[]> localCoordToHostAsync(cudaStream_t stream) const;
+  Buffer<float> localCoordToHostAsync(cudaStream_t stream) const;
 
-  cms::cuda::host::unique_ptr<uint32_t[]> hitsModuleStartToHostAsync(cudaStream_t stream) const;
+  Buffer<uint32_t> hitsModuleStartToHostAsync(cudaStream_t stream) const;
 
-  cms::cuda::host::unique_ptr<uint16_t[]> store16ToHostAsync(cudaStream_t stream) const;
-  cms::cuda::host::unique_ptr<float[]> store32ToHostAsync(cudaStream_t stream) const;
+  Buffer<uint16_t> store16ToHostAsync(cudaStream_t stream) const;
+  Buffer<float> store32ToHostAsync(cudaStream_t stream) const;
 
-  // needs specialization for Host
-  void copyFromGPU(TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input, cudaStream_t stream);
+  // needed for Host
+  void copyFromGPU(TrackingRecHit2DHeterogeneous const* input, cudaStream_t stream);
 
 private:
   static constexpr uint32_t n16 = 4;   // number of elements in m_store16
   static constexpr uint32_t n32 = 10;  // number of elements in m_store32
   static_assert(sizeof(uint32_t) == sizeof(float));  // just stating the obvious
   static_assert(n32 == static_cast<uint32_t>(Storage32::kLayers));
 
-  unique_ptr<uint16_t[]> m_store16;  //!
-  unique_ptr<float[]> m_store32;     //!
+  Buffer<uint16_t> m_store16;  //!
+  Buffer<float> m_store32;     //!
 
-  unique_ptr<TrackingRecHit2DSOAView::PhiBinner> m_PhiBinnerStore;              //!
-  unique_ptr<TrackingRecHit2DSOAView::AverageGeometry> m_AverageGeometryStore;  //!
+  Buffer<TrackingRecHit2DSOAView::PhiBinner> m_PhiBinnerStore;              //!
+  Buffer<TrackingRecHit2DSOAView::AverageGeometry> m_AverageGeometryStore;  //!
 
-  unique_ptr<TrackingRecHit2DSOAView> m_view;  //!
+  Buffer<TrackingRecHit2DSOAView> m_view;  //!
 
   uint32_t m_nHits;
   int32_t m_offsetBPIX2;
@@ -103,175 +108,8 @@ class TrackingRecHit2DHeterogeneous {
   int16_t* m_iphi;
 };
 
-using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits>;
-using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous<cms::cudacompat::CPUTraits>;
-using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous<cms::cudacompat::HostTraits>;
-
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-
-template <typename Traits>
-TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
-    uint32_t nHits,
-    bool isPhase2,
-    int32_t offsetBPIX2,
-    pixelCPEforGPU::ParamsOnGPU const* cpeParams,
-    uint32_t const* hitsModuleStart,
-    cudaStream_t stream,
-    TrackingRecHit2DHeterogeneous<cms::cudacompat::GPUTraits> const* input)
-    : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {
-  auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
-
-  m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
-
-  view->m_nHits = nHits;
-  view->m_nMaxModules = m_nMaxModules;
-  m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream);  // leave it on host and pass it by value?
-  m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
-  view->m_averageGeometry = m_AverageGeometryStore.get();
-  view->m_cpeParams = cpeParams;
-  view->m_hitsModuleStart = hitsModuleStart;
-
-  // if empty do not bother
-  if (0 == nHits) {
-    if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
-      cms::cuda::copyAsync(m_view, view, stream);
-    } else {
-      m_view.reset(view.release());  // NOLINT: std::move() breaks CUDA version
-    }
-    return;
-  }
-
-  // the single arrays are not 128 bit aligned...
-  // the hits are actually accessed in order only in building
-  // if ordering is relevant they may have to be stored phi-ordered by layer or so
-  // this will break 1to1 correspondence with cluster and module locality
-  // so unless proven VERY inefficient we keep it ordered as generated
-
-  // host copy is "reduced" (to be reviewed at some point)
-  if constexpr (std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
-    // it has to compile for ALL cases
-    copyFromGPU(input, stream);
-  } else {
-    assert(input == nullptr);
-
-    auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;
-
-    m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
-    m_store32 = Traits::template make_unique<float[]>(nHits * n32 + nL + 1, stream);
-    m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
-  }
-
-  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
-  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));
-
-  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
-
-  // copy all the pointers
-  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
-  m_phiBinnerStorage = view->m_phiBinnerStorage =
-      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
-
-  view->m_xl = get32(Storage32::kXLocal);
-  view->m_yl = get32(Storage32::kYLocal);
-  view->m_xerr = get32(Storage32::kXerror);
-  view->m_yerr = get32(Storage32::kYerror);
-  view->m_chargeAndStatus = reinterpret_cast<SiPixelHitStatusAndCharge*>(get32(Storage32::kCharge));
-
-  if constexpr (!std::is_same_v<Traits, cms::cudacompat::HostTraits>) {
-    assert(input == nullptr);
-    view->m_xg = get32(Storage32::kXGlobal);
-    view->m_yg = get32(Storage32::kYGlobal);
-    view->m_zg = get32(Storage32::kZGlobal);
-    view->m_rg = get32(Storage32::kRGlobal);
-
-    auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
-    m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
-
-    view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
-    view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
-    view->m_detInd = get16(Storage16::kDetId);
-
-    m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
-    m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
-  }
-
-  // transfer view
-  if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
-    cms::cuda::copyAsync(m_view, view, stream);
-  } else {
-    m_view.reset(view.release());  // NOLINT: std::move() breaks CUDA version
-  }
-}
-
-//this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases
-template <typename Traits>
-TrackingRecHit2DHeterogeneous<Traits>::TrackingRecHit2DHeterogeneous(
-    float* store32, uint16_t* store16, uint32_t* modules, int nHits, cudaStream_t stream)
-    : m_nHits(nHits), m_hitsModuleStart(modules) {
-  auto view = Traits::template make_host_unique<TrackingRecHit2DSOAView>(stream);
-
-  m_view = Traits::template make_unique<TrackingRecHit2DSOAView>(stream);
-
-  view->m_nHits = nHits;
-
-  if (0 == nHits) {
-    if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
-      cms::cuda::copyAsync(m_view, view, stream);
-    } else {
-      m_view = std::move(view);
-    }
-    return;
-  }
-
-  m_store16 = Traits::template make_unique<uint16_t[]>(nHits * n16, stream);
-  m_store32 = Traits::template make_unique<float[]>(nHits * n32, stream);
-  m_PhiBinnerStore = Traits::template make_unique<TrackingRecHit2DSOAView::PhiBinner>(stream);
-  m_AverageGeometryStore = Traits::template make_unique<TrackingRecHit2DSOAView::AverageGeometry>(stream);
-
-  view->m_averageGeometry = m_AverageGeometryStore.get();
-  view->m_hitsModuleStart = m_hitsModuleStart;
-
-  //store transfer
-  if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
-    cms::cuda::copyAsync(m_store16, store16, stream);
-    cms::cuda::copyAsync(m_store32, store32, stream);
-  } else {
-    std::copy(store32, store32 + nHits * n32, m_store32.get());  // want to copy it
-    std::copy(store16, store16 + nHits * n16, m_store16.get());
-  }
-
-  //getters
-  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
-  auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
-
-  //Store 32
-  view->m_xl = get32(Storage32::kXLocal);
-  view->m_yl = get32(Storage32::kYLocal);
-  view->m_xerr = get32(Storage32::kXerror);
-  view->m_yerr = get32(Storage32::kYerror);
-  view->m_chargeAndStatus = reinterpret_cast<SiPixelHitStatusAndCharge*>(get32(Storage32::kCharge));
-  view->m_xg = get32(Storage32::kXGlobal);
-  view->m_yg = get32(Storage32::kYGlobal);
-  view->m_zg = get32(Storage32::kZGlobal);
-  view->m_rg = get32(Storage32::kRGlobal);
-
-  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
-  m_phiBinnerStorage = view->m_phiBinnerStorage =
-      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
-
-  //Store 16
-  view->m_detInd = get16(Storage16::kDetId);
-  m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
-  view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
-  view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
-
-  // transfer view
-  if constexpr (std::is_same_v<Traits, cms::cudacompat::GPUTraits>) {
-    cms::cuda::copyAsync(m_view, view, stream);
-  } else {
-    m_view = std::move(view);
-  }
-}
+using TrackingRecHit2DGPU = TrackingRecHit2DHeterogeneous;
+using TrackingRecHit2DCPU = TrackingRecHit2DHeterogeneous;
+using TrackingRecHit2DHost = TrackingRecHit2DHeterogeneous;
 
 #endif  // CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DHeterogeneous_h
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h
index f252ca94d2296..ff56e845dd19c 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DReduced.h
@@ -1,14 +1,14 @@
 #ifndef CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h
 #define CUDADataFormats_TrackingRecHit_interface_TrackingRecHit2DReduced_h
 
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
-#include "CUDADataFormats/Common/interface/HostProduct.h"
 
 // a reduced (in content and therefore in size) version to be used on CPU for Legacy reconstruction
 class TrackingRecHit2DReduced {
 public:
-  using HLPstorage = HostProduct<float[]>;
-  using HIDstorage = HostProduct<uint16_t[]>;
+  using HLPstorage = memoryPool::Buffer<float>;
+  using HIDstorage = memoryPool::Buffer<uint16_t>;
 
   template <typename UP32, typename UP16>
   TrackingRecHit2DReduced(UP32&& istore32, UP16&& istore16, int nhits)
diff --git a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
index 39ee136189955..748b8ec6ad593 100644
--- a/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
+++ b/CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h
@@ -4,6 +4,7 @@
 #include <cuda_runtime.h>
 
 #include "CUDADataFormats/SiPixelCluster/interface/gpuClusteringConstants.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCompat.h"
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
@@ -25,7 +26,6 @@ class TrackingRecHit2DSOAView {
 
   using AverageGeometry = pixelTopology::AverageGeometry;
 
-  template <typename Traits>
   friend class TrackingRecHit2DHeterogeneous;
   friend class TrackingRecHit2DReduced;
diff --git a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
index fc6a05ba9ed3e..632157b9d8e93 100644
--- a/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
+++ b/CUDADataFormats/TrackingRecHit/src/TrackingRecHit2DHeterogeneous.cc
@@ -1,40 +1,215 @@
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
+
+#include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DSOAView.h"
+#include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
+
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+
+TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(uint32_t nHits,
+                                                             bool isPhase2,
+                                                             int32_t offsetBPIX2,
+                                                             pixelCPEforGPU::ParamsOnGPU const* cpeParams,
+                                                             uint32_t const* hitsModuleStart,
+                                                             memoryPool::Where where,
+                                                             cudaStream_t stream,
+                                                             TrackingRecHit2DHeterogeneous const* input)
+    : m_nHits(nHits), m_offsetBPIX2(offsetBPIX2), m_hitsModuleStart(hitsModuleStart) {
+  using namespace memoryPool::cuda;
+
+  memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared<BundleDelete>(stream, where));
+  assert(deleter.pool());
+  auto view = makeBuffer<TrackingRecHit2DSOAView>(
+      1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);
+  assert(view.get());
+  m_nMaxModules = isPhase2 ? phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules;
+  assert(view.get());
+  view->m_nHits = nHits;
+  view->m_nMaxModules = m_nMaxModules;
+  m_view = makeBuffer<TrackingRecHit2DSOAView>(
+      1, deleter);  // stream, where); // deleter); // leave it on host and pass it by value?
+  assert(m_view.get());
+  m_AverageGeometryStore = makeBuffer<TrackingRecHit2DSOAView::AverageGeometry>(1, deleter);
+  view->m_averageGeometry = m_AverageGeometryStore.get();
+  view->m_cpeParams = cpeParams;
+  view->m_hitsModuleStart = hitsModuleStart;
+
+  // if empty do not bother
+  if (0 == nHits) {
+    if (memoryPool::onDevice == where) {
+      cudaCheck(memoryPool::cuda::copy(m_view, view, 1, stream));
+    } else {
+      m_view.reset(view.release());
+    }
+    return;
+  }
+
+  // the single arrays are not 128 bit aligned...
+  // the hits are actually accessed in order only in building
+  // if ordering is relevant they may have to be stored phi-ordered by layer or so
+  // this will break 1to1 correspondence with cluster and module locality
+  // so unless proven VERY inefficient we keep it ordered as generated
+
+  // host copy is "reduced" (to be reviewed at some point)
+  if (memoryPool::onHost == where) {
+    // it has to compile for ALL cases
+    copyFromGPU(input, stream);
+  } else {
+    assert(input == nullptr);
+
+    auto nL = isPhase2 ? phase2PixelTopology::numberOfLayers : phase1PixelTopology::numberOfLayers;
+
+    m_store16 = makeBuffer<uint16_t>(nHits * n16, deleter);
+    m_store32 = makeBuffer<float>(nHits * n32 + nL + 1, deleter);
+    m_PhiBinnerStore = makeBuffer<TrackingRecHit2DSOAView::PhiBinner>(1, deleter);
+  }
+
+  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(float));
+  static_assert(sizeof(TrackingRecHit2DSOAView::hindex_type) == sizeof(TrackingRecHit2DSOAView::PhiBinner::index_type));
+
+  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
+
+  // copy all the pointers
+  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
+  m_phiBinnerStorage = view->m_phiBinnerStorage =
+      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
+
+  view->m_xl = get32(Storage32::kXLocal);
+  view->m_yl = get32(Storage32::kYLocal);
+  view->m_xerr = get32(Storage32::kXerror);
+  view->m_yerr = get32(Storage32::kYerror);
+  view->m_chargeAndStatus = reinterpret_cast<SiPixelHitStatusAndCharge*>(get32(Storage32::kCharge));
+
+  if (memoryPool::onHost != where) {
+    assert(input == nullptr);
+    view->m_xg = get32(Storage32::kXGlobal);
+    view->m_yg = get32(Storage32::kYGlobal);
+    view->m_zg = get32(Storage32::kZGlobal);
+    view->m_rg = get32(Storage32::kRGlobal);
+
+    auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
+    m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
+
+    view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
+    view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
+    view->m_detInd = get16(Storage16::kDetId);
+
+    m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
+    m_hitsLayerStart = view->m_hitsLayerStart = reinterpret_cast<uint32_t*>(get32(Storage32::kLayers));
+  }
+
+  // transfer view
+  if (memoryPool::onDevice == where) {
+    cudaCheck(
+        cudaMemcpyAsync(m_view.get(), view.get(), sizeof(TrackingRecHit2DSOAView), cudaMemcpyHostToDevice, stream));
+    // cudaCheck(memoryPool::cuda::copy(m_view, view, 1, stream));
+  } else {
+    m_view.reset(view.release());
+  }
+}
+
+//this is intended to be used only for CPU SoA but doesn't hurt to have it for all cases
+TrackingRecHit2DHeterogeneous::TrackingRecHit2DHeterogeneous(
+    float* store32, uint16_t* store16, uint32_t* modules, int nHits, memoryPool::Where where, cudaStream_t stream)
+    : m_nHits(nHits), m_hitsModuleStart(modules) {
+  using namespace memoryPool::cuda;
+  auto view = makeBuffer<TrackingRecHit2DSOAView>(
+      1, stream, memoryPool::onCPU == where ? memoryPool::onCPU : memoryPool::onHost);
+
+  m_view = makeBuffer<TrackingRecHit2DSOAView>(1, stream, where);
+
+  view->m_nHits = nHits;
+
+  if (0 == nHits) {
+    if (memoryPool::onDevice == where) {
+      cudaCheck(memoryPool::cuda::copy(m_view, view, 1, stream));
+    } else {
+      m_view = std::move(view);
+    }
+    return;
+  }
+
+  m_store16 = makeBuffer<uint16_t>(nHits * n16, stream, where);
+  m_store32 = makeBuffer<float>(nHits * n32, stream, where);
+  m_PhiBinnerStore = makeBuffer<TrackingRecHit2DSOAView::PhiBinner>(1, stream, where);
+  m_AverageGeometryStore = makeBuffer<TrackingRecHit2DSOAView::AverageGeometry>(1, stream, where);
+
+  view->m_averageGeometry = m_AverageGeometryStore.get();
+  view->m_hitsModuleStart = m_hitsModuleStart;
+
+  //store transfer
+  if (memoryPool::onDevice == where) {
+    cudaCheck(cudaMemcpyAsync(m_store32.get(), store32, nHits * n32 * sizeof(float), cudaMemcpyHostToDevice, stream));
+    cudaCheck(
+        cudaMemcpyAsync(m_store16.get(), store16, nHits * n16 * sizeof(uint16_t), cudaMemcpyHostToDevice, stream));
+  } else {
+    std::copy(store32, store32 + nHits * n32, m_store32.get());  // want to copy it
+    std::copy(store16, store16 + nHits * n16, m_store16.get());
+  }
+
+  //getters
+  auto get32 = [&](Storage32 i) { return m_store32.get() + static_cast<int>(i) * nHits; };
+  auto get16 = [&](Storage16 i) { return m_store16.get() + static_cast<int>(i) * nHits; };
+
+  //Store 32
+  view->m_xl = get32(Storage32::kXLocal);
+  view->m_yl = get32(Storage32::kYLocal);
+  view->m_xerr = get32(Storage32::kXerror);
+  view->m_yerr = get32(Storage32::kYerror);
+  view->m_chargeAndStatus = reinterpret_cast<SiPixelHitStatusAndCharge*>(get32(Storage32::kCharge));
+  view->m_xg = get32(Storage32::kXGlobal);
+  view->m_yg = get32(Storage32::kYGlobal);
+  view->m_zg = get32(Storage32::kZGlobal);
+  view->m_rg = get32(Storage32::kRGlobal);
+
+  m_phiBinner = view->m_phiBinner = m_PhiBinnerStore.get();
+  m_phiBinnerStorage = view->m_phiBinnerStorage =
+      reinterpret_cast<TrackingRecHit2DSOAView::PhiBinner::index_type*>(get32(Storage32::kPhiStorage));
+
+  //Store 16
+  view->m_detInd = get16(Storage16::kDetId);
+  m_iphi = view->m_iphi = reinterpret_cast<int16_t*>(get16(Storage16::kPhi));
+  view->m_xsize = reinterpret_cast<int16_t*>(get16(Storage16::kXSize));
+  view->m_ysize = reinterpret_cast<int16_t*>(get16(Storage16::kYSize));
+
+  // transfer view
+  if (memoryPool::onDevice == where) {
+    cudaCheck(memoryPool::cuda::copy(m_view, view, 1, stream));
+  } else {
+    m_view = std::move(view);
+  }
+}
+
+using namespace memoryPool::cuda;
 
-template <>
-cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<float[]>(5 * nHits(), stream);
-  cms::cuda::copyAsync(ret, m_store32, 5 * nHits(), stream);
+memoryPool::Buffer<float> TrackingRecHit2DGPU::localCoordToHostAsync(cudaStream_t stream) const {
+  auto ret = makeBuffer<float>(5 * nHits(), stream, memoryPool::onHost);
+  cudaCheck(cudaMemcpyAsync(ret.get(), m_store32.get(), 5 * sizeof(float) * nHits(), cudaMemcpyDeviceToHost, stream));
   return ret;
 }
 
-template <>
-cms::cuda::host::unique_ptr<float[]> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<float[]>(static_cast<int>(n32) * nHits(), stream);
-  cms::cuda::copyAsync(ret, m_store32, static_cast<int>(n32) * nHits(), stream);
+memoryPool::Buffer<float> TrackingRecHit2DGPU::store32ToHostAsync(cudaStream_t stream) const {
+  auto ret = makeBuffer<float>(static_cast<int>(n32) * nHits(), stream, memoryPool::onHost);
+  cudaCheck(cudaMemcpyAsync(
+      ret.get(), m_store32.get(), static_cast<int>(n32) * sizeof(float) * nHits(), cudaMemcpyDeviceToHost, stream));
   return ret;
 }
 
-template <>
-cms::cuda::host::unique_ptr<uint16_t[]> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<uint16_t[]>(static_cast<int>(n16) * nHits(), stream);
-  cms::cuda::copyAsync(ret, m_store16, static_cast<int>(n16) * nHits(), stream);
+memoryPool::Buffer<uint16_t> TrackingRecHit2DGPU::store16ToHostAsync(cudaStream_t stream) const {
+  auto ret = makeBuffer<uint16_t>(static_cast<int>(n16) * nHits(), stream, memoryPool::onHost);
+  cudaCheck(cudaMemcpyAsync(
+      ret.get(), m_store16.get(), static_cast<int>(n16) * sizeof(uint16_t) * nHits(), cudaMemcpyDeviceToHost, stream));
   return ret;
 }
 
-template <>
-cms::cuda::host::unique_ptr<uint32_t[]> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
-  auto ret = cms::cuda::make_host_unique<uint32_t[]>(nMaxModules() + 1, stream);
-  cudaCheck(
-      cudaMemcpyAsync(ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDefault, stream));
+memoryPool::Buffer<uint32_t> TrackingRecHit2DGPU::hitsModuleStartToHostAsync(cudaStream_t stream) const {
+  auto ret = makeBuffer<uint32_t>(nMaxModules() + 1, stream, memoryPool::onHost);
+  if (m_hitsModuleStart)
+    cudaCheck(cudaMemcpyAsync(
+        ret.get(), m_hitsModuleStart, sizeof(uint32_t) * (nMaxModules() + 1), cudaMemcpyDeviceToHost, stream));
   return ret;
 }
 
-// the only specialization needed
-template <>
 void TrackingRecHit2DHost::copyFromGPU(TrackingRecHit2DGPU const* input, cudaStream_t stream) {
   assert(input);
   m_store32 = input->localCoordToHostAsync(stream);
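Note the size convention used in the view transfers above: memoryPool::cuda::copy (defined in cudaMemoryPool.h below) multiplies its size argument by sizeof(T), so it takes an element count, not a byte count. Passing sizeof(TrackingRecHit2DSOAView) as the count would copy sizeof squared bytes; single-struct transfers therefore pass 1:

cudaCheck(memoryPool::cuda::copy(m_view, view, 1, stream));                // one TrackingRecHit2DSOAView
cudaCheck(memoryPool::cuda::copy(data, data_d, error_h->size(), stream));  // size() elements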
diff --git a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp
index 8aca68e294469..e85878c05e2a6 100644
--- a/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp
+++ b/CUDADataFormats/TrackingRecHit/test/TrackingRecHit2DCUDA_t.cpp
@@ -1,9 +1,11 @@
 #include "CUDADataFormats/TrackingRecHit/interface/TrackingRecHit2DHeterogeneous.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
 #include "Geometry/CommonTopologies/interface/SimplePixelTopology.h"
 
+#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+
 namespace testTrackingRecHit2D {
 
   void runKernels(TrackingRecHit2DSOAView* hits);
@@ -12,6 +14,7 @@ namespace testTrackingRecHit2D {
 
 int main() {
   cms::cudatest::requireDevices();
+  memoryPool::cuda::init(false);
 
   cudaStream_t stream;
   cudaCheck(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
@@ -19,26 +22,54 @@ int main() {
   auto nHits = 200;
   // inner scope to deallocate memory before destroying the stream
   {
-    TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, stream);
+    TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, memoryPool::onDevice, stream);
     testTrackingRecHit2D::runKernels(tkhit.view());
 
-    TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, stream);
+    TrackingRecHit2DGPU tkhitPhase2(nHits, true, 0, nullptr, nullptr, memoryPool::onDevice, stream);
     testTrackingRecHit2D::runKernels(tkhitPhase2.view());
 
-    TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, stream, &tkhit);
+    memoryPool::cuda::dumpStat();
+
+    TrackingRecHit2DHost tkhitH(nHits, false, 0, nullptr, nullptr, memoryPool::onHost, stream, &tkhit);
     cudaStreamSynchronize(stream);
+    memoryPool::cuda::dumpStat();
+
     assert(tkhitH.view());
     assert(tkhitH.view()->nHits() == unsigned(nHits));
     assert(tkhitH.view()->nMaxModules() == phase1PixelTopology::numberOfModules);
 
-    TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, stream, &tkhit);
+    TrackingRecHit2DHost tkhitHPhase2(nHits, true, 0, nullptr, nullptr, memoryPool::onHost, stream, &tkhitPhase2);
     cudaStreamSynchronize(stream);
     assert(tkhitHPhase2.view());
     assert(tkhitHPhase2.view()->nHits() == unsigned(nHits));
     assert(tkhitHPhase2.view()->nMaxModules() == phase2PixelTopology::numberOfModules);
+
+    memoryPool::cuda::dumpStat();
   }
 
+  cudaCheck(cudaStreamSynchronize(stream));
+  memoryPool::cuda::dumpStat();
+
+  std::cout << "on CPU" << std::endl;
+  ((SimplePoolAllocatorImpl<PosixAlloc>*)memoryPool::cuda::getPool(memoryPool::onCPU))->dumpStat();
+
   cudaCheck(cudaStreamDestroy(stream));
 
+  memoryPool::cuda::dumpStat();
+
+  {
+    TrackingRecHit2DGPU tkhit(nHits, false, 0, nullptr, nullptr, memoryPool::onCPU, nullptr);
+    assert(tkhit.view());
+    assert(tkhit.view()->nHits() == unsigned(nHits));
+    assert(tkhit.view()->nMaxModules() == phase1PixelTopology::numberOfModules);
+    std::cout << "on CPU" << std::endl;
+    ((SimplePoolAllocatorImpl<PosixAlloc>*)memoryPool::cuda::getPool(memoryPool::onCPU))->dumpStat();
+  }
+  std::cout << "on CPU" << std::endl;
+  ((SimplePoolAllocatorImpl<PosixAlloc>*)memoryPool::cuda::getPool(memoryPool::onCPU))->dumpStat();
+
+  memoryPool::cuda::shutdown();
+
   return 0;
 }
diff --git a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
index 417a960951fb1..0faeabf609da1 100644
--- a/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
+++ b/CUDADataFormats/Vertex/interface/ZVertexHeterogeneous.h
@@ -2,9 +2,9 @@
 #define CUDADataFormatsVertexZVertexHeterogeneous_H
 
 #include "CUDADataFormats/Vertex/interface/ZVertexSoA.h"
-#include "CUDADataFormats/Common/interface/HeterogeneousSoA.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/memoryPool.h"
 
-using ZVertexHeterogeneous = HeterogeneousSoA<ZVertexSoA>;
+using ZVertexHeterogeneous = memoryPool::Buffer<ZVertexSoA>;
 #ifndef __CUDACC__
 #include "CUDADataFormats/Common/interface/Product.h"
 using ZVertexCUDAProduct = cms::cuda::Product<ZVertexHeterogeneous>;
diff --git a/CUDADataFormats/Vertex/src/classes_def.xml b/CUDADataFormats/Vertex/src/classes_def.xml
index ea633080af9af..a0886b0cc0797 100644
--- a/CUDADataFormats/Vertex/src/classes_def.xml
+++ b/CUDADataFormats/Vertex/src/classes_def.xml
@@ -2,5 +2,6 @@
+
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc
index 4037b4d50612c..ba23465e6f753 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc
@@ -9,7 +9,6 @@
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -27,7 +26,7 @@ class SiPixelDigiErrorsSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
   edm::EDGetTokenT<cms::cuda::Product<SiPixelDigiErrorsCUDA>> digiErrorGetToken_;
   edm::EDPutTokenT<SiPixelErrorsSoA> digiErrorPutToken_;
 
-  cms::cuda::host::unique_ptr<SiPixelErrorCompact[]> data_;
+  memoryPool::Buffer<SiPixelErrorCompact> data_;
   cms::cuda::SimpleVector<SiPixelErrorCompact> error_;
   const SiPixelFormatterErrors* formatterErrors_ = nullptr;
 };
diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc
index 0702bc4830c7c..4cc99064e43cf 100644
--- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc
+++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc
@@ -9,7 +9,6 @@
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
 #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h"
-#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
 
 class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
 public:
@@ -27,7 +26,7 @@ class SiPixelDigisSoAFromCUDA : public edm::stream::EDProducer<edm::ExternalWork> {
   edm::EDGetTokenT<cms::cuda::Product<SiPixelDigisCUDA>> digiGetToken_;
   edm::EDPutTokenT<SiPixelDigisSoA> digiPutToken_;
 
-  cms::cuda::host::unique_ptr<uint16_t[]> store_;
+  memoryPool::Buffer<uint16_t> store_;
 
   int nDigis_;
 };
diff --git a/HeterogeneousCore/CUDACore/src/ScopedContext.cc b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
index ccf7995a20061..0b547c4baecce 100644
--- a/HeterogeneousCore/CUDACore/src/ScopedContext.cc
+++ b/HeterogeneousCore/CUDACore/src/ScopedContext.cc
@@ -27,6 +27,8 @@ namespace {
     try {
       auto error = cudaGetErrorName(status);
       auto message = cudaGetErrorString(status);
+      std::cout << "Callback of CUDA stream " << streamId << " in device " << device << " error " << error << ": "
+                << message << std::endl;
       throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device
                                         << " error " << error << ": " << message;
     } catch (cms::Exception&) {
diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
index 346c81267ec49..fa8d115ed1fd2 100644
--- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc
+++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc
@@ -16,6 +16,7 @@
 #include "HeterogeneousCore/CUDAUtilities/interface/StreamCache.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cachingAllocators.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/currentDevice.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
@@ -135,6 +136,8 @@ CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
   bool configEnabled = config.getUntrackedParameter<bool>("enabled");
   if (not configEnabled) {
     edm::LogInfo("CUDAService") << "CUDAService disabled by configuration";
+    // enable cpu memory pool
+    memoryPool::cuda::init(true);
     return;
   }
@@ -361,6 +364,9 @@ CUDAService::CUDAService(edm::ParameterSet const& config) : verbose_(config.getUntrackedParameter<bool>("verbose")) {
   cms::cuda::getEventCache().clear();
   cms::cuda::getStreamCache().clear();
 
+  // enable memory pool
+  memoryPool::cuda::init(false);
+
   if (verbose_) {
     log << '\n' << "CUDAService fully initialized";
   }
@@ -381,6 +387,9 @@ CUDAService::~CUDAService() {
   cms::cuda::getEventCache().clear();
   cms::cuda::getStreamCache().clear();
 
+  // destroy cpu memory pool
+  memoryPool::cuda::shutdown();
+
   for (int i = 0; i < numberOfDevices_; ++i) {
     cudaCheck(cudaSetDevice(i));
     cudaCheck(cudaDeviceSynchronize());
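The lifecycle wired into CUDAService above is the same one the unit test exercises by hand; in standalone code it looks like this (a sketch; in CMSSW jobs CUDAService does the init/shutdown):

#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"

int main() {
  memoryPool::cuda::init(false);  // false: CPU, host and device pools; true: CPU pool only
  {
    // ... create Buffers on streams, run kernels; Buffers must not outlive the pools ...
  }
  memoryPool::cuda::dumpStat();  // optional accounting printout
  memoryPool::cuda::shutdown();  // destroys the pools
  return 0;
}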
diff --git a/HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h b/HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h
new file mode 100644
index 0000000000000..c80af3c60be98
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h
@@ -0,0 +1,155 @@
+#pragma once
+
+#include <algorithm>
+#include <array>
+#include <atomic>
+#include <cassert>
+#include <cstdint>
+#include <iostream>
+#include <memory>
+#include <mutex>
+#include <thread>
+#include <vector>
+
+// #define MEMORY_POOL_DEBUG
+
+namespace poolDetails {
+
+  constexpr int bucket(uint64_t s) { return 64 - __builtin_clzl(s - 1); }
+  constexpr uint64_t bucketSize(int b) { return 1LL << b; }
+
+};  // namespace poolDetails
+
+class SimplePoolAllocator;
+
+namespace memoryPool {
+  struct Payload {
+    SimplePoolAllocator *pool;
+    std::vector<int> buckets;
+  };
+}  // namespace memoryPool
+
+class SimplePoolAllocator {
+public:
+  using Pointer = void *;
+
+  virtual ~SimplePoolAllocator() = default;
+
+  virtual Pointer doAlloc(size_t size) = 0;
+  virtual void doFree(Pointer ptr) = 0;
+  virtual void scheduleFree(memoryPool::Payload *payload, void *stream) = 0;
+
+  SimplePoolAllocator(int maxSlots) : m_maxSlots(maxSlots) {
+    for (auto &p : m_used)
+      p.v = true;
+  }
+
+  int size() const { return m_size; }
+
+  Pointer pointer(int i) const { return m_slots[i]; }
+
+  void dumpStat() const;
+
+  void free(int i) {
+#ifdef MEMORY_POOL_DEBUG
+    m_last[i] = -1;
+#endif
+    m_used[i].v = false;
+  }
+
+  int alloc(uint64_t s) {
+    auto i = allocImpl(s);
+
+    //test garbage
+    // if(totBytes>4507964512) garbageCollect();
+
+    if (i >= 0) {
+#ifdef MEMORY_POOL_DEBUG
+      assert(m_used[i].v);
+      if (nullptr == m_slots[i])
+        std::cout << "race ??? " << i << ' ' << m_bucket[i] << ' ' << m_last[i] << std::endl;
+      assert(m_slots[i]);
+#endif
+      return i;
+    }
+    garbageCollect();
+    i = allocImpl(s);
+    if (i >= 0) {
+      assert(m_used[i].v);
+      assert(m_slots[i]);
+#ifdef MEMORY_POOL_DEBUG
+      assert(m_last[i] >= 0);
+#endif
+    }
+    return i;
+  }
+
+protected:
+  int allocImpl(uint64_t s);
+  int createAt(int ls, int b);
+  void garbageCollect();
+  int useOld(int b);
+
+private:
+  const int m_maxSlots;
+
+#ifdef MEMORY_POOL_DEBUG
+  std::vector<int> m_last = std::vector<int>(m_maxSlots, -2);
+#endif
+
+  std::vector<int> m_bucket = std::vector<int>(m_maxSlots, -1);
+  std::vector<Pointer> m_slots = std::vector<Pointer>(m_maxSlots, nullptr);
+  struct alBool {
+    alignas(64) std::atomic<bool> v;
+  };
+  std::vector<alBool> m_used = std::vector<alBool>(m_maxSlots);
+  std::atomic<int> m_size = 0;
+
+  std::atomic<uint64_t> totBytes = 0;
+  std::atomic<uint64_t> nAlloc = 0;
+  std::atomic<uint64_t> nFree = 0;
+};
+
+namespace poolDetails {
+  // free callback
+  inline void freeAsync(memoryPool::Payload *payload) {
+    auto &pool = *(payload->pool);
+    auto const &buckets = payload->buckets;
+    for (auto i : buckets) {
+      pool.free(i);
+    }
+    delete payload;
+  }
+}  // namespace poolDetails
+
+template <typename T>
+struct SimplePoolAllocatorImpl final : public SimplePoolAllocator {
+  using Traits = T;
+
+  using SimplePoolAllocator::SimplePoolAllocator;
+
+  ~SimplePoolAllocatorImpl() override {
+    garbageCollect();
+#ifdef MEMORY_POOL_DEBUG
+    dumpStat();
+#endif
+  }
+
+  Pointer doAlloc(size_t size) override { return Traits::alloc(size); }
+  void doFree(Pointer ptr) override { Traits::free(ptr); }
+
+  void scheduleFree(memoryPool::Payload *payload, void *stream) override {
+    assert(payload->pool == this);
+    Traits::scheduleFree(payload, stream);
+  }
+};
+
+#include <cstdlib>
+struct PosixAlloc {
+  using Pointer = void *;
+
+  static Pointer alloc(size_t size) { return ::malloc(size); }
+  static void free(Pointer ptr) { ::free(ptr); }
+
+  static void scheduleFree(memoryPool::Payload *payload, void *) { poolDetails::freeAsync(payload); }
+};
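A worked example of the power-of-two bucket rounding (valid on GCC/Clang, where __builtin_clzl folds in constant expressions; note that an argument of 1 would reach __builtin_clzl(0), which is undefined, so callers are expected to request at least 2 bytes):

#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"

static_assert(poolDetails::bucket(1000) == 10);      // 1000 B -> 1024 B slot
static_assert(poolDetails::bucket(1024) == 10);      // exact powers of two fit
static_assert(poolDetails::bucket(1025) == 11);      // next size up: 2048 B
static_assert(poolDetails::bucketSize(10) == 1024);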
diff --git a/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h b/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h
new file mode 100644
index 0000000000000..b99d425eca25c
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h
@@ -0,0 +1,87 @@
+#pragma once
+#include "memoryPool.h"
+#include <cassert>
+
+// only for cudaStream_t
+#include <cuda_runtime.h>
+
+#include <vector>
+
+namespace memoryPool {
+  namespace cuda {
+
+    void init(bool onlyCPU = false);
+    void shutdown();
+
+    void dumpStat();
+
+    SimplePoolAllocator *getPool(Where where);
+
+    // allocate either on current device or on host
+    std::pair<void *, int> alloc(uint64_t size, SimplePoolAllocator &pool);
+
+    // schedule free
+    void free(cudaStream_t stream, std::vector<int> buckets, SimplePoolAllocator &pool);
+
+    template <typename T>
+    auto copy(Buffer<T> &dst, Buffer<T> const &src, uint64_t size, cudaStream_t stream) {
+      assert(dst.get());
+      assert(src.get());
+      assert(size > 0);
+      return cudaMemcpyAsync(dst.get(), src.get(), sizeof(T) * size, cudaMemcpyDefault, stream);
+    }
+
+    struct CudaDeleterBase : public DeleterBase {
+      CudaDeleterBase(cudaStream_t const &stream, Where where) : DeleterBase(getPool(where)), m_stream(stream) {}
+
+      CudaDeleterBase(cudaStream_t const &stream, SimplePoolAllocator *pool) : DeleterBase(pool), m_stream(stream) {}
+
+      ~CudaDeleterBase() override = default;
+
+      cudaStream_t m_stream;
+    };
+
+    struct DeleteOne final : public CudaDeleterBase {
+      using CudaDeleterBase::CudaDeleterBase;
+
+      ~DeleteOne() override = default;
+      void operator()(int bucket) override { free(m_stream, std::vector<int>(1, bucket), *pool()); }
+    };
+
+    struct BundleDelete final : public CudaDeleterBase {
+      BundleDelete(cudaStream_t const &stream, Where where) : CudaDeleterBase(stream, where) { m_buckets.reserve(8); }
+
+      ~BundleDelete() override { free(m_stream, std::move(m_buckets), *pool()); }
+
+      void operator()(int bucket) override { m_buckets.push_back(bucket); }
+
+      std::vector<int> m_buckets;
+    };
+
+    template <typename T>
+    Buffer<T> makeBuffer(uint64_t size, Deleter const &del) {
+      auto ret = alloc(sizeof(T) * size, *del.pool());
+      if (ret.second < 0) {
+        std::cout << "could not allocate " << size << ' ' << typeid(T).name() << " of size " << sizeof(T) << std::endl;
+        throw std::bad_alloc();
+      }
+      return Buffer<T>((T *)(ret.first), ret.second, del);
+    }
+
+    template <typename T>
+    Buffer<T> makeBuffer(uint64_t size, Deleter &&del) {
+      auto ret = alloc(sizeof(T) * size, *del.pool());
+      if (ret.second < 0) {
+        std::cout << "could not allocate " << size << ' ' << typeid(T).name() << " of size " << sizeof(T) << std::endl;
+        throw std::bad_alloc();
+      }
+      return Buffer<T>((T *)(ret.first), ret.second, std::move(del));
+    }
+
+    template <typename T>
+    Buffer<T> makeBuffer(uint64_t size, cudaStream_t const &stream, Where where) {
+      return makeBuffer<T>(size, Deleter(std::make_shared<DeleteOne>(stream, where)));
+    }
+
+  }  // namespace cuda
+}  // namespace memoryPool
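A sketch of the raw contract that makeBuffer wraps: alloc hands back a {pointer, bucket index} pair, a negative bucket signals exhaustion (which makeBuffer turns into std::bad_alloc), and free is stream-ordered via the pool's scheduleFree:

auto* pool = memoryPool::cuda::getPool(memoryPool::onDevice);
auto [ptr, bucket] = memoryPool::cuda::alloc(1024, *pool);  // 1024 bytes -> one bucket
if (bucket >= 0) {
  // ... enqueue work on `stream` that uses ptr ...
  memoryPool::cuda::free(stream, std::vector<int>{bucket}, *pool);  // recycled after stream work
}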
implementation!!!" << std::endl; + throw std::bad_alloc(); + } + if (bucket < 0) + std::cout << "delete with negative bucket!!!" << std::endl; + (*me)(bucket); + } + + SimplePoolAllocator* pool() const { return me->pool(); } + + private: + std::shared_ptr me; //! + }; + + template + class Buffer { + public: + typedef T value_type; + typedef T* pointer; + typedef T& reference; + typedef T const* const_pointer; + typedef T const& const_reference; + + Buffer() = default; + Buffer(T* p, int bucket) : m_p(p), m_bucket(bucket) {} + Buffer(T* p, int bucket, Deleter const& del) : m_deleter(del), m_p(p), m_bucket(bucket) {} + Buffer(T* p, int bucket, Deleter&& del) : m_deleter(del), m_p(p), m_bucket(bucket) {} + Buffer(std::pair const& rh, Deleter const& del) : m_deleter(del), m_p(rh.first), m_bucket(rh.second) {} + Buffer(Buffer const&) = delete; + Buffer& operator=(Buffer const&) = delete; + + template + Buffer(Buffer&& rh) : Buffer(rh.release(), rh.deleter()) {} + template + Buffer& operator=(Buffer&& rh) { + reset(rh.release()); + m_deleter = rh.deleter(); + return *this; + } + + ~Buffer() { + // assert(m_p == pool()->pointer(m_bucket)); + if (m_p) + m_deleter(m_bucket); + } + + pointer get() { return m_p; } + const_pointer get() const { return m_p; } + reference operator*() { return *m_p; } + const_reference operator*() const { return *m_p; } + pointer operator->() { return get(); } + const_pointer operator->() const { return get(); } + reference operator[](int i) { return m_p[i]; } + const_reference operator[](int i) const { return m_p[i]; } + + Deleter& deleter() { return m_deleter; } + Deleter const& deleter() const { return m_deleter; } + SimplePoolAllocator* pool() const { return deleter().pool(); } + + int bucket() const { return m_bucket; } + + std::pair release() { + auto ret = std::make_pair(m_p, m_bucket); + m_p = nullptr; + m_bucket = -1; + return ret; + } + void reset() { + if (m_p) + m_deleter(m_bucket); + m_p = nullptr; + m_bucket = -1; + } + void reset(std::pair const& rh) { + if (m_p) + m_deleter(m_bucket); + m_p = rh.first; + m_bucket = rh.second; + } + + private: + Deleter m_deleter; //! + pointer m_p = nullptr; //! + int m_bucket = -1; //! 
+  };
+
+}  // namespace memoryPool
diff --git a/HeterogeneousCore/CUDAUtilities/src/SimplePoolAllocator.cc b/HeterogeneousCore/CUDAUtilities/src/SimplePoolAllocator.cc
new file mode 100644
index 0000000000000..629d31c1f26d1
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/src/SimplePoolAllocator.cc
@@ -0,0 +1,136 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"
+
+int SimplePoolAllocator::allocImpl(uint64_t s) {
+  auto b = poolDetails::bucket(s);
+  assert(s <= poolDetails::bucketSize(b));
+  int ls = size();
+  // look for an existing slot
+  for (int i = 0; i < ls; ++i) {
+    if (b != m_bucket[i])
+      continue;
+    if (m_used[i].v)
+      continue;
+    bool exp = false;
+    if (m_used[i].v.compare_exchange_strong(exp, true)) {
+      // verify whether the garbage collector operated in the meantime
+      if (nullptr == m_slots[i]) {
+        assert(m_bucket[i] < 0);
+        m_used[i].v = false;
+        continue;
+      }
+#ifdef MEMORY_POOL_DEBUG
+      m_last[i] = 0;
+#endif
+      return i;
+    }
+  }
+
+  // try to create in an existing slot (if garbage has been collected)
+  ls = useOld(b);
+  if (ls >= 0)
+    return ls;
+
+  // try to allocate a new slot
+  if (m_size >= m_maxSlots)
+    return -1;
+  ls = m_size++;
+  if (ls >= m_maxSlots)
+    return -1;
+#ifdef MEMORY_POOL_DEBUG
+  m_last[ls] = 2;
+#endif
+  return createAt(ls, b);
+}
+
+int SimplePoolAllocator::createAt(int ls, int b) {
+  assert(m_used[ls].v);
+#ifdef MEMORY_POOL_DEBUG
+  assert(m_last[ls] > 0);
+#endif
+  m_bucket[ls] = b;
+  auto as = poolDetails::bucketSize(b);
+  assert(nullptr == m_slots[ls]);
+  m_slots[ls] = doAlloc(as);
+  if (nullptr == m_slots[ls]) {
+    m_bucket[ls] = -1;
+    m_used[ls].v = false;
+    return -1;
+  }
+  totBytes += as;
+  nAlloc++;
+  return ls;
+}
+
+void SimplePoolAllocator::garbageCollect() {
+  int ls = size();
+  for (int i = 0; i < ls; ++i) {
+    if (m_used[i].v)
+      continue;
+    if (m_bucket[i] < 0)
+      continue;
+    bool exp = false;
+    if (!m_used[i].v.compare_exchange_strong(exp, true))
+      continue;
+    assert(m_used[i].v);
+    if (nullptr != m_slots[i]) {
+      assert(m_bucket[i] >= 0);
+      doFree(m_slots[i]);
+      nFree++;
+      totBytes -= poolDetails::bucketSize(m_bucket[i]);
+    }
+    m_slots[i] = nullptr;
+    m_bucket[i] = -1;
+#ifdef MEMORY_POOL_DEBUG
+    m_last[i] = -3;
+#endif
+    m_used[i].v = false;  // this store acts as a memory fence as well
+  }
+}
+
+int SimplePoolAllocator::useOld(int b) {
+  int ls = size();
+  for (int i = 0; i < ls; ++i) {
+    if (m_bucket[i] >= 0)
+      continue;
+    if (m_used[i].v)
+      continue;
+    bool exp = false;
+    if (!m_used[i].v.compare_exchange_strong(exp, true))
+      continue;
+    if (nullptr != m_slots[i]) {  // oops: allocated and freed in the meantime
+      assert(m_bucket[i] >= 0);
+#ifdef MEMORY_POOL_DEBUG
+      assert(m_last[i] == -1);
+#endif
+      m_used[i].v = false;
+      continue;
+    }
+    assert(m_used[i].v);
+#ifdef MEMORY_POOL_DEBUG
+    m_last[i] = 1;
+#endif
+    return createAt(i, b);
+  }
+  return -1;
+}
+
+void SimplePoolAllocator::dumpStat() const {
+  uint64_t fn = 0;
+  uint64_t fs = 0;
+  int ls = size();
+  for (int i = 0; i < ls; ++i) {
+    if (m_used[i].v) {
+      auto b = m_bucket[i];
+      if (b < 0)
+        continue;
+      fn++;
+      fs += (1LL << b);
+    }
+  }
+  std::cout << "# slots " << size() << '\n'
+            << "# bytes " << totBytes << '\n'
+            << "# alloc " << nAlloc << '\n'
+            << "# free " << nFree << '\n'
+            << "# used " << fn << ' ' << fs << '\n'
+            << std::endl;
+}
diff --git a/HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc b/HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc
new file mode 100644
index 0000000000000..6ecfa47865c3e
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/src/cudaMemoryPool.cc
@@ -0,0 +1,148 @@
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h"
+
+#include
+#include
+#include
+
+#include
+
+namespace {
+
+  // free callback
+  void CUDART_CB freeCallback(cudaStream_t streamId, cudaError_t status, void *p) {
+    //void CUDART_CB freeCallback(void *p) {
+    if (status != cudaSuccess) {
+      std::cout << "Error in free callback in stream " << streamId << std::endl;
+      auto error = cudaGetErrorName(status);
+      auto message = cudaGetErrorString(status);
+      std::cout << " error " << error << ": " << message << std::endl;
+    }
+    // std::cout << "free callback for stream " << streamId << std::endl;
+    auto payload = (memoryPool::Payload *)(p);
+    poolDetails::freeAsync(payload);
+  }
+
+}  // namespace
+
+struct CudaAlloc {
+  static void scheduleFree(memoryPool::Payload *payload, void *stream) {
+    // std::cout << "schedule free for stream " << stream <> cpuPool;
+
+  std::unique_ptr> hostPool;
+
+  using DevicePool = SimplePoolAllocatorImpl;
+  std::vector> devicePools;
+
+  void initDevicePools(int size) {
+    int devices = 0;
+    auto status = cudaGetDeviceCount(&devices);
+    if (status == cudaSuccess && devices > 0) {
+      devicePools.reserve(devices);
+      for (int i = 0; i < devices; ++i)
+        devicePools.emplace_back(new DevicePool(size));
+    }
+  }
+
+  DevicePool *getDevicePool() {
+    int dev = -1;
+    cudaGetDevice(&dev);
+    return devicePools[dev].get();
+  }
+
+}  // namespace
+
+namespace memoryPool {
+  namespace cuda {
+
+    void init(bool onlyCPU) {
+      constexpr int poolSize = 128 * 1024;
+      cpuPool = std::make_unique>(poolSize);
+      if (onlyCPU)
+        return;
+      initDevicePools(poolSize);
+      hostPool = std::make_unique>(poolSize);
+    }
+
+    void shutdown() {
+      cpuPool.reset();
+      devicePools.clear();
+      hostPool.reset();
+    }
+
+    void dumpStat() {
+      std::cout << "device pool" << std::endl;
+      getDevicePool()->dumpStat();
+      std::cout << "host pool" << std::endl;
+      hostPool->dumpStat();
+    }
+
+    SimplePoolAllocator *getPool(Where where) {
+      return onCPU == where ? (SimplePoolAllocator *)(cpuPool.get())
+                            : (onDevice == where ?
(SimplePoolAllocator *)(getDevicePool()) + : (SimplePoolAllocator *)(hostPool.get())); + } + + // allocate either on current device or on host (actually anywhere, not cuda specific) + std::pair alloc(uint64_t size, SimplePoolAllocator &pool) { + int i = pool.alloc(size); + void *p = pool.pointer(i); + return std::pair(p, i); + } + + // schedule free + void free(cudaStream_t stream, std::vector buckets, SimplePoolAllocator &pool) { + auto payload = new Payload{&pool, std::move(buckets)}; + pool.scheduleFree(payload, stream); + } + + } // namespace cuda +} // namespace memoryPool diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index 53d41efcf4236..9a01d343e5d53 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -129,4 +129,29 @@ + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/testPoolUI.cu b/HeterogeneousCore/CUDAUtilities/test/testPoolUI.cu new file mode 100644 index 0000000000000..417f42e606c42 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/testPoolUI.cu @@ -0,0 +1,101 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include + +template +struct dataProducer { + auto operator()(cudaStream_t stream) { return memoryPool::cuda::makeBuffer(20, stream, where); } +}; + +int main() { + { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess || 0 == devices) + return 0; + std::cout << "found " << devices << " cuda devices" << std::endl; + } + const int NUMTHREADS = 1; + + printf("Using CUDA %d\n", CUDART_VERSION); + int cuda_device = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, cuda_device); + printf("CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor); + + cudaStream_t streams[NUMTHREADS]; + + for (int i = 0; i < NUMTHREADS; i++) { + cudaStreamCreate(&(streams[i])); + } + + memoryPool::cuda::init(false); + memoryPool::cuda::dumpStat(); + + auto& stream = streams[0]; + + { + auto pd = memoryPool::cuda::makeBuffer(20, stream, memoryPool::onDevice); + auto ph = memoryPool::cuda::makeBuffer(20, stream, memoryPool::onHost); + auto pc = memoryPool::cuda::makeBuffer(20, nullptr, memoryPool::onCPU); + + auto dp = dataProducer()(stream); + + cudaCheck(memoryPool::cuda::copy(ph, pd, 20, stream)); + cudaCheck(memoryPool::cuda::copy(pd, ph, 20, stream)); + std::cout << "expect 2a 2u 1a 1u" << std::endl; + memoryPool::cuda::dumpStat(); + + { + auto ph = memoryPool::cuda::makeBuffer(20, stream, memoryPool::onHost); + cudaCheck(memoryPool::cuda::copy(pd, ph, 20, stream)); + } + cudaStreamSynchronize(stream); + std::cout << "expect 2a 2u 2a 1u " << std::endl; + memoryPool::cuda::dumpStat(); + { + auto ph = memoryPool::cuda::makeBuffer(20, stream, memoryPool::onHost); + cudaCheck(memoryPool::cuda::copy(pd, ph, 20, stream)); + } + std::cout << "expect 2a 2u 2a 1u " << std::endl; + cudaStreamSynchronize(stream); + memoryPool::cuda::dumpStat(); + } + std::cout << "expect 2a 0u 2a 0u " << std::endl; + cudaStreamSynchronize(stream); + memoryPool::cuda::dumpStat(); + + { + memoryPool::Deleter devDeleter(std::make_shared(stream, memoryPool::onDevice)); + memoryPool::Deleter hosDeleter(std::make_shared(stream, memoryPool::onHost)); + + auto p0 = memoryPool::cuda::makeBuffer(20, devDeleter); + auto p1 = memoryPool::cuda::makeBuffer(20, devDeleter); 
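+    // note: p0..p3 share the same BundleDelete deleter, so their buckets are
+    // collected and released together in a single scheduled free on the
+    // stream, once the last Buffer holding the shared deleter is destroyed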
+ auto p2 = memoryPool::cuda::makeBuffer(20, devDeleter); + auto p3 = memoryPool::cuda::makeBuffer(20, devDeleter); + + { + auto pd = memoryPool::cuda::makeBuffer(40, stream, memoryPool::onDevice); + p0.reset(pd.release()); + memoryPool::cuda::dumpStat(); + } + cudaStreamSynchronize(stream); + + auto hp0 = memoryPool::cuda::makeBuffer(40, hosDeleter); + auto hp1 = memoryPool::cuda::makeBuffer(20, hosDeleter); + auto hp2 = memoryPool::cuda::makeBuffer(20, hosDeleter); + auto hp3 = memoryPool::cuda::makeBuffer(20, hosDeleter); + + cudaCheck(memoryPool::cuda::copy(hp3, p3, 20, stream)); + cudaCheck(memoryPool::cuda::copy(p0, hp0, 40, stream)); + ; + + memoryPool::cuda::dumpStat(); + } + + cudaStreamSynchronize(stream); + memoryPool::cuda::dumpStat(); + memoryPool::cuda::shutdown(); + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/testPoolUImt.cu b/HeterogeneousCore/CUDAUtilities/test/testPoolUImt.cu new file mode 100644 index 0000000000000..e9b3e0b261a9b --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/testPoolUImt.cu @@ -0,0 +1,249 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" + +#include +#include + +#include +#include + +#include +#include +#include + +typedef std::thread Thread; +typedef std::vector ThreadGroup; +typedef std::mutex Mutex; +typedef std::lock_guard Lock; + +struct Node { + int it = -1; + int i = -1; + void *p = nullptr; +#ifdef __CUDACC__ + int c = 0; +#else + std::atomic c = 0; +#endif +}; + +#ifdef __CUDACC__ + +// generic callback +template +void CUDART_CB myCallback(void *fun) { + (*(F *)(fun))(); +} + +__global__ void kernel_set(int s, Node **p, int me) { + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = first; i < s; i += gridDim.x * blockDim.x) { + assert(p[i]); + auto n = p[i]; + n->it = me; + n->i = i; + n->p = p[i]; + n->c = 1; + } +} + +__global__ void kernel_test(int s, Node **p, int me) { + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = first; i < s; i += gridDim.x * blockDim.x) { + assert(p[i]); + auto n = p[i]; + atomicSub(&(n->c), 1); + assert(n->it == me); + assert(n->i == i); + assert(n->p == p[i]); + assert(0 == n->c); + } +} +#endif + +template +void go() { + auto start = std::chrono::high_resolution_clock::now(); + + const int NUMTHREADS = 24; + +#ifdef __CUDACC__ + printf("Using CUDA %d\n", CUDART_VERSION); + int cuda_device = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, cuda_device); + printf("CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor); + + cudaStream_t streams[NUMTHREADS]; + + for (int i = 0; i < NUMTHREADS; i++) { + cudaStreamCreate(&(streams[i])); + } + +#endif + + memoryPool::cuda::init(false); + + bool stop = false; + bool bin24 = false; + Thread monitor([&] { + int n = 10; + while (n--) { + sleep(5); + memoryPool::cuda::dumpStat(); + if (5 == n) + bin24 = true; + } + std::cout << "\nstop\n" << std::endl; + stop = true; + }); + + int s = 40; + { + std::cout << "try to allocate " << s << std::endl; + auto stream = streams[0]; + { + auto pd = memoryPool::cuda::makeBuffer(s, stream, where); + assert(pd.get()); + memoryPool::cuda::dumpStat(); + pd = memoryPool::cuda::makeBuffer(s, stream, where); + memoryPool::cuda::dumpStat(); + } + cudaStreamSynchronize(stream); + memoryPool::cuda::dumpStat(); + } + std::atomic nt = 0; + + auto test = [&] { + int const me = nt++; + auto delta = std::chrono::high_resolution_clock::now() - start; + + std::mt19937 eng(me + std::chrono::duration_cast(delta).count()); + 
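+    // stress pattern: each thread draws how many buffers to allocate per
+    // iteration (1..100) and a random bucket size (2^3..2^20, later up to
+    // 2^24, with an occasional 2 GB request) to exercise concurrent
+    // allocation, scheduled frees and the garbage collector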
std::uniform_int_distribution rgen1(1, 100); + std::uniform_int_distribution rgen20(3, 20); + std::uniform_int_distribution rgen24(3, 24); + std::cout << "first RN " << rgen1(eng) << " at " + << std::chrono::duration_cast(delta).count() << " in " << me << std::endl; + +#ifdef __CUDACC__ + Node **dp = nullptr; + Node **hp = nullptr; + cudaMalloc(&dp, 100 * sizeof(void *)); + assert(dp); + cudaMallocHost(&hp, 100 * sizeof(void *)); + assert(hp); +#endif + auto &stream = streams[me]; + + int iter = 0; + while (true) { + if (stop) + break; + iter++; + + memoryPool::Deleter devDeleter(std::make_shared(stream, where)); + auto n = rgen1(eng); + bool large = 0 == (iter % (128 + me)); + for (int k = 0; k < n; ++k) { + int b = bin24 ? rgen24(eng) : rgen20(eng); + // once in while let's allocate 2GB + if (large) { + b = 31; + large = false; + } + uint64_t s = 1LL << b; + assert(s > 0); + try { + auto p0 = memoryPool::cuda::makeBuffer(s / sizeof(Node) + sizeof(Node), devDeleter); + auto p = p0.get(); + if (nullptr == p) { + std::cout << "error not detected??? " << b << ' ' << std::endl; + memoryPool::cuda::dumpStat(); + } + assert(p); + hp[k] = p; + } catch (...) { + std::cout << "\n\n!!!Failed " << me << " at " << iter << std::endl; + cudaStreamSynchronize(stream); + memoryPool::cuda::dumpStat(); + return; + } + } +#ifdef __CUDACC__ + assert(n <= 100); + // do something??? + cudaMemcpyAsync(dp, hp, n * sizeof(void *), cudaMemcpyHostToDevice, stream); + kernel_set<<<1, 128, 0, stream>>>(n, dp, me); + kernel_test<<<1, 128, 0, stream>>>(n, dp, me); + + // better sync each "event" + // cudaStreamSynchronize(stream); +#else + // do something??? + for (int k = 0; k < n; ++k) { + auto p = hp[k]; + assert(p); + auto n = p; + n->it = me; + n->i = i; + n->p = p; + n->c = 1; + } + for (int k = 0; k < n; ++k) { + auto p = hp[k]; + assert(p); + auto n = p; + n->c--; + assert(n->it == me); + assert(n->i == i); + assert(n->p == p); + assert(0 == n->c); + } +#endif + } + cudaStreamSynchronize(stream); + }; + + ThreadGroup threads; + threads.reserve(NUMTHREADS); + + for (int i = 0; i < NUMTHREADS; ++i) { + threads.emplace_back(test); + } + + for (auto &t : threads) + t.join(); + + threads.clear(); + monitor.join(); + cudaDeviceSynchronize(); + std::cout << "\nfinished\n" << std::endl; + memoryPool::cuda::dumpStat(); + std::cout << "\nshutdown\n" << std::endl; + memoryPool::cuda::shutdown(); +} + +#ifdef __CUDACC__ +#include +#include + +#endif + +int main() { +#ifdef __CUDACC__ + { + int devices = 0; + auto status = cudaGetDeviceCount(&devices); + if (status != cudaSuccess || 0 == devices) + return 0; + std::cout << "found " << devices << " cuda devices" << std::endl; + } + + std::cout << "\ntesting cuda device" << std::endl; + go(); +#else + std::cout << "testing posix" << std::endl; + go(); +#endif + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cpp b/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cpp new file mode 100644 index 0000000000000..1e57ad7312056 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cpp @@ -0,0 +1,304 @@ +#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h" + +#include +#include + +#include +#include + +#include +#include +#include + +typedef std::thread Thread; +typedef std::vector ThreadGroup; +typedef std::mutex Mutex; +typedef std::lock_guard Lock; + +struct Node { + int it = -1; + int i = -1; + void *p = nullptr; +#ifdef __CUDACC__ + int c = 0; +#else + std::atomic c = 0; +#endif 
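+  // Node is the probe written into each allocated slot: the kernels record
+  // the writing thread and slot index, and c counts outstanding writers
+  // (a plain int on the device, a std::atomic when running on the host)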
+}; + +#ifdef __CUDACC__ + +// generic callback +template +void CUDART_CB myCallback(void *fun) { + (*(F *)(fun))(); +} + +__global__ void kernel_set(int s, void **p, int me) { + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = first; i < s; i += gridDim.x * blockDim.x) { + assert(p[i]); + auto n = (Node *)(p[i]); + n->it = me; + n->i = i; + n->p = p[i]; + n->c = 1; + } +} + +__global__ void kernel_test(int s, void **p, int me) { + int first = blockIdx.x * blockDim.x + threadIdx.x; + for (int i = first; i < s; i += gridDim.x * blockDim.x) { + assert(p + i); + auto n = (Node *)(p[i]); + atomicSub(&(n->c), 1); + assert(n->it == me); + assert(n->i == i); + assert(n->p == p[i]); + assert(0 == n->c); + } +} +#endif + +template +void go() { + auto start = std::chrono::high_resolution_clock::now(); + + const int NUMTHREADS = 24; + +#ifdef __CUDACC__ + printf("Using CUDA %d\n", CUDART_VERSION); + int cuda_device = 0; + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, cuda_device); + printf("CUDA Capable: SM %d.%d hardware\n", deviceProp.major, deviceProp.minor); + + cudaStream_t streams[NUMTHREADS]; + + for (int i = 0; i < NUMTHREADS; i++) { + cudaStreamCreate(&(streams[i])); + } + +#endif + + SimplePoolAllocatorImpl pool(128 * 1024); + assert(0 == pool.size()); + + bool stop = false; + bool bin24 = false; + Thread monitor([&] { + int n = 10; + while (n--) { + sleep(5); + pool.dumpStat(); + if (5 == n) + bin24 = true; + } + std::cout << "\nstop\n" << std::endl; + stop = true; + }); + + int s = 40; + + std::cout << "try to allocate " << s << std::endl; + + int i0 = pool.alloc(s); + assert(1 == pool.size()); + assert(i0 >= 0); + auto p0 = pool.pointer(i0); + assert(nullptr != p0); + + pool.free(i0); + assert(1 == pool.size()); + + int i1 = pool.alloc(s); + assert(1 == pool.size()); + assert(i1 == i0); + auto p1 = pool.pointer(i1); + assert(p1 == p0); + + std::atomic nt = 0; + + auto test = [&] { + int const me = nt++; + auto delta = std::chrono::high_resolution_clock::now() - start; + + std::mt19937 eng(me + std::chrono::duration_cast(delta).count()); + std::uniform_int_distribution rgen1(1, 100); + std::uniform_int_distribution rgen20(3, 20); + std::uniform_int_distribution rgen24(3, 24); + std::cout << "first RN " << rgen1(eng) << " at " + << std::chrono::duration_cast(delta).count() << " in " << me << std::endl; + +#ifdef __CUDACC__ + void **dp = nullptr; + void **hp = nullptr; + cudaMalloc(&dp, 100 * sizeof(void *)); + assert(dp); + cudaMallocHost(&hp, 100 * sizeof(void *)); + assert(hp); +#endif + + int iter = 0; + while (true) { + if (stop) + break; + iter++; + auto n = rgen1(eng); + int ind[n]; + bool large = 0 == (iter % (128 + me)); + for (auto &i : ind) { + int b = bin24 ? rgen24(eng) : rgen20(eng); + // once in while let's allocate 2GB + if (large) { + b = 31; + large = false; + } + uint64_t s = 1LL << b; + assert(s > 0); + i = pool.alloc(s + sizeof(Node)); + if (i < 0) { + std::cout << "\n\n!!!Failed " << me << " at " << iter << std::endl; + pool.dumpStat(); + return; + } + assert(i >= 0); + auto p = pool.pointer(i); + if (nullptr == p) { + std::cout << "error not detected??? " << b << ' ' << i << std::endl; + pool.dumpStat(); + } + assert(p); + } +#ifdef __CUDACC__ + assert(n <= 100); + auto &stream = streams[me]; + // do something??? 
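+      // copy the raw slot pointers to the device, let kernel_set/kernel_test
+      // write and verify a Node in each slot, and only then return the slots
+      // to the pool from a host callback, so they stay valid while the
+      // stream is still using them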
+ for (int k = 0; k < n; ++k) { + auto i = ind[k]; + hp[k] = pool.pointer(i); + } + cudaMemcpyAsync(dp, hp, n * sizeof(void *), cudaMemcpyHostToDevice, stream); + kernel_set<<<1, 128, 0, stream>>>(n, dp, me); + kernel_test<<<1, 128, 0, stream>>>(n, dp, me); + + // free + auto doFree = [&]() { + for (int k = 0; k < n; ++k) { + auto i = ind[k]; + pool.free(i); + } + }; + cudaLaunchHostFunc(stream, myCallback, &doFree); + + // better sync each "event" + cudaStreamSynchronize(stream); +#else + // do something??? + for (auto i : ind) { + auto p = pool.pointer(i); + assert(p); + auto n = (Node *)(p); + n->it = me; + n->i = i; + n->p = p; + n->c = 1; + } + for (auto i : ind) { + auto p = pool.pointer(i); + assert(p); + auto n = (Node *)(p); + n->c--; + assert(n->it == me); + assert(n->i == i); + assert(n->p == p); + assert(0 == n->c); + } + // free + for (auto i : ind) { + pool.free(i); + } +#endif + } + }; + + ThreadGroup threads; + threads.reserve(NUMTHREADS); + + for (int i = 0; i < NUMTHREADS; ++i) { + threads.emplace_back(test); + } + + for (auto &t : threads) + t.join(); + + threads.clear(); + monitor.join(); + std::cout << "\nfinished\n" << std::endl; + pool.dumpStat(); +} + +#ifdef __CUDACC__ +#include +#include + +// copy from implementation + +struct CudaAlloc { + // not called in this test (done inline) + static void scheduleFree(memoryPool::Payload *payload, void *stream) { + std::cout << "schedule free for stream " << stream << std::endl; + abort(); + } +}; + +struct CudaDeviceAlloc : public CudaAlloc { + using Pointer = void *; + + static Pointer alloc(size_t size) { + Pointer p = nullptr; + auto err = cudaMalloc(&p, size); + // std::cout << "alloc " << size << ((err == cudaSuccess) ? " ok" : " err") << std::endl; + return err == cudaSuccess ? p : nullptr; + } + static void free(Pointer ptr) { + auto err = cudaFree(ptr); + // std::cout << "free" << ((err == cudaSuccess) ? " ok" : " err") <(); +#else + std::cout << "testing posix" << std::endl; + go(); +#endif + + return 0; +} diff --git a/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cu b/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cu new file mode 100644 index 0000000000000..92511a3ec0f53 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/testSimplePoolAllocator.cu @@ -0,0 +1 @@ +#include "testSimplePoolAllocator.cpp" diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu index 135254fa6e9f2..2d6bf12e665df 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHitGPUKernel.cu @@ -13,6 +13,8 @@ #include "PixelRecHitGPUKernel.h" #include "gpuPixelRecHits.h" +// #define GPU_DEBUG + namespace { __global__ void setHitsLayerStart(uint32_t const* __restrict__ hitsModuleStart, pixelCPEforGPU::ParamsOnGPU const* cpeParams, @@ -42,10 +44,21 @@ namespace pixelgpudetails { cudaStream_t stream) const { auto nHits = clusters_d.nClusters(); - TrackingRecHit2DGPU hits_d( - nHits, isPhase2, clusters_d.offsetBPIX2(), cpeParams, clusters_d.clusModuleStart(), stream); + TrackingRecHit2DGPU hits_d(nHits, + isPhase2, + clusters_d.offsetBPIX2(), + cpeParams, + clusters_d.clusModuleStart(), + memoryPool::onDevice, + stream); + + assert(hits_d.view()); assert(hits_d.nMaxModules() == isPhase2 ? 
phase2PixelTopology::numberOfModules : phase1PixelTopology::numberOfModules); + cudaCheck(cudaGetLastError()); +#ifdef GPU_DEBUG + cudaCheck(cudaDeviceSynchronize()); +#endif int activeModulesWithDigis = digis_d.nModules(); // protect from empty events diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc index 7ff2da5552e6d..28c912f9da6c3 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitFromCUDA.cc @@ -47,8 +47,8 @@ class SiPixelRecHitFromCUDA : public edm::stream::EDProducer uint32_t nHits_; uint32_t nMaxModules_; - cms::cuda::host::unique_ptr store32_; - cms::cuda::host::unique_ptr hitsModuleStart_; + memoryPool::Buffer store32_; + memoryPool::Buffer hitsModuleStart_; }; SiPixelRecHitFromCUDA::SiPixelRecHitFromCUDA(const edm::ParameterSet& iConfig) @@ -84,7 +84,7 @@ void SiPixelRecHitFromCUDA::acquire(edm::Event const& iEvent, } void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& es) { - // allocate a buffer for the indices of the clusters + // allocate a Buffer for the indices of the clusters auto hmsp = std::make_unique(nMaxModules_ + 1); SiPixelRecHitCollection output; @@ -98,7 +98,7 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e output.reserve(nMaxModules_, nHits_); std::copy(hitsModuleStart_.get(), hitsModuleStart_.get() + nMaxModules_ + 1, hmsp.get()); - // wrap the buffer in a HostProduct, and move it to the Event, without reallocating the buffer or affecting hitsModuleStart + // wrap the Buffer in a HostProduct, and move it to the Event, without reallocating the Buffer or affecting hitsModuleStart iEvent.emplace(hostPutToken_, std::move(hmsp)); auto xl = store32_.get(); @@ -124,8 +124,8 @@ void SiPixelRecHitFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& e const PixelGeomDetUnit* pixDet = dynamic_cast(genericDet); assert(pixDet); SiPixelRecHitCollection::FastFiller recHitsOnDetUnit(output, detid); - auto fc = hitsModuleStart_[gind]; - auto lc = hitsModuleStart_[gind + 1]; + auto fc = hitsModuleStart_.get()[gind]; + auto lc = hitsModuleStart_.get()[gind + 1]; auto nhits = lc - fc; assert(lc > fc); diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc index fda418320e70a..71e775056d150 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromCUDA.cc @@ -45,9 +45,9 @@ class SiPixelRecHitSoAFromCUDA : public edm::stream::EDProducer store32_; - cms::cuda::host::unique_ptr store16_; - cms::cuda::host::unique_ptr hitsModuleStart_; + memoryPool::Buffer store32_; + memoryPool::Buffer store16_; + memoryPool::Buffer hitsModuleStart_; }; SiPixelRecHitSoAFromCUDA::SiPixelRecHitSoAFromCUDA(const edm::ParameterSet& iConfig) @@ -89,3 +89,25 @@ void SiPixelRecHitSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const } DEFINE_FWK_MODULE(SiPixelRecHitSoAFromCUDA); + +// #define FINAL_POOL_DUMP +#ifdef FINAL_POOL_DUMP + +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" + +#include "HeterogeneousCore/CUDAUtilities/interface/SimplePoolAllocator.h" + +namespace { + + struct FinalPoolDump { + ~FinalPoolDump() { + std::cout << "Final Pool Dump\n==== ==== ====\n\n Posix Pool" << std::endl; + 
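+      // dumps the CPU (posix) pool first, then the device and host pools
+      // via memoryPool::cuda::dumpStat()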
((SimplePoolAllocatorImpl*)memoryPool::cuda::getPool(memoryPool::onCPU))->dumpStat(); + memoryPool::cuda::dumpStat(); + } + }; + + FinalPoolDump dump; + +} // namespace +#endif diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc index d23ecec66fea0..9f9e067734ccc 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitSoAFromLegacy.cc @@ -157,7 +157,7 @@ void SiPixelRecHitSoAFromLegacy::produce(edm::StreamID streamID, edm::Event& iEv // element 96 is the start of BPIX2 (i.e. the number of clusters in BPIX1) auto output = std::make_unique( - numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, nullptr); + numberOfClusters, isPhase2_, hitsModuleStart[startBPIX2], &cpeView, hitsModuleStart, memoryPool::onCPU, nullptr); assert(output->nMaxModules() == uint32_t(nMaxModules)); if (0 == numberOfClusters) { diff --git a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py index c5f8c0949dd08..21202f3d21b65 100644 --- a/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py +++ b/RecoLocalTracker/SiPixelRecHits/python/SiPixelRecHits_cfi.py @@ -61,7 +61,7 @@ siPixelRecHitsPreSplittingSoA = SwitchProducerCUDA( cpu = cms.EDAlias( siPixelRecHitsPreSplittingCPU = cms.VPSet( - cms.PSet(type = cms.string("cmscudacompatCPUTraitsTrackingRecHit2DHeterogeneous")), + cms.PSet(type = cms.string("TrackingRecHit2DHeterogeneous")), cms.PSet(type = cms.string("uintAsHostProduct")) )), ) diff --git a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc index edbcd44eeeef5..473263b5add9d 100644 --- a/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelTrackFitting/plugins/PixelTrackSoAFromCUDA.cc @@ -1,7 +1,6 @@ #include #include "CUDADataFormats/Common/interface/Product.h" -#include "CUDADataFormats/Common/interface/HostProduct.h" #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "DataFormats/Common/interface/Handle.h" #include "FWCore/Framework/interface/Event.h" @@ -15,6 +14,7 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" // Switch on to enable checks and printout for found tracks // #define PIXEL_DEBUG_PRODUCE @@ -35,7 +35,7 @@ class PixelTrackSoAFromCUDA : public edm::stream::EDProducer edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr soa_; + PixelTrackHeterogeneous soa_; }; PixelTrackSoAFromCUDA::PixelTrackSoAFromCUDA(const edm::ParameterSet& iConfig) @@ -56,7 +56,8 @@ void PixelTrackSoAFromCUDA::acquire(edm::Event const& iEvent, cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - soa_ = inputData.toHostAsync(ctx.stream()); + soa_ = memoryPool::cuda::makeBuffer(1, ctx.stream(), memoryPool::onHost); + memoryPool::cuda::copy(soa_, inputData, 1, ctx.stream()); } void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { @@ -79,9 +80,9 @@ void PixelTrackSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& i 
#endif // DO NOT make a copy (actually TWO....) - iEvent.emplace(tokenSOA_, PixelTrackHeterogeneous(std::move(soa_))); + iEvent.emplace(tokenSOA_, std::move(soa_)); - assert(!soa_); + assert(!soa_.get()); } DEFINE_FWK_MODULE(PixelTrackSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu index d99a96b705451..e7d20fb0d8158 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/BrokenLineFitOnGPU.cu @@ -1,5 +1,6 @@ + #include "BrokenLineFitOnGPU.h" -#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, uint32_t hitsInFit, @@ -11,13 +12,15 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, auto numberOfBlocks = (maxNumberOfConcurrentFits_ + blockSize - 1) / blockSize; // Fit internals - auto tkidGPU = cms::cuda::make_device_unique(maxNumberOfConcurrentFits_, stream); - auto hitsGPU = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), stream); - auto hits_geGPU = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float), stream); - auto fast_fit_resultsGPU = cms::cuda::make_device_unique( - maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), stream); + memoryPool::Deleter deleter = + memoryPool::Deleter(std::make_shared(stream, memoryPool::onDevice)); + auto tkidGPU = memoryPool::cuda::makeBuffer(maxNumberOfConcurrentFits_, deleter); + auto hitsGPU = memoryPool::cuda::makeBuffer( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix3xNd<6>) / sizeof(double), deleter); + auto hits_geGPU = memoryPool::cuda::makeBuffer( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Matrix6xNf<6>) / sizeof(float), deleter); + auto fast_fit_resultsGPU = memoryPool::cuda::makeBuffer( + maxNumberOfConcurrentFits_ * sizeof(riemannFit::Vector4d) / sizeof(double), deleter); for (uint32_t offset = 0; offset < maxNumberOfTuples; offset += maxNumberOfConcurrentFits_) { // fit triplets @@ -127,6 +130,9 @@ void HelixFitOnGPU::launchBrokenLineKernels(HitsView const *hv, fast_fit_resultsGPU.get()); cudaCheck(cudaGetLastError()); } - +#ifdef GPU_DEBUG + cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); +#endif } // loop on concurrent fits } diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc index 66208debdc98d..d48af27a329be 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cc @@ -22,18 +22,16 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // use "nhits" to heuristically dimension the workspace - - // no need to use the Traits allocations, since we know this is being compiled for the CPU - //device_isOuterHitOfCell_ = Traits::template make_unique(std::max(1U, nhits), stream); - device_isOuterHitOfCell_ = std::make_unique(std::max(1U, nhits)); + memoryPool::Deleter const &deleter = device_storage_.deleter(); + device_isOuterHitOfCell_ = + memoryPool::cuda::makeBuffer(std::max(1U, nhits), deleter); assert(device_isOuterHitOfCell_.get()); isOuterHitOfCell_ = 
GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; auto cellStorageSize = caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks); // no need to use the Traits allocations, since we know this is being compiled for the CPU - //cellStorage_ = Traits::template make_unique(cellStorageSize, stream); - cellStorage_ = std::make_unique(cellStorageSize); + cellStorage_ = memoryPool::cuda::makeBuffer(cellStorageSize, deleter); device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors)); @@ -45,9 +43,7 @@ void CAHitNtupletGeneratorKernelsCPU::buildDoublets(HitsOnCPU const &hh, cudaStr device_theCellTracks_.get(), device_theCellTracksContainer_); - // no need to use the Traits allocations, since we know this is being compiled for the CPU - //device_theCells_ = Traits::template make_unique(params_.maxNumberOfDoublets_, stream); - device_theCells_ = std::make_unique(params_.maxNumberOfDoublets_); + device_theCells_ = memoryPool::cuda::makeBuffer(params_.maxNumberOfDoublets_, deleter); if (0 == nhits) return; // protect against empty events diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu index 913b6d5a32d28..e29b15fc80e53 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.cu @@ -135,8 +135,6 @@ template <> void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStream_t stream) { int32_t nhits = hh.nHits(); - isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; - #ifdef NTUPLE_DEBUG std::cout << "building Doublets out of " << nhits << " Hits" << std::endl; #endif @@ -147,16 +145,17 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr #endif // in principle we can use "nhits" to heuristically dimension the workspace... 
- device_isOuterHitOfCell_ = cms::cuda::make_device_unique( - std::max(1, nhits - hh.offsetBPIX2()), stream); + memoryPool::Deleter const &deleter = device_storage_.deleter(); + device_isOuterHitOfCell_ = + memoryPool::cuda::makeBuffer(std::max(1, nhits), deleter); assert(device_isOuterHitOfCell_.get()); isOuterHitOfCell_ = GPUCACell::OuterHitOfCell{device_isOuterHitOfCell_.get(), hh.offsetBPIX2()}; - cellStorage_ = cms::cuda::make_device_unique( + cellStorage_ = memoryPool::cuda::makeBuffer( caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors) + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellTracks), - stream); + deleter); device_theCellNeighborsContainer_ = (GPUCACell::CellNeighbors *)cellStorage_.get(); device_theCellTracksContainer_ = (GPUCACell::CellTracks *)(cellStorage_.get() + caConstants::maxNumOfActiveDoublets * sizeof(GPUCACell::CellNeighbors)); @@ -174,7 +173,7 @@ void CAHitNtupletGeneratorKernelsGPU::buildDoublets(HitsOnCPU const &hh, cudaStr cudaCheck(cudaGetLastError()); } - device_theCells_ = cms::cuda::make_device_unique(params_.maxNumberOfDoublets_, stream); + device_theCells_ = memoryPool::cuda::makeBuffer(params_.maxNumberOfDoublets_, deleter); #ifdef GPU_DEBUG cudaDeviceSynchronize(); @@ -325,6 +324,9 @@ void CAHitNtupletGeneratorKernelsGPU::classifyTuples(HitsOnCPU const &hh, TkSoA cudaCheck(cudaGetLastError()); } #ifdef GPU_DEBUG + //std::cout << "sync stream " << cudaStream << std::endl; + //cudaStreamSynchronize(cudaStream); + //std::cout << "sync stream done " << cudaStream << std::endl; cudaDeviceSynchronize(); cudaCheck(cudaGetLastError()); #endif diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h index 8af1176fe92c6..6c1abc5bb5b10 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernels.h @@ -6,6 +6,8 @@ #include "CUDADataFormats/Track/interface/PixelTrackHeterogeneous.h" #include "GPUCACell.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" + // #define DUMP_GPU_TK_TUPLES namespace cAHitNtupletGenerator { @@ -153,21 +155,21 @@ namespace cAHitNtupletGenerator { } // namespace cAHitNtupletGenerator -template +template class CAHitNtupletGeneratorKernels { public: - using Traits = TTraits; + static constexpr memoryPool::Where where = w; using QualityCuts = cAHitNtupletGenerator::QualityCuts; using Params = cAHitNtupletGenerator::Params; using Counters = cAHitNtupletGenerator::Counters; template - using unique_ptr = typename Traits::template unique_ptr; + using Buffer = memoryPool::Buffer; using HitsView = TrackingRecHit2DSOAView; using HitsOnGPU = TrackingRecHit2DSOAView; - using HitsOnCPU = TrackingRecHit2DHeterogeneous; + using HitsOnCPU = TrackingRecHit2DHeterogeneous; using HitToTuple = caConstants::HitToTuple; using TupleMultiplicity = caConstants::TupleMultiplicity; @@ -197,28 +199,28 @@ class CAHitNtupletGeneratorKernels { Counters* counters_ = nullptr; // workspace - unique_ptr cellStorage_; - unique_ptr device_theCellNeighbors_; + Buffer cellStorage_; + Buffer device_theCellNeighbors_; caConstants::CellNeighbors* device_theCellNeighborsContainer_; - unique_ptr device_theCellTracks_; + Buffer device_theCellTracks_; caConstants::CellTracks* device_theCellTracksContainer_; - unique_ptr device_theCells_; - unique_ptr device_isOuterHitOfCell_; + Buffer device_theCells_; + Buffer 
device_isOuterHitOfCell_; GPUCACell::OuterHitOfCell isOuterHitOfCell_; uint32_t* device_nCells_ = nullptr; - unique_ptr device_hitToTuple_; - unique_ptr device_hitToTupleStorage_; + Buffer device_hitToTuple_; + Buffer device_hitToTupleStorage_; HitToTuple::View hitToTupleView_; cms::cuda::AtomicPairCounter* device_hitToTuple_apc_ = nullptr; cms::cuda::AtomicPairCounter* device_hitTuple_apc_ = nullptr; - unique_ptr device_tupleMultiplicity_; + Buffer device_tupleMultiplicity_; - unique_ptr device_storage_; + Buffer device_storage_; // params Params const& params_; /// Intermediate result avoiding repeated computations. @@ -236,7 +238,7 @@ class CAHitNtupletGeneratorKernels { } }; -using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; -using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsGPU = CAHitNtupletGeneratorKernels; +using CAHitNtupletGeneratorKernelsCPU = CAHitNtupletGeneratorKernels; #endif // RecoPixelVertexing_PixelTriplets_plugins_CAHitNtupletGeneratorKernels_h diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc index 5978ef8851c73..cec96faab2c86 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorKernelsAlloc.cc @@ -12,31 +12,34 @@ void CAHitNtupletGeneratorKernelsCPU::allocateOnGPU(int32_t nHits, cudaStream_t // ALLOCATIONS FOR THE INTERMEDIATE RESULTS (STAYS ON WORKER) ////////////////////////////////////////////////////////// - device_theCellNeighbors_ = Traits::template make_unique(stream); - device_theCellTracks_ = Traits::template make_unique(stream); + memoryPool::Deleter deleter = memoryPool::Deleter(std::make_shared(stream, where)); + + device_theCellNeighbors_ = memoryPool::cuda::makeBuffer(1, deleter); + device_theCellTracks_ = memoryPool::cuda::makeBuffer(1, deleter); #ifdef GPU_DEBUG - std::cout << "Allocation for tuple building. N hits " << nHits << std::endl; + std::cout << "Allocation for tuple building. N hits " << nHits + << ((where == memoryPool::onDevice) ? 
" on GPU" : " on CPU") << std::endl; #endif nHits++; // storage requires one more counter; assert(nHits > 0); - device_hitToTuple_ = Traits::template make_unique(stream); - device_hitToTupleStorage_ = Traits::template make_unique(nHits, stream); + device_hitToTuple_ = memoryPool::cuda::makeBuffer(1, deleter); + device_hitToTupleStorage_ = memoryPool::cuda::makeBuffer(nHits, deleter); hitToTupleView_.assoc = device_hitToTuple_.get(); hitToTupleView_.offStorage = device_hitToTupleStorage_.get(); hitToTupleView_.offSize = nHits; - device_tupleMultiplicity_ = Traits::template make_unique(stream); + device_tupleMultiplicity_ = memoryPool::cuda::makeBuffer(1, deleter); - device_storage_ = Traits::template make_unique(3, stream); + device_storage_ = memoryPool::cuda::makeBuffer(3, deleter); device_hitTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get(); device_hitToTuple_apc_ = (cms::cuda::AtomicPairCounter*)device_storage_.get() + 1; device_nCells_ = (uint32_t*)(device_storage_.get() + 2); // FIXME: consider collapsing these 3 in one adhoc kernel - if constexpr (std::is_same::value) { + if constexpr (where == memoryPool::onDevice) { cudaCheck(cudaMemsetAsync(device_nCells_, 0, sizeof(uint32_t), stream)); } else { *device_nCells_ = 0; diff --git a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc index a554d364ee42b..2d61797b47fcf 100644 --- a/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc +++ b/RecoPixelVertexing/PixelTriplets/plugins/CAHitNtupletGeneratorOnGPU.cc @@ -185,7 +185,8 @@ void CAHitNtupletGeneratorOnGPU::fillDescriptions(edm::ParameterSetDescription& PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecHit2DGPU const& hits_d, float bfield, cudaStream_t stream) const { - PixelTrackHeterogeneous tracks(cms::cuda::make_device_unique(stream)); + auto where = memoryPool::onDevice; + PixelTrackHeterogeneous tracks = memoryPool::cuda::makeBuffer(1, stream, where); auto* soa = tracks.get(); assert(soa); @@ -216,7 +217,8 @@ PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuplesAsync(TrackingRecH } PixelTrackHeterogeneous CAHitNtupletGeneratorOnGPU::makeTuples(TrackingRecHit2DCPU const& hits_d, float bfield) const { - PixelTrackHeterogeneous tracks(std::make_unique()); + auto where = memoryPool::onCPU; + PixelTrackHeterogeneous tracks = memoryPool::cuda::makeBuffer(1, nullptr, where); auto* soa = tracks.get(); assert(soa); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc index dc125878b1058..205c84bbf876a 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/PixelVertexSoAFromCUDA.cc @@ -16,6 +16,7 @@ #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" class PixelVertexSoAFromCUDA : public edm::stream::EDProducer { public: @@ -33,7 +34,7 @@ class PixelVertexSoAFromCUDA : public edm::stream::EDProducer edm::EDGetTokenT> tokenCUDA_; edm::EDPutTokenT tokenSOA_; - cms::cuda::host::unique_ptr m_soa; + ZVertexHeterogeneous soa_; }; PixelVertexSoAFromCUDA::PixelVertexSoAFromCUDA(const edm::ParameterSet& iConfig) @@ -54,12 +55,13 @@ void 
PixelVertexSoAFromCUDA::acquire(edm::Event const& iEvent, cms::cuda::ScopedContextAcquire ctx{inputDataWrapped, std::move(waitingTaskHolder)}; auto const& inputData = ctx.get(inputDataWrapped); - m_soa = inputData.toHostAsync(ctx.stream()); + soa_ = memoryPool::cuda::makeBuffer(1, ctx.stream(), memoryPool::onHost); + memoryPool::cuda::copy(soa_, inputData, 1, ctx.stream()); } void PixelVertexSoAFromCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // No copies.... - iEvent.emplace(tokenSOA_, ZVertexHeterogeneous(std::move(m_soa))); + iEvent.emplace(tokenSOA_, std::move(soa_)); } DEFINE_FWK_MODULE(PixelVertexSoAFromCUDA); diff --git a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc index 20b007d2d029f..f1c652b68c61e 100644 --- a/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc +++ b/RecoPixelVertexing/PixelVertexFinding/plugins/gpuVertexFinder.cc @@ -1,4 +1,5 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" #include "gpuClusterTracksByDensity.h" #include "gpuClusterTracksDBSCAN.h" @@ -96,26 +97,28 @@ namespace gpuVertexFinder { #ifdef __CUDACC__ ZVertexHeterogeneous Producer::makeAsync(cudaStream_t stream, TkSoA const* tksoa, float ptMin, float ptMax) const { -#ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "producing Vertices on GPU" << std::endl; -#endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(cms::cuda::make_device_unique(stream)); + auto where = memoryPool::onDevice; #else ZVertexHeterogeneous Producer::make(TkSoA const* tksoa, float ptMin, float ptMax) const { + cudaStream_t stream = nullptr; + auto where = memoryPool::onCPU; +#endif #ifdef PIXVERTEX_DEBUG_PRODUCE - std::cout << "producing Vertices on CPU" << std::endl; -#endif // PIXVERTEX_DEBUG_PRODUCE - ZVertexHeterogeneous vertices(std::make_unique()); +#ifdef __CUDACC__ + auto whereName = "GPU"; +#else + auto whereName = "CPU"; #endif + std::cout << "producing Vertices on " << whereName << std::endl; +#endif // PIXVERTEX_DEBUG_PRODUCE + + ZVertexHeterogeneous vertices = memoryPool::cuda::makeBuffer(1, stream, where); + assert(tksoa); auto* soa = vertices.get(); assert(soa); -#ifdef __CUDACC__ - auto ws_d = cms::cuda::make_device_unique(stream); -#else - auto ws_d = std::make_unique(); -#endif + auto ws_d = memoryPool::cuda::makeBuffer(1, stream, where); #ifdef __CUDACC__ init<<<1, 1, 0, stream>>>(soa, ws_d.get()); diff --git a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h index 5f8a0646c726a..b3a2886e2447f 100644 --- a/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h +++ b/RecoPixelVertexing/PixelVertexFinding/test/VertexFinder_t.h @@ -7,6 +7,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/requireDevices.h" #include "HeterogeneousCore/CUDAUtilities/interface/launch.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaMemoryPool.h" #ifdef USE_DBSCAN #include "RecoPixelVertexing/PixelVertexFinding/plugins/gpuClusterTracksDBSCAN.h" #define CLUSTERIZE gpuVertexFinder::clusterTracksDBSCAN @@ -114,235 +115,248 @@ __global__ void print(gpuVertexFinder::ZVertices const* pdata, gpuVertexFinder:: int main() { #ifdef __CUDACC__ cms::cudatest::requireDevices(); - - auto onGPU_d = cms::cuda::make_device_unique(1, nullptr); - auto ws_d = 
cms::cuda::make_device_unique(1, nullptr); + auto where = memoryPool::onDevice; #else - auto onGPU_d = std::make_unique(); - auto ws_d = std::make_unique(); + auto where = memoryPool::onCPU; #endif - Event ev; + memoryPool::cuda::init(false); + + { + cudaStream_t stream = nullptr; // here works as nothing is supposed to be deleted before synch... + + auto onGPU_d = memoryPool::cuda::makeBuffer(1, stream, where); + auto ws_d = memoryPool::cuda::makeBuffer(1, stream, where); + + Event ev; - float eps = 0.1f; - std::array par{{eps, 0.01f, 9.0f}}; - for (int nav = 30; nav < 80; nav += 20) { - ClusterGenerator gen(nav, 10); + float eps = 0.1f; + std::array par{{eps, 0.01f, 9.0f}}; + for (int nav = 30; nav < 80; nav += 20) { + ClusterGenerator gen(nav, 10); - for (int i = 8; i < 20; ++i) { - auto kk = i / 4; // M param + for (int i = 8; i < 20; ++i) { + auto kk = i / 4; // M param - gen(ev); + gen(ev); #ifdef __CUDACC__ - init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + init<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); #else - onGPU_d->init(); - ws_d->init(); + onGPU_d->init(); + ws_d->init(); #endif - std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; - auto nt = ev.ztrack.size(); + std::cout << "v,t size " << ev.zvert.size() << ' ' << ev.ztrack.size() << std::endl; + auto nt = ev.ztrack.size(); #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t), cudaMemcpyHostToDevice)); + cudaCheck(cudaMemcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); + cudaCheck( + cudaMemcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size(), cudaMemcpyHostToDevice)); #else - ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); - ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); - ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); - ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(LOC_WS(ntrks), &nt, sizeof(uint32_t)); + ::memcpy(LOC_WS(zt), ev.ztrack.data(), sizeof(float) * ev.ztrack.size()); + ::memcpy(LOC_WS(ezt2), ev.eztrack.data(), sizeof(float) * ev.eztrack.size()); + ::memcpy(LOC_WS(ptt2), ev.pttrack.data(), sizeof(float) * ev.eztrack.size()); #endif - std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; + std::cout << "M eps, pset " << kk << ' ' << eps << ' ' << (i % 4) << std::endl; - if ((i % 4) == 0) - par = {{eps, 0.02f, 12.0f}}; - if ((i % 4) == 1) - par = {{eps, 0.02f, 9.0f}}; - if ((i % 4) == 2) - par = {{eps, 0.01f, 9.0f}}; - if ((i % 4) == 3) - par = {{0.7f * eps, 0.01f, 9.0f}}; + if ((i % 4) == 0) + par = {{eps, 0.02f, 12.0f}}; + if ((i % 4) == 1) + par = {{eps, 0.02f, 9.0f}}; + if ((i % 4) == 2) + par = {{eps, 0.01f, 9.0f}}; + if ((i % 4) == 3) + par = {{0.7f * eps, 0.01f, 9.0f}}; - uint32_t nv = 0; + uint32_t nv = 0; #ifdef __CUDACC__ - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); - cudaCheck(cudaGetLastError()); - 
cudaDeviceSynchronize(); + print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); #ifdef ONE_KERNEL - cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(vertexFinderOneKernel, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); #else - cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + cms::cuda::launch(CLUSTERIZE, {1, 512 + 256}, onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); #endif - print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); + print<<<1, 1, 0, 0>>>(onGPU_d.get(), ws_d.get()); - cudaCheck(cudaGetLastError()); - cudaDeviceSynchronize(); + cudaCheck(cudaGetLastError()); + cudaDeviceSynchronize(); - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); - cudaCheck(cudaGetLastError()); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudaCheck(cudaGetLastError()); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); #else - print(onGPU_d.get(), ws_d.get()); - CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); - print(onGPU_d.get(), ws_d.get()); - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; + print(onGPU_d.get(), ws_d.get()); + CLUSTERIZE(onGPU_d.get(), ws_d.get(), kk, par[0], par[1], par[2]); + print(onGPU_d.get(), ws_d.get()); + fitVertices(onGPU_d.get(), ws_d.get(), 50.f); + nv = onGPU_d->nvFinal; #endif - if (nv == 0) { - std::cout << "NO VERTICES???" << std::endl; - continue; - } + if (nv == 0) { + std::cout << "NO VERTICES???" << std::endl; + continue; + } - float* zv = nullptr; - float* wv = nullptr; - float* ptv2 = nullptr; - int32_t* nn = nullptr; - uint16_t* ind = nullptr; + float* zv = nullptr; + float* wv = nullptr; + float* ptv2 = nullptr; + int32_t* nn = nullptr; + uint16_t* ind = nullptr; - // keep chi2 separated... - float chi2[2 * nv]; // make space for splitting... + // keep chi2 separated... + float chi2[2 * nv]; // make space for splitting... 
#ifdef __CUDACC__ - float hzv[2 * nv]; - float hwv[2 * nv]; - float hptv2[2 * nv]; - int32_t hnn[2 * nv]; - uint16_t hind[2 * nv]; - - zv = hzv; - wv = hwv; - ptv2 = hptv2; - nn = hnn; - ind = hind; + float hzv[2 * nv]; + float hwv[2 * nv]; + float hptv2[2 * nv]; + int32_t hnn[2 * nv]; + uint16_t hind[2 * nv]; + + zv = hzv; + wv = hwv; + ptv2 = hptv2; + nn = hnn; + ind = hind; #else - zv = onGPU_d->zv; - wv = onGPU_d->wv; - ptv2 = onGPU_d->ptv2; - nn = onGPU_d->ndof; - ind = onGPU_d->sortInd; + zv = onGPU_d->zv; + wv = onGPU_d->wv; + ptv2 = onGPU_d->ptv2; + nn = onGPU_d->ndof; + ind = onGPU_d->sortInd; #endif #ifdef __CUDACC__ - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif - for (auto j = 0U; j < nv; ++j) - if (nn[j] > 0) - chi2[j] /= float(nn[j]); - { - auto mx = std::minmax_element(chi2, chi2 + nv); - std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; - } + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "after fit nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } #ifdef __CUDACC__ - cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); - cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); - cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); + cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 50.f); + cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost)); + cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost)); #else - fitVertices(onGPU_d.get(), ws_d.get(), 50.f); - nv = onGPU_d->nvFinal; - memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); + fitVertices(onGPU_d.get(), ws_d.get(), 50.f); + nv = onGPU_d->nvFinal; + memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float)); #endif - for (auto j = 0U; j < nv; ++j) - if (nn[j] > 0) - chi2[j] /= float(nn[j]); - { - auto mx = std::minmax_element(chi2, chi2 + nv); - std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; - } + for (auto j = 0U; j < nv; ++j) + if (nn[j] > 0) + chi2[j] /= float(nn[j]); + { + auto mx = std::minmax_element(chi2, chi2 + nv); + std::cout << "before splitting nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl; + } #ifdef __CUDACC__ - // one vertex per block!!! - cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f); - cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost)); + // one vertex per block!!! 
+        cms::cuda::launch(gpuVertexFinder::splitVerticesKernel, {1024, 64}, onGPU_d.get(), ws_d.get(), 9.f);
+        cudaCheck(cudaMemcpy(&nv, LOC_WS(nvIntermediate), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
-      nv = ws_d->nvIntermediate;
+        splitVertices(onGPU_d.get(), ws_d.get(), 9.f);
+        nv = ws_d->nvIntermediate;
 #endif

-      std::cout << "after split " << nv << std::endl;
+        std::cout << "after split " << nv << std::endl;

 #ifdef __CUDACC__
-      cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
-      cudaCheck(cudaGetLastError());
+        cms::cuda::launch(gpuVertexFinder::fitVerticesKernel, {1, 1024 - 256}, onGPU_d.get(), ws_d.get(), 5000.f);
+        cudaCheck(cudaGetLastError());

-      cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
-      cudaCheck(cudaGetLastError());
-      cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
+        cms::cuda::launch(gpuVertexFinder::sortByPt2Kernel, {1, 256}, onGPU_d.get(), ws_d.get());
+        cudaCheck(cudaGetLastError());
+        cudaCheck(cudaMemcpy(&nv, LOC_ONGPU(nvFinal), sizeof(uint32_t), cudaMemcpyDeviceToHost));
 #else
-      fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
-      sortByPt2(onGPU_d.get(), ws_d.get());
-      nv = onGPU_d->nvFinal;
-      memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
+        fitVertices(onGPU_d.get(), ws_d.get(), 5000.f);
+        sortByPt2(onGPU_d.get(), ws_d.get());
+        nv = onGPU_d->nvFinal;
+        memcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float));
 #endif

-      if (nv == 0) {
-        std::cout << "NO VERTICES???" << std::endl;
-        continue;
-      }
+        if (nv == 0) {
+          std::cout << "NO VERTICES???" << std::endl;
+          continue;
+        }

 #ifdef __CUDACC__
-      cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
-      cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(zv, LOC_ONGPU(zv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(wv, LOC_ONGPU(wv), nv * sizeof(float), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(chi2, LOC_ONGPU(chi2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(ptv2, LOC_ONGPU(ptv2), nv * sizeof(float), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(nn, LOC_ONGPU(ndof), nv * sizeof(int32_t), cudaMemcpyDeviceToHost));
+        cudaCheck(cudaMemcpy(ind, LOC_ONGPU(sortInd), nv * sizeof(uint16_t), cudaMemcpyDeviceToHost));
 #endif

-      for (auto j = 0U; j < nv; ++j)
-        if (nn[j] > 0)
-          chi2[j] /= float(nn[j]);
-      {
-        auto mx = std::minmax_element(chi2, chi2 + nv);
-        std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
-      }
+        for (auto j = 0U; j < nv; ++j)
+          if (nn[j] > 0)
+            chi2[j] /= float(nn[j]);
+        {
+          auto mx = std::minmax_element(chi2, chi2 + nv);
+          std::cout << "nv, min max chi2 " << nv << " " << *mx.first << ' ' << *mx.second << std::endl;
+        }

-      {
-        auto mx = std::minmax_element(wv, wv + nv);
-        std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) << std::endl;
-      }
+        {
+          auto mx = std::minmax_element(wv, wv + nv);
+          std::cout << "min max error " << 1. / std::sqrt(*mx.first) << ' ' << 1. / std::sqrt(*mx.second) << std::endl;
+        }

-      {
-        auto mx = std::minmax_element(ptv2, ptv2 + nv);
-        std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl;
-        std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' '
-                  << ind[nv - 1] << std::endl;
-      }
+        {
+          auto mx = std::minmax_element(ptv2, ptv2 + nv);
+          std::cout << "min max ptv2 " << *mx.first << ' ' << *mx.second << std::endl;
+          std::cout << "min max ptv2 " << ptv2[ind[0]] << ' ' << ptv2[ind[nv - 1]] << " at " << ind[0] << ' '
+                    << ind[nv - 1] << std::endl;
+        }

-      float dd[nv];
-      for (auto kv = 0U; kv < nv; ++kv) {
-        auto zr = zv[kv];
-        auto md = 500.0f;
-        for (auto zs : ev.ztrack) {
-          auto d = std::abs(zr - zs);
-          md = std::min(d, md);
+        float dd[nv];
+        for (auto kv = 0U; kv < nv; ++kv) {
+          auto zr = zv[kv];
+          auto md = 500.0f;
+          for (auto zs : ev.ztrack) {
+            auto d = std::abs(zr - zs);
+            md = std::min(d, md);
+          }
+          dd[kv] = md;
         }
-        dd[kv] = md;
-      }
-      if (i == 6) {
+        if (i == 6) {
+          for (auto d : dd)
+            std::cout << d << ' ';
+          std::cout << std::endl;
+        }
+        auto mx = std::minmax_element(dd, dd + nv);
+        float rms = 0;
         for (auto d : dd)
-          std::cout << d << ' ';
-        std::cout << std::endl;
-      }
-      auto mx = std::minmax_element(dd, dd + nv);
-      float rms = 0;
-      for (auto d : dd)
-        rms += d * d;
-      rms = std::sqrt(rms) / (nv - 1);
-      std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
-
-    }  // loop on events
-  }    // lopp on ave vert
+          rms += d * d;
+        rms = std::sqrt(rms) / (nv - 1);
+        std::cout << "min max rms " << *mx.first << ' ' << *mx.second << ' ' << rms << std::endl;
+
+      }  // loop on events
+    }    // lopp on ave vert
+  }
+
+#ifdef __CUDACC__
+  memoryPool::cuda::dumpStat();
+#endif
+  memoryPool::cuda::shutdown();

   return 0;
 }
diff --git a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc
index 8b0de1c739076..3b0c42a78fe65 100644
--- a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc
+++ b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc
@@ -31,7 +31,7 @@ namespace {
     BeamSpotPOD* data() { return data_h_.get(); }
     BeamSpotPOD const* data() const { return data_h_.get(); }

-    cms::cuda::host::noncached::unique_ptr<BeamSpotPOD>& ptr() { return data_h_; }
+    auto& ptr() { return data_h_; }
     cms::cuda::host::noncached::unique_ptr<BeamSpotPOD> const& ptr() const { return data_h_; }

   private:
@@ -93,7 +93,7 @@ void BeamSpotToCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const e
   bsHost->betaStar = bs.betaStar();

   BeamSpotCUDA bsDevice(ctx.stream());
-  cms::cuda::copyAsync(bsDevice.ptr(), bsHost, ctx.stream());
+  cudaCheck(cudaMemcpyAsync(bsDevice.data(), bsHost.get(), sizeof(BeamSpotPOD), cudaMemcpyHostToDevice, ctx.stream()));

   ctx.emplace(iEvent, bsPutToken_, std::move(bsDevice));
 }
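For context on the final hunk: since BeamSpotCUDA now holds a memoryPool::Buffer instead of a cms::cuda::device::unique_ptr, the generic cms::cuda::copyAsync helper (written against the unique_ptr types) presumably no longer matches, so the producer issues the host-to-device copy directly with cudaMemcpyAsync. The following is a minimal, self-contained sketch of that copy pattern, not the CMSSW code itself: BeamSpotPOD is reduced to a stand-in struct, check() stands in for the cudaCheck macro, and the stream is created locally rather than taken from ctx.stream().

// Sketch of the raw async host-to-device copy pattern adopted by the patch.
// Assumptions: simplified BeamSpotPOD; check() replaces cudaCheck.
#include <cassert>
#include <cuda_runtime.h>

struct BeamSpotPOD {  // stand-in: the real POD carries the full beam-spot data
  float x, y, z;
};

static void check(cudaError_t err) { assert(err == cudaSuccess); }

int main() {
  cudaStream_t stream;
  check(cudaStreamCreate(&stream));

  // Pinned (page-locked) host memory, as the producer uses, lets the
  // cudaMemcpyAsync below actually overlap with other work on the stream.
  BeamSpotPOD* bsHost = nullptr;
  check(cudaMallocHost(&bsHost, sizeof(BeamSpotPOD)));
  *bsHost = {0.1f, -0.2f, 1.5f};  // would be filled from the event in CMSSW

  BeamSpotPOD* bsDevice = nullptr;
  check(cudaMalloc(&bsDevice, sizeof(BeamSpotPOD)));

  // The pattern from the hunk: explicit async H2D copy on the given stream.
  check(cudaMemcpyAsync(bsDevice, bsHost, sizeof(BeamSpotPOD), cudaMemcpyHostToDevice, stream));
  check(cudaStreamSynchronize(stream));

  check(cudaFree(bsDevice));
  check(cudaFreeHost(bsHost));
  check(cudaStreamDestroy(stream));
  return 0;
}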