diff --git a/CUDADataFormats/BeamSpot/BuildFile.xml b/CUDADataFormats/BeamSpot/BuildFile.xml
new file mode 100644
index 0000000000000..b4a05240b567d
--- /dev/null
+++ b/CUDADataFormats/BeamSpot/BuildFile.xml
@@ -0,0 +1,8 @@
+<use name="FWCore/ServiceRegistry"/>
+<use name="HeterogeneousCore/CUDAServices"/>
+<use name="cuda-api-wrappers"/>
+<use name="rootcore"/>
+
+<export>
+    <lib name="1"/>
+</export>
diff --git a/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
new file mode 100644
index 0000000000000..70c3f1b3f9d11
--- /dev/null
+++ b/CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h
@@ -0,0 +1,32 @@
+#ifndef CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h
+#define CUDADataFormats_BeamSpot_interface_BeamSpotCUDA_h
+
+#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
+
+#include <cuda/api_wrappers.h>
+
+class BeamSpotCUDA {
+public:
+  // alignas(128) doesn't really make sense as there is only one
+  // beamspot per event? 
+  struct Data {
+    float x,y,z;   // position
+    // TODO: add covariance matrix
+
+    float sigmaZ;
+    float beamWidthX, beamWidthY;
+    float dxdz, dydz;
+    float emittanceX, emittanceY;
+    float betaStar;
+  };
+
+  BeamSpotCUDA() = default;
+  BeamSpotCUDA(Data const* data_h, cuda::stream_t<>& stream);
+
+  Data const* data() const { return data_d_.get(); }
+
+private:
+  cudautils::device::unique_ptr<Data> data_d_;
+};
+
+#endif
diff --git a/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
new file mode 100644
index 0000000000000..2714df51d2456
--- /dev/null
+++ b/CUDADataFormats/BeamSpot/src/BeamSpotCUDA.cc
@@ -0,0 +1,11 @@
+#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
+
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+BeamSpotCUDA::BeamSpotCUDA(Data const* data_h, cuda::stream_t<>& stream) {
+  edm::Service<CUDAService> cs;
+
+  data_d_ = cs->make_device_unique<Data>(stream);
+  cuda::memory::async::copy(data_d_.get(), data_h, sizeof(Data), stream.id());
+}
diff --git a/CUDADataFormats/BeamSpot/src/classes.h b/CUDADataFormats/BeamSpot/src/classes.h
new file mode 100644
index 0000000000000..62f990c0ba3b3
--- /dev/null
+++ b/CUDADataFormats/BeamSpot/src/classes.h
@@ -0,0 +1,8 @@
+#ifndef CUDADataFormats_BeamSpot_classes_h
+#define CUDADataFormats_BeamSpot_classes_h
+
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
+#include "DataFormats/Common/interface/Wrapper.h"
+
+#endif
diff --git a/CUDADataFormats/BeamSpot/src/classes_def.xml b/CUDADataFormats/BeamSpot/src/classes_def.xml
new file mode 100644
index 0000000000000..29a0eafa04005
--- /dev/null
+++ b/CUDADataFormats/BeamSpot/src/classes_def.xml
@@ -0,0 +1,4 @@
+<lcgdict>
+  <class name="CUDAProduct<BeamSpotCUDA>" persistent="false"/>
+  <class name="edm::Wrapper<CUDAProduct<BeamSpotCUDA>>" persistent="false"/>
+</lcgdict>
diff --git a/CUDADataFormats/Common/interface/CUDAProduct.h b/CUDADataFormats/Common/interface/CUDAProduct.h
index 181024f068c7a..fb90496f33661 100644
--- a/CUDADataFormats/Common/interface/CUDAProduct.h
+++ b/CUDADataFormats/Common/interface/CUDAProduct.h
@@ -45,6 +45,12 @@ class CUDAProduct: public CUDAProductBase {
     data_(std::move(data))
   {}
 
+  template <typename... Args>
+  explicit CUDAProduct(int device, std::shared_ptr<cuda::stream_t<>> stream, std::shared_ptr<cuda::event_t> event, Args&&... args):
+    CUDAProductBase(device, std::move(stream), std::move(event)),
+    data_(std::forward<Args>(args)...)
+  {}
+
   T data_; //!
 };
 
diff --git a/Configuration/StandardSequences/python/Reconstruction_cff.py b/Configuration/StandardSequences/python/Reconstruction_cff.py
index 36ac26b2197c6..8bb2e5cb2afcb 100644
--- a/Configuration/StandardSequences/python/Reconstruction_cff.py
+++ b/Configuration/StandardSequences/python/Reconstruction_cff.py
@@ -198,9 +198,9 @@
 reconstruction_trackingOnly = cms.Sequence(localreco*globalreco_tracking)
 reconstruction_pixelTrackingOnly = cms.Sequence(
     pixeltrackerlocalreco*
-    offlineBeamSpot*
     siPixelClusterShapeCachePreSplitting*
-    recopixelvertexing
+    recopixelvertexing,
+    offlineBeamSpotTask
 )
 
 #need a fully expanded sequence copy
diff --git a/HeterogeneousCore/CUDAUtilities/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/BuildFile.xml
index 289f0208fd5e7..153430997064d 100644
--- a/HeterogeneousCore/CUDAUtilities/BuildFile.xml
+++ b/HeterogeneousCore/CUDAUtilities/BuildFile.xml
@@ -1,5 +1,6 @@
 <use name="cub"/>
 <use name="cuda"/>
+<use name="cuda-api-wrappers"/>
 
 <export>
     <lib name="1"/>
diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h
new file mode 100644
index 0000000000000..c9f9aff89d975
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h
@@ -0,0 +1,65 @@
+#ifndef HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h
+#define HeterogeneousCore_CUDAUtilities_interface_host_noncached_unique_ptr_h
+
+#include <memory>
+
+#include <cuda/api_wrappers.h>
+#include <cuda_runtime.h>
+
+namespace cudautils {
+  namespace host {
+    namespace noncached {
+      namespace impl {
+        // Additional layer of types to distinguish from host::unique_ptr
+        class HostDeleter {
+        public:
+          void operator()(void *ptr) {
+            cuda::throw_if_error(cudaFreeHost(ptr));
+          }
+        };
+      }
+
+      template <typename T>
+      using unique_ptr = std::unique_ptr<T, impl::HostDeleter>;
+
+      namespace impl {
+        template <typename T>
+        struct make_host_unique_selector { using non_array = cudautils::host::noncached::unique_ptr<T>; };
+        template <typename T>
+        struct make_host_unique_selector<T[]> { using unbounded_array = cudautils::host::noncached::unique_ptr<T[]>; };
+        template <typename T, size_t N>
+        struct make_host_unique_selector<T[N]> { struct bounded_array {}; };
+      }
+    }
+  }
+
+  /**
+   * The difference wrt. CUDAService::make_host_unique is that these
+   * do not cache, so they should not be called per-event.
+   */
+  template <typename T>
+  typename host::noncached::impl::make_host_unique_selector<T>::non_array
+  make_host_noncached_unique(unsigned int flags = cudaHostAllocDefault) {
+    static_assert(std::is_trivially_constructible<T>::value, "Allocating with non-trivial constructor on the pinned host memory is not supported");
+    void *mem;
+    cuda::throw_if_error(cudaHostAlloc(&mem, sizeof(T), flags));
+    return typename cudautils::host::noncached::impl::make_host_unique_selector<T>::non_array(reinterpret_cast<T *>(mem));
+  }
+
+  template <typename T>
+  typename host::noncached::impl::make_host_unique_selector<T>::unbounded_array
+  make_host_noncached_unique(size_t n, unsigned int flags = cudaHostAllocDefault) {
+    using element_type = typename std::remove_extent<T>::type;
+    static_assert(std::is_trivially_constructible<element_type>::value, "Allocating with non-trivial constructor on the pinned host memory is not supported");
+    void *mem;
+    cuda::throw_if_error(cudaHostAlloc(&mem, n*sizeof(element_type), flags));
+    return typename cudautils::host::noncached::impl::make_host_unique_selector<T>::unbounded_array(reinterpret_cast<element_type *>(mem));
+  }
+
+  template <typename T, typename ...Args>
+  typename cudautils::host::noncached::impl::make_host_unique_selector<T>::bounded_array
+  make_host_noncached_unique(Args&&...) = delete;
+}
+
+#endif
+
diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
index 0f48f95a8e4ad..0d06484b72a5e 100644
--- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
+++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml
@@ -68,7 +68,7 @@
   <flags CUDA_FLAGS="-g -DGPU_DEBUG"/>
 </bin>
 
-<bin file="testCatch2Main.cpp,copyAsync_t.cpp,memsetAsync_t.cpp" name="cudaMemUtils_t">
+<bin file="testCatch2Main.cpp,copyAsync_t.cpp,memsetAsync_t.cpp,host_noncached_unique_ptr_t.cpp" name="cudaMemUtils_t">
   <use name="HeterogeneousCore/CUDAServices"/>
   <use name="catch2"/>
 </bin>
diff --git a/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp b/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp
new file mode 100644
index 0000000000000..ae9e3c9a3849d
--- /dev/null
+++ b/HeterogeneousCore/CUDAUtilities/test/host_noncached_unique_ptr_t.cpp
@@ -0,0 +1,22 @@
+#include "catch.hpp"
+
+#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h"
+
+TEST_CASE("host_noncached_unique_ptr", "[cudaMemTools]") {
+  exitSansCUDADevices();
+
+  SECTION("Single element") {
+    auto ptr1 = cudautils::make_host_noncached_unique<int>();
+    REQUIRE(ptr1 != nullptr);
+    auto ptr2 = cudautils::make_host_noncached_unique<int>(cudaHostAllocPortable | cudaHostAllocWriteCombined);
+    REQUIRE(ptr2 != nullptr);
+  }
+
+  SECTION("Multiple elements") {
+    auto ptr1 = cudautils::make_host_noncached_unique<int[]>(10);
+    REQUIRE(ptr1 != nullptr);
+    auto ptr2 = cudautils::make_host_noncached_unique<int[]>(10, cudaHostAllocPortable | cudaHostAllocWriteCombined);
+    REQUIRE(ptr2 != nullptr);
+  }
+}
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
index b23faad9e78d3..f2dacd5fbc415 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc
@@ -24,7 +24,9 @@
 #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
 #include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
 #include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
 #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
 #include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h"
 #include "RecoTracker/Record/interface/CkfComponentsRecord.h"
 
@@ -62,6 +64,7 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer<edm::ExternalWork>
   std::unique_ptr<PixelUnpackingRegions> regions_;
 
   pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_;
+  std::unique_ptr<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender> wordFedAppender_;
   PixelDataFormatter::Errors errors_;
 
   const bool includeErrors_;
@@ -88,6 +91,11 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi
   }
 
   if(usePilotBlade_) edm::LogInfo("SiPixelRawToCluster")  << " Use pilot blade data (FED 40)";
+
+  edm::Service<CUDAService> cs;
+  if(cs->enabled()) {
+    wordFedAppender_ = std::make_unique<pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender>();
+  }
 }
 
 void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
@@ -161,7 +169,6 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event
 
   // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData()
   ErrorChecker errorcheck;
-  auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream());
   for(int fedId: fedIds_) {
     if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data
     if (regions_ && !regions_->mayUnpackFED(fedId)) continue;
@@ -209,13 +216,13 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event
     const cms_uint32_t * ew = (const cms_uint32_t *)(trailer);
 
     assert(0 == (ew-bw)%2);
-    wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
+    wordFedAppender_->initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw));
     wordCounterGPU+=(ew-bw);
 
   } // end of for loop
 
   gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains,
-                             wordFedAppender,
+                             *wordFedAppender_,
                              std::move(errors_),
                              wordCounterGPU, fedCounter, 
                              useQuality_, includeErrors_,
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
index 3d4e377eb8221..8fdb2ed8c90d5 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu
@@ -45,10 +45,9 @@ namespace pixelgpudetails {
   // number of words for all the FEDs
   constexpr uint32_t MAX_FED_WORDS   = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD;
 
-  SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) {
-    edm::Service<CUDAService> cs;
-    word_ = cs->make_host_unique<unsigned int[]>(MAX_FED_WORDS, cudaStream);
-    fedId_ = cs->make_host_unique<unsigned char[]>(MAX_FED_WORDS, cudaStream);
+  SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender() {
+    word_ = cudautils::make_host_noncached_unique<unsigned int[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
+    fedId_ = cudautils::make_host_noncached_unique<unsigned char[]>(MAX_FED_WORDS, cudaHostAllocWriteCombined);
   }
 
   void SiPixelRawToClusterGPUKernel::WordFedAppender::initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length) {
diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
index a0f89dc241c64..0d2b6a8c7fc65 100644
--- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
+++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h
@@ -11,6 +11,7 @@
 #include "FWCore/Utilities/interface/typedefs.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
 #include "DataFormats/SiPixelDigi/interface/PixelErrors.h"
 
 struct SiPixelFedCablingMapGPU;
@@ -159,7 +160,7 @@ namespace pixelgpudetails {
   public:
     class WordFedAppender {
     public:
-      WordFedAppender(cuda::stream_t<>& cudaStream);
+      WordFedAppender();
       ~WordFedAppender() = default;
 
       void initializeWordFed(int fedId, unsigned int wordCounterGPU, const cms_uint32_t *src, unsigned int length);
@@ -168,8 +169,8 @@ namespace pixelgpudetails {
       const unsigned char *fedId() const { return fedId_.get(); }
 
     private:
-      cudautils::host::unique_ptr<unsigned int[]> word_;
-      cudautils::host::unique_ptr<unsigned char[]> fedId_;
+      cudautils::host::noncached::unique_ptr<unsigned int[]> word_;
+      cudautils::host::noncached::unique_ptr<unsigned char[]> fedId_;
     };
 
     SiPixelRawToClusterGPUKernel() = default;
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
index a8af0c8a7c4f9..27ee3af86e102 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/BuildFile.xml
@@ -1,3 +1,4 @@
+<use name="CUDADataFormats/BeamSpot"/>
 <use name="DataFormats/TrackerCommon"/>
 <use name="HeterogeneousCore/CUDACore"/>
 <use name="HeterogeneousCore/Producer"/>
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
index 80be13dedd26b..6ac70fce95a88 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu
@@ -36,7 +36,6 @@ namespace pixelgpudetails {
 
     constexpr auto MAX_HITS = siPixelRecHitsHeterogeneousProduct::maxHits();
 
-    cudaCheck(cudaMalloc((void **) & gpu_.bs_d, 3 * sizeof(float)));
     cudaCheck(cudaMalloc((void **) & gpu_.hitsLayerStart_d, 11 * sizeof(uint32_t)));
 
     // Coalesce all 32bit and 16bit arrays to two big blobs
@@ -111,7 +110,6 @@ namespace pixelgpudetails {
 #endif
   }
   PixelRecHitGPUKernel::~PixelRecHitGPUKernel() {
-    cudaCheck(cudaFree(gpu_.bs_d));
     cudaCheck(cudaFree(gpu_.hitsLayerStart_d));
     cudaCheck(cudaFree(gpu_.owner_32bit_));
     cudaCheck(cudaFree(gpu_.owner_16bit_));
@@ -131,11 +129,10 @@ namespace pixelgpudetails {
 
   void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d,
                                            SiPixelClustersCUDA const& clusters_d,
-                                           float const * bs,
+                                           BeamSpotCUDA const& bs_d,
                                            pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                                            bool transferToCPU,
                                            cuda::stream_t<>& stream) {
-    cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id()));
     gpu_.hitsModuleStart_d = clusters_d.clusModuleStart();
     gpu_.cpeParams = cpeParams; // copy it for use in clients
     cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id()));
@@ -148,7 +145,7 @@ namespace pixelgpudetails {
 #endif
     gpuPixelRecHits::getHits<<<blocks, threadsPerBlock, 0, stream.id()>>>(
       cpeParams,
-      gpu_.bs_d,
+      bs_d.data(),
       digis_d.moduleInd(),
       digis_d.xx(), digis_d.yy(), digis_d.adc(),
       clusters_d.moduleStart(),
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
index 49164d24ab335..8e5599c239789 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h
@@ -1,6 +1,7 @@
 #ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h
 #define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h
 
+#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
 #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
 #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h"
@@ -34,7 +35,7 @@ namespace pixelgpudetails {
 
     void makeHitsAsync(SiPixelDigisCUDA const& digis_d,
                        SiPixelClustersCUDA const& clusters_d,
-                       float const * bs,
+                       BeamSpotCUDA const& bs_d,
                        pixelCPEforGPU::ParamsOnGPU const * cpeParams,
                        bool transferToCPU,
                        cuda::stream_t<>& stream);
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
index e26ec84ddf5b5..be4ca76f23ad5 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc
@@ -1,9 +1,9 @@
 #include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
 #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h"
 #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h"
 #include "DataFormats/Common/interface/DetSetVectorNew.h"
 #include "DataFormats/Common/interface/Handle.h"
-#include "DataFormats/BeamSpot/interface/BeamSpot.h"
 #include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h"
 #include "DataFormats/TrackerRecHit2D/interface/SiPixelRecHitCollection.h"
 #include "FWCore/Framework/interface/ESHandle.h"
@@ -64,8 +64,8 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer<heterogeneous::
   void run(const edm::Handle<SiPixelClusterCollectionNew>& inputhandle, SiPixelRecHitCollectionNew &output, const pixelgpudetails::HitsOnCPU& hoc) const;
 
 
-  edm::EDGetTokenT<reco::BeamSpot> 	 tBeamSpot;
   // The mess with inputs will be cleaned up when migrating to the new framework
+  edm::EDGetTokenT<CUDAProduct<BeamSpotCUDA>> tBeamSpot;
   edm::EDGetTokenT<CUDAProduct<SiPixelClustersCUDA>> token_;
   edm::EDGetTokenT<CUDAProduct<SiPixelDigisCUDA>> tokenDigi_;
   edm::EDGetTokenT<SiPixelClusterCollectionNew> clusterToken_;
@@ -82,7 +82,7 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer<heterogeneous::
 
 SiPixelRecHitHeterogeneous::SiPixelRecHitHeterogeneous(const edm::ParameterSet& iConfig):
   HeterogeneousEDProducer(iConfig),
-  tBeamSpot(consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
+  tBeamSpot(consumes<CUDAProduct<BeamSpotCUDA>>(iConfig.getParameter<edm::InputTag>("beamSpot"))),
   token_(consumes<CUDAProduct<SiPixelClustersCUDA>>(iConfig.getParameter<edm::InputTag>("heterogeneousSrc"))),
   tokenDigi_(consumes<CUDAProduct<SiPixelDigisCUDA>>(iConfig.getParameter<edm::InputTag>("heterogeneousSrc"))),
   cpeName_(iConfig.getParameter<std::string>("CPE"))
@@ -100,7 +100,7 @@ SiPixelRecHitHeterogeneous::SiPixelRecHitHeterogeneous(const edm::ParameterSet&
 void SiPixelRecHitHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
   edm::ParameterSetDescription desc;
 
-  desc.add<edm::InputTag>("beamSpot", edm::InputTag("offlineBeamSpot"));
+  desc.add<edm::InputTag>("beamSpot", edm::InputTag("offlineBeamSpotCUDA"));
   desc.add<edm::InputTag>("heterogeneousSrc", edm::InputTag("siPixelClustersCUDAPreSplitting"));
   desc.add<edm::InputTag>("src", edm::InputTag("siPixelClustersPreSplitting"));
   desc.add<std::string>("CPE", "PixelCPEFast");
@@ -183,23 +183,22 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i
   iEvent.getByToken(tokenDigi_, hdigis);
   auto const& digis = ctx.get(*hdigis);
 
+  edm::Handle<CUDAProduct<BeamSpotCUDA>> hbs;
+  iEvent.getByToken(tBeamSpot, hbs);
+  auto const& bs = ctx.get(*hbs);
+
   // We're processing in a stream given by base class, so need to
   // synchronize explicitly (implementation is from
   // CUDAScopedContext). In practice these should not be needed
   // (because of synchronizations upstream), but let's play generic.
-  if(not hclusters->isAvailable() && hclusters->event()->has_occurred()) {
+  if(not hclusters->isAvailable() and not hclusters->event()->has_occurred()) {
     cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event()->id(), 0));
   }
-  if(not hdigis->isAvailable() && hdigis->event()->has_occurred()) {
+  if(not hdigis->isAvailable() and not hdigis->event()->has_occurred()) {
     cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event()->id(), 0));
   }
-
-  edm::Handle<reco::BeamSpot> bsHandle;
-  iEvent.getByToken( tBeamSpot, bsHandle);
-  float bs[3] = {0.f};
-  if(bsHandle.isValid()) {
-    const auto  & bsh = *bsHandle;
-    bs[0]=bsh.x0(); bs[1]=bsh.y0(); bs[2]=bsh.z0();
+  if(not hbs->isAvailable() and not hbs->event()->has_occurred()) {
+    cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hbs->event()->id(), 0));
   }
 
   gpuAlgo_->makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(cudaStream), enableTransfer_, cudaStream);
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
index 6864a046bf1dc..cbd354e71143e 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/gpuPixelRecHits.h
@@ -5,6 +5,7 @@
 #include <cstdio>
 #include <limits>
 
+#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
 #include "DataFormats/Math/interface/approx_atan2.h"
 #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h"
 #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h"
@@ -15,7 +16,7 @@ namespace gpuPixelRecHits {
 
 
   __global__ void getHits(pixelCPEforGPU::ParamsOnGPU const * __restrict__  cpeParams,
-                          float const * __restrict__  bs,
+                          BeamSpotCUDA::Data const * __restrict__  bs,
                           uint16_t const * __restrict__  id,
 			  uint16_t const * __restrict__  x,
 			  uint16_t const * __restrict__  y,
@@ -143,9 +144,9 @@ namespace gpuPixelRecHits {
     // to global and compute phi... 
     cpeParams->detParams(me).frame.toGlobal(xl[h],yl[h], xg[h],yg[h],zg[h]);
     // here correct for the beamspot...
-    xg[h]-=bs[0];
-    yg[h]-=bs[1];
-    zg[h]-=bs[2];
+    xg[h]-=bs->x;
+    yg[h]-=bs->y;
+    zg[h]-=bs->z;
 
     rg[h] = std::sqrt(xg[h]*xg[h]+yg[h]*yg[h]);
     iph[h] = unsafe_atan2s<7>(yg[h],xg[h]);
diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
index ea6eaf8458dde..5bed073c01c3a 100644
--- a/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
+++ b/RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h
@@ -22,7 +22,6 @@ namespace siPixelRecHitsHeterogeneousProduct {
 
   struct HitsOnGPU{
      pixelCPEforGPU::ParamsOnGPU const * cpeParams = nullptr;    // forwarded from setup, NOT owned
-     float * bs_d;
      const uint32_t * hitsModuleStart_d; // forwarded from clusters
      uint32_t * hitsLayerStart_d;
      int32_t  * charge_d;
diff --git a/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc
new file mode 100644
index 0000000000000..9b15f08b8dfc2
--- /dev/null
+++ b/RecoVertex/BeamSpotProducer/plugins/BeamSpotToCUDA.cc
@@ -0,0 +1,88 @@
+#include "CUDADataFormats/Common/interface/CUDAProduct.h"
+#include "CUDADataFormats/BeamSpot/interface/BeamSpotCUDA.h"
+#include "DataFormats/BeamSpot/interface/BeamSpot.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+#include "HeterogeneousCore/CUDAUtilities/interface/host_noncached_unique_ptr.h"
+
+#include <cuda_runtime.h>
+
+namespace {
+  class BSHost {
+  public:
+    BSHost():
+      bs{cudautils::make_host_noncached_unique<BeamSpotCUDA::Data>(cudaHostAllocWriteCombined)}
+    {}
+    BeamSpotCUDA::Data *get() { return bs.get(); }
+
+  private:
+    cudautils::host::noncached::unique_ptr<BeamSpotCUDA::Data> bs;
+  };
+}
+
+class BeamSpotToCUDA: public edm::global::EDProducer<edm::StreamCache<BSHost> > {
+public:
+  explicit BeamSpotToCUDA(const edm::ParameterSet& iConfig);
+  ~BeamSpotToCUDA() override = default;
+
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+
+  std::unique_ptr<BSHost> beginStream(edm::StreamID) const {
+    edm::Service<CUDAService> cs;
+    if(cs->enabled()) {
+      return std::make_unique<BSHost>();
+    }
+    else {
+      return nullptr;
+    }
+  }
+  void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+private:
+  edm::EDGetTokenT<reco::BeamSpot> bsGetToken_;
+  edm::EDPutTokenT<CUDAProduct<BeamSpotCUDA>> bsPutToken_;
+};
+
+BeamSpotToCUDA::BeamSpotToCUDA(const edm::ParameterSet& iConfig):
+  bsGetToken_{consumes<reco::BeamSpot>(iConfig.getParameter<edm::InputTag>("src"))},
+  bsPutToken_{produces<CUDAProduct<BeamSpotCUDA>>()}
+{}
+
+void BeamSpotToCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;
+  desc.add<edm::InputTag>("src", edm::InputTag("offlineBeamSpot"));
+  descriptions.addWithDefaultLabel(desc);
+}
+
+void BeamSpotToCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+  CUDAScopedContext ctx{streamID};
+
+  const reco::BeamSpot& bs = iEvent.get(bsGetToken_);
+
+  BeamSpotCUDA::Data *bsHost = streamCache(streamID)->get();
+
+  bsHost->x = bs.x0();
+  bsHost->y = bs.y0();
+  bsHost->z = bs.z0();
+
+  bsHost->sigmaZ = bs.sigmaZ();
+  bsHost->beamWidthX = bs.BeamWidthX();
+  bsHost->beamWidthY = bs.BeamWidthY();
+  bsHost->dxdz = bs.dxdz();
+  bsHost->dydz = bs.dydz();
+  bsHost->emittanceX = bs.emittanceX();
+  bsHost->emittanceY = bs.emittanceY();
+  bsHost->betaStar = bs.betaStar();
+
+  ctx.emplace(iEvent, bsPutToken_, bsHost, ctx.stream());
+}
+
+DEFINE_FWK_MODULE(BeamSpotToCUDA);
+
diff --git a/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml b/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml
index bb4502c33b332..e3ce85df9a81a 100644
--- a/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml
+++ b/RecoVertex/BeamSpotProducer/plugins/BuildFile.xml
@@ -14,6 +14,14 @@
 <library   file="BeamSpotProducer.cc" name="BeamSpotProducer">
   <flags   EDM_PLUGIN="1"/>
 </library>
+<library   file="BeamSpotToCUDA.cc" name="BeamSpotToCUDA">
+  <use name="CUDADataFormats/BeamSpot"/>
+  <use name="HeterogeneousCore/CUDACore"/>
+  <use name="HeterogeneousCore/CUDAServices"/>
+  <use name="cuda"/>
+  <use name="cuda-api-wrappers"/>
+  <flags   EDM_PLUGIN="1"/>
+</library>
 <library   file="BeamSpotOnlineProducer.cc" name="BeamSpotOnlineProducer">
   <flags   EDM_PLUGIN="1"/>
   <use name="DataFormats/L1GlobalTrigger"/>
diff --git a/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py b/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py
index deb62255199e5..9654f9ab410b8 100644
--- a/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py
+++ b/RecoVertex/BeamSpotProducer/python/BeamSpot_cff.py
@@ -1,4 +1,11 @@
 import FWCore.ParameterSet.Config as cms
 
 from RecoVertex.BeamSpotProducer.BeamSpot_cfi import *
+from RecoVertex.BeamSpotProducer.beamSpotToCUDA_cfi import beamSpotToCUDA as _beamSpotToCUDA
 
+offlineBeamSpotCUDA = _beamSpotToCUDA.clone()
+
+offlineBeamSpotTask = cms.Task(
+    offlineBeamSpot,
+    offlineBeamSpotCUDA
+)