From 276808c754cc88ed22fc3bec02b5e20df285ac09 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 17 Jul 2018 10:20:54 +0200 Subject: [PATCH 01/49] Next prototype of the framework integration --- HeterogeneousCore/CUDACore/BuildFile.xml | 2 + HeterogeneousCore/CUDACore/README.md | 283 ++++++++++++++++++ HeterogeneousCore/CUDACore/interface/CUDA.h | 77 +++++ .../CUDACore/interface/CUDAScopedContext.h | 95 ++++++ .../CUDACore/interface/CUDAStreamEDProducer.h | 46 +++ .../CUDACore/interface/CUDAToken.h | 43 +++ .../CUDACore/plugins/BuildFile.xml | 17 ++ .../CUDACore/plugins/CUDADeviceChooser.cc | 89 ++++++ .../CUDACore/plugins/CUDADeviceFilter.cc | 40 +++ .../CUDACore/src/CUDAScopedContext.cc | 21 ++ HeterogeneousCore/CUDACore/src/CUDAToken.cc | 23 ++ HeterogeneousCore/CUDACore/src/classes.h | 12 + .../CUDACore/src/classes_def.xml | 7 + HeterogeneousCore/CUDACore/test/BuildFile.xml | 16 + HeterogeneousCore/CUDACore/test/TestCUDA.h | 23 ++ .../CUDACore/test/TestCUDAProducerCPU.cc | 67 +++++ .../CUDACore/test/TestCUDAProducerFallback.cc | 56 ++++ .../CUDACore/test/TestCUDAProducerGPU.cc | 61 ++++ .../CUDACore/test/TestCUDAProducerGPUEW.cc | 83 +++++ .../CUDACore/test/TestCUDAProducerGPUFirst.cc | 63 ++++ .../test/TestCUDAProducerGPUKernel.cu | 112 +++++++ .../CUDACore/test/TestCUDAProducerGPUKernel.h | 43 +++ .../CUDACore/test/TestCUDAProducerGPUtoCPU.cc | 78 +++++ .../CUDACore/test/testCUDA_cfg.py | 121 ++++++++ HeterogeneousCore/CUDACore/test/test_CUDA.cc | 59 ++++ .../CUDACore/test/test_CUDADeviceChooser.cc | 96 ++++++ .../CUDACore/test/test_CUDAScopedContext.cc | 103 +++++++ .../test/test_CUDAScopedContextKernels.cu | 24 ++ .../test/test_CUDAScopedContextKernels.h | 9 + .../test/test_TestCUDAProducerGPUFirst.cc | 103 +++++++ HeterogeneousCore/CUDACore/test/test_main.cc | 2 + 31 files changed, 1874 insertions(+) create mode 100644 HeterogeneousCore/CUDACore/README.md create mode 100644 HeterogeneousCore/CUDACore/interface/CUDA.h create mode 100644 
HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h create mode 100644 HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h create mode 100644 HeterogeneousCore/CUDACore/interface/CUDAToken.h create mode 100644 HeterogeneousCore/CUDACore/plugins/BuildFile.xml create mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc create mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc create mode 100644 HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc create mode 100644 HeterogeneousCore/CUDACore/src/CUDAToken.cc create mode 100644 HeterogeneousCore/CUDACore/src/classes.h create mode 100644 HeterogeneousCore/CUDACore/src/classes_def.xml create mode 100644 HeterogeneousCore/CUDACore/test/BuildFile.xml create mode 100644 HeterogeneousCore/CUDACore/test/TestCUDA.h create mode 100644 HeterogeneousCore/CUDACore/test/TestCUDAProducerCPU.cc create mode 100644 HeterogeneousCore/CUDACore/test/TestCUDAProducerFallback.cc create mode 100644 HeterogeneousCore/CUDACore/test/TestCUDAProducerGPU.cc create mode 100644 HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUEW.cc create mode 100644 HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUFirst.cc create mode 100644 HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.cu create mode 100644 HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.h create mode 100644 HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUtoCPU.cc create mode 100644 HeterogeneousCore/CUDACore/test/testCUDA_cfg.py create mode 100644 HeterogeneousCore/CUDACore/test/test_CUDA.cc create mode 100644 HeterogeneousCore/CUDACore/test/test_CUDADeviceChooser.cc create mode 100644 HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc create mode 100644 HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu create mode 100644 HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h create mode 100644 HeterogeneousCore/CUDACore/test/test_TestCUDAProducerGPUFirst.cc create mode 100644 
HeterogeneousCore/CUDACore/test/test_main.cc diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml index ba6b35c6d0ce7..1ebe999f9746c 100644 --- a/HeterogeneousCore/CUDACore/BuildFile.xml +++ b/HeterogeneousCore/CUDACore/BuildFile.xml @@ -1,3 +1,4 @@ + @@ -5,6 +6,7 @@ + diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md new file mode 100644 index 0000000000000..05e524a08a87f --- /dev/null +++ b/HeterogeneousCore/CUDACore/README.md @@ -0,0 +1,283 @@ +# Next iteration of the prototype for CMSSW interface to heterogeneous algorithms + +## Introduction + +The current prototype with `HeterogeneousEDProducer` and +`HeterogeneousProduct` is documented [here](../Producer/README.md). +The main differences wrt. that are +* Split device-specific code to different EDProducers +* Plug components together in the configuration + +This page documents the CUDA integration, and discusses briefly on how +to extend to other devices. It will be extended if/when it gets +deployed and `HeterogeneousEDProducer` retired. + +## Sub-packages +* [`CUDACore`](#cuda-integration) CUDA-specific core components +* [`CUDAServices`](../CUDAServices) Various edm::Services related to CUDA +* [`CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code + +# CUDA integration + +## Choosing device + +The device choosing logic is split to an EDProducer, an EDFilter, and +use of Paths in the configuration. + +First, a `CUDADeviceChooser` EDProducer is run. It has the logic to +device whether the following chain of EDModules should run on a CUDA +device or not, and if yes, on which CUDA device. If it decides "yes", +it produces a `CUDAToken`, which contains the device id and a CUDA +stream. If it decides "no", it does not produce anything. + +Next step is a `CUDADeviceFilter` EDFilter. It checks whether the +`CUDADeviceChooser` produced a product or not. 
If "yes", it returns +`true`, and if "no", it returns `false`. + +Finally, the pieces need to be put together in the configuration. The +`CUDADeviceChooser` can be "anywhere", but the `CUDADeviceFilter` +should be the first module on a `cms.Path`, followed by the CUDA +EDProducers (in the future it may become sufficient to have only the +first EDProducer of a chain in the `Path`). +```python +process.fooCUDADevice = cms.EDProducer("CUDADeviceChooser") +process.fooCUDADeviceFilter = cms.EDFilter("CUDADeviceFilter", + src = cms.InputTag("fooCUDADevice") +) +process.fooCUDA = cms.EDProducer("FooProducerCUDA") +process.fooPathCUDA = cms.Path( + process.fooCUDADeviceFilter + process.fooCUDA +) +process.fooTask = cms.Task( + process.fooDevice +) +``` + +## Data model + +The GPU data can be a single pointer to device data, or a class/struct +containing such pointers (among other stuff). When putting the data to +event, the data is wrapped to `CUDA` template, which holds +* the GPU data + * must be movable, but no other restrictions (except need to be able to generate ROOT dictionaries from it) +* the current device where the data was produced, and the CUDA stream the data was produced with +* [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams) + +Note that the `CUDA` wrapper can be constructed only with +`CUDAScopedContext::wrap()`, and the data `T` can be obtained from it +only with `CUDAScopedContext::get()`, as described further below. + +## CUDA EDProducer + +### Class declaration + +For time being (may disappear in the future) a CUDA producer should +inherit from `CUDAStreamEDProducer<...>`. The template parameters are +the usual +[stream producer extensions](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#Template_Arguments). +Note that contrary to `HeterogeneousEDProducer`, the `ExternalWork` +extension is **not** implied. 
+ +```cpp +#include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" +class FooProducerCUDA: public CUDAStreamEDProducer<> { + ... +``` + +### Memory allocation + +The only effect of the `CUDAStreamEDProducer` base class is that +`beginStream(edm::StreamID)` is replaced with +`beginStreamCUDA(edm::StreamID)`. This is done in order to set the +current CUDA device before the user code starts. **If the algorithm +has to allocate memory buffers for the duration of the whole job, the +recommended place is here.** Note that a CUDA stream is not passed to +the user code. If a CUDA stream is really needed, the developer should +create+synchronize it by him/herself. (although if this appears to be +common practice, we should try to provide the situation somehow) + +### Setting the current device + +A CUDA producer should read either `CUDAToken` (from +`CUDADeviceChooser`) or one or more `CUDA` products. Then, in the +`acquire()`/`produce()`, it should construct `CUDAScopedContext` from +one of them +```cpp +// From CUDAToken +edm::Handle htoken; +iEvent.getByToken(srcToken_, htoken); +auto ctx = CUDAScopedContext(*htoken); + +/// From CUDA +edm::Handle > handle; +iEvent.getByToken(srctoken_, handle); +auto ctx = CUDAScopedContext(*handle); +``` + +`CUDAScopedContext` works in the RAII way and does the following +* Sets the current device (for the scope) from `CUDAToken`/`CUDA` +* Gives access to the CUDA stream the algorithm should use to queue asynchronous work +* Calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary +* [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams) +* Needed to get/put `CUDA` from/to the event + +In case of multiple input products, from possibly different CUDA +streams and/or CUDA devices, this approach gives the developer full +control in which of them the kernels of the algorithm should be run. 
+ +### Getting input + +The real product (`T`) can be obtained from `CUDA` only with the +help of `CUDAScopedContext`. + +```cpp +edm::Handle > hclus; +iEvent.getByToken(srctoken_, hclus); +GPUClusters const& clus = ctx.get(*hclus); +``` + +This step is needed to +* check that the data are on the same CUDA device + * if not, throw an exception (with unified memory could prefetch instead) +* if the CUDA streams are different, synchronize between them + +### Calling the CUDA kernels + +There is nothing special, except the CUDA stream can be obtained from +the `CUDAScopedContext` + +```cpp +gpuAlgo.makeClustersAsync(..., ctx.stream()); +``` + +### Putting output + +The GPU data needs to be wrapped to `CUDA` template with `CUDAScopedContest.wrap()` + +```cpp +GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream()); +std::unique_ptr > ret = ctx.wrap(clusters); +iEvent.put(std::move(ret)); + +// or with one line +iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream()))); +``` + +This step is needed to +* store the current device and CUDA stream into `CUDA` +* record the CUDA event needed for CUDA stream synchronization + +### `ExternalWork` extension + +Everything above works both with and without `ExternalWork`. + +Without `ExternalWork` the `EDProducer`s act similar to TBB +flowgraph's "streaming node". I.e. they just queue more asynchronous +work in their `produce()`. + +The `ExternalWork` is needed when one would otherwise call +`cudeStreamSynchronize()`, e.g. transferring something to CPU needed +for downstream DQM, or to queue more asynchronous work. With +`ExternalWork` an `acquire()` method needs to be implemented that gets +an `edm::WaitingTaskWithArenaHolder` parameter. 
The +`WaitingTaskWithArenaHolder` should then be passed to the constructor +of `CUDAScopedContext` along + +```cpp +void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + edm::Handle > handle; + iEvent.getByToken(token_, handle); + auto ctx = CUDAScopedContext(*handle, std::move(waitingTaskHolder)); // can also copy instead of move if waitingTaskHolder is needed for something else as well + ... +``` + +When constructed this way, `CUDAScopedContext` registers a callback +function to the CUDA stream in its destructor to call +`waitingTaskHolder.doneWaiting()`. + +A GPU->GPU producer needs a `CUDAScopedContext` also in its +`produce()`. Currently the best way is to read the input again in +`produce()` and construct the `CUDAScopedContext` from there. This +point will be improved. + +### Transferring GPU data to CPU + +The GPU->CPU data transfer needs synchronization to ensure the CPU +memory to have all data before putting that to the event. This means +the `ExternalWork` needs to be used along +* In `acquire()` + * (allocate CPU memory buffers) + * Queue all GPU->CPU transfers asynchronously +* In `produce()` + * If needed, read additional CPU products (e.g. from `edm::Ref`s) + * Reformat data back to legacy data formats + * Note: `CUDAScopedContext` is **not** needed in in `produce()` + +### Synchronizing between CUDA streams + +In case the producer needs input data that were produced in two (or +more) CUDA streams, these streams have to be synchronized (since CMSSW +framework no longer guarantees the synchronization as was the case +with `HeterogeneousEDProducer`). Here this synchronization is achieved +with CUDA events. + +Each `CUDA` constains also a CUDA event object. The call to +`CUDAScopedContext::wrap()` will *record* the event in the CUDA stream. +This means that when all work queued to the CUDA stream up to that +point has been finished, the CUDA event becomes *occurred*. 
Then, in +`CUDAScopedContext::get()`, if the `CUDA` to get from has a +different CUDA stream than the `CUDAScopedContext`, +`cudaStreamWaitEvent(stream, event)` is called. This means that all +subsequent work queued to the CUDA stream will wait for the CUDA event +to become occurred. Therefore this subsequent work can assume that the +to-be-getted CUDA product exists. + +## Configuration + +```python +process.fooCPU = cms.EDProducer("FooProducer") # legacy CPU + +process.fooCUDADevice = cms.EDProducer("CUDADeviceChooser") +process.fooCUDADeviceFilter = cms.EDFilter("CUDADeviceFilter", + src = cms.InputTag("fooCUDADevice") +) +process.fooCUDA = cms.EDProducer("FooProducerCUDA") +process.fooFromCUDA = cms.EDProducer("FooProducerCUDAtoCPU", src="fooCUDA") +process.foo = cms.EDProducer("FooProducerFallback", + src = cms.VInputTag("fooFromCUDA", "fooCPU") +) +process.fooPathCUDA = cms.Path( + process.fooCUDADeviceFilter + process.fooCUDA +) +process.fooPathCPU = cms.Path( + ~process.fooCUDADeviceFilter + process.fooCPU +) +process.fooTask = cms.Task( + process.fooDevice, + process.fooFromCUDA, + process.foo +) +... +``` +For a more complete example, see [here](test/testCUDA_cfg.py). + +# Extension to other devices + +The C++ side extends in a straightforward way. One has to add classes +similar to `CUDAToken`, `CUDA`, and `CUDAScopedContext`. Of course, +much depends on the exact details. The python configuration side +extends as well, one "just" has to add more modules there. Also the +device choosing logic is also extendable +```python +process.fooCUDADevice = ... +process.fooFPGADevice = ... +process.fooPathCUDA = cms.Path( + process.fooCUDADeviceFilter + ... +) +process.fooPathFPGA = cms.Path( + ~process.fooCUDADeviceFilter + process.fooFPGADeviceFilter + ... +) +process.fooPathCPU = cms.Path( + ~process.fooCUDADeviceFilter + ~process.fooFPGADeviceFilter + ... 
+) +``` diff --git a/HeterogeneousCore/CUDACore/interface/CUDA.h b/HeterogeneousCore/CUDACore/interface/CUDA.h new file mode 100644 index 0000000000000..6008836aebcff --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDA.h @@ -0,0 +1,77 @@ +#ifndef HeterogeneousCore_CUDACore_CUDA_h +#define HeterogeneousCore_CUDACore_CUDA_h + +#include + +#include + +/** + * The purpose of this class is to wrap CUDA data to edm::Event in a + * way which forces correct use of various utilities. + * + * The non-default construction has to be done with CUDAScopedContext + * (in order to properly register the CUDA event). + * + * The default constructor is needed only for the ROOT dictionary generation. + * + * The CUDA event is in practice needed only for stream-stream + * synchronization, but someone with long-enough lifetime has to own + * it. Here is a somewhat natural place. If overhead is too much, we + * can e.g. make CUDAService own them (creating them on demand) and + * use them only where synchronization between streams is needed. + */ +template +class CUDA { +public: + CUDA() = default; // Needed only for ROOT dictionary generation + + CUDA(const CUDA&) = delete; + CUDA& operator=(const CUDA&) = delete; + CUDA(CUDA&&) = default; + CUDA& operator=(CUDA&&) = default; + + bool isValid() const { return streamEvent_.get() != nullptr; } + + int device() const { return device_; } + + const cuda::stream_t<>& stream() const { return streamEvent_->stream; } + cuda::stream_t<>& stream() { return streamEvent_->stream; } + + const cuda::event_t& event() const { return streamEvent_->event; } + cuda::event_t& event() { return streamEvent_->event; } + +private: + friend class CUDAScopedContext; + friend class TestCUDA; + + template + explicit CUDA(T data, const TokenOrContext& token): + streamEvent_(std::make_unique(token)), + data_(std::move(data)), + device_(token.device()) + {} + + // Using unique_ptr to support the default constructor. 
Tried + // std::optional, but cuda::stream_t and cuda::event_t have their + // move assignment operators deleted. Use a struct to save one + // memory allocation. +public: // need to be public for ROOT dicrionary generation? + struct StreamEvent { + template + explicit StreamEvent(const TokenOrContext& token): + stream(token.stream()), + event(cuda::event::create(token.device(), + cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much? + cuda::event::dont_record_timings)) // it should be a bit faster to ignore timings + {} + + cuda::stream_t<> stream; // stream_t is just a handle, the real CUDA stream is owned by CUDAToken (with long-enough life time) + cuda::event_t event; + }; +private: + std::unique_ptr streamEvent_; + T data_; + int device_ = -1; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h new file mode 100644 index 0000000000000..226ade47deecd --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -0,0 +1,95 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAScopedContext_h +#define HeterogeneousCore_CUDACore_CUDAScopedContext_h + +#include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" +#include "FWCore/Utilities/interface/Exception.h" +#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" + +#include + +/** + * The aim of this class is to do necessary per-event "initialization": + * - setting the current device + * - calling edm::WaitingTaskWithArenaHolder::doneWaiting() when necessary + * - synchronizing between CUDA streams if necessary + * and enforce that those get done in a proper way in RAII fashion. 
+ */ +class CUDAScopedContext { +public: + explicit CUDAScopedContext(const CUDAToken& token): + currentDevice_(token.device()), + setDeviceForThisScope_(currentDevice_), + stream_(token.stream()) + {} + + template + explicit CUDAScopedContext(const CUDA& data): + currentDevice_(data.device()), + setDeviceForThisScope_(currentDevice_), + stream_(data.stream()) + {} + + explicit CUDAScopedContext(const CUDAToken& token, edm::WaitingTaskWithArenaHolder waitingTaskHolder): + CUDAScopedContext(token) + { + waitingTaskHolder_ = waitingTaskHolder; + } + + template + explicit CUDAScopedContext(const CUDA& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder): + CUDAScopedContext(data) + { + waitingTaskHolder_ = waitingTaskHolder; + } + + ~CUDAScopedContext(); + + int device() const { return currentDevice_; } + + cuda::stream_t<>& stream() { return stream_; } + const cuda::stream_t<>& stream() const { return stream_; } + + template + const T& get(const CUDA& data) { + if(data.device() != currentDevice_) { + // Eventually replace with prefetch to current device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported"; + } + + if(data.stream().id() != stream_.id()) { + // Different streams, need to synchronize + if(!data.event().has_occurred()) { + // Event not yet occurred, so need to add synchronization + // here. Sychronization is done by making the CUDA stream to + // wait for an event, so all subsequent work in the stream + // will run only after the event has "occurred" (i.e. data + // product became available). 
+ auto ret = cudaStreamWaitEvent(stream_.id(), data.event().id(), 0); + cuda::throw_if_error(ret, "Failed to make a stream to wait for an event"); + } + } + + return data.data_; + } + + template + std::unique_ptr > wrap(T data) { + // make_unique doesn't work because of private constructor + auto ret = std::unique_ptr >(new CUDA(std::move(data), *this)); + // Record CUDA event to the CUDA stream. The event will become + // "occurred" after all work queued to the stream before this + // point has been finished. + ret->event().record(stream_.id()); + return ret; + } + +private: + int currentDevice_; + std::optional waitingTaskHolder_; + cuda::device::current::scoped_override_t<> setDeviceForThisScope_; + cuda::stream_t<> stream_; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h b/HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h new file mode 100644 index 0000000000000..f825cacb91567 --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h @@ -0,0 +1,46 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAStreamEDProducer_h +#define HeterogeneousCore_CUDACore_CUDAStreamEDProducer_h + +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include + +/** + * This class is a bit hacky but intended only for a transition + * period. It also duplicates the EDM stream -> CUDA device assignment. + */ +template +class CUDAStreamEDProducer: public edm::stream::EDProducer { +private: + void beginStream(edm::StreamID id) override final { + // The following checks only from CUDAService whether it is + // enabled or not. Also CUDADeviceChooser can be configured to be + // disabled, effectively disabling that "CUDA chain". + // Unfortunately we have no (easy) means here to know whether this + // EDProducer is part of such a chain. 
On the other hand, + // beginStream() is intended only for block memory allocations + // (and we will likely adjust the strategy), and the + // CUDADeviceChooser.enabled is intended for debugging/testing + // purposes, so maybe this solution is good enough (i.e. for + // debugging it doesn't matter if we allocate "too much") + edm::Service cudaService; + if(cudaService->enabled(id)) { + // This logic is duplicated from CUDADeviceChooser + int device = id % cudaService->numberOfDevices(); + cuda::device::current::scoped_override_t<> setDeviceForThisScope(device); + beginStreamCUDA(id); + } + } + + // It's a bit stupid to change the name, but I don't have anything + // additional to pass down. + // + // Note: contrary to HeterogeneousEDProducer+GPUCuda, the CUDA + // stream is *not* passed to the deriving class (there is no good + // place for a CUDA stream here in this design). + virtual void beginStreamCUDA(edm::StreamID id) = 0; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAToken.h b/HeterogeneousCore/CUDACore/interface/CUDAToken.h new file mode 100644 index 0000000000000..ef879677c720b --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAToken.h @@ -0,0 +1,43 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAToken_h +#define HeterogeneousCore_CUDACore_CUDAToken_h + +#include + +#include + +/** + * The purpose of this class is to deliver the device and CUDA stream + * information from CUDADeviceChooser to the EDModules with CUDA + * implementation. + * + * Currently the class is declared as transient in the dictionary, but + * in principle (for debugging purposes) it could be possible to + * persist it by marking only the CUDA stream as transient. + * + * Note that the CUDA stream is returned only as a const reference. + * Various methods (e.g. 
cuda::stream_t<>::synchronize()) are + * non-const, but on the other hand cuda:stream_t is just a handle + * wrapping the real CUDA stream, and can thus be cheaply copied as a + * non-owning non-const handle. + */ +class CUDAToken { +public: + CUDAToken() = default; + explicit CUDAToken(int device); + + ~CUDAToken(); + + CUDAToken(const CUDAToken&) = delete; + CUDAToken& operator=(const CUDAToken&) = delete; + CUDAToken(CUDAToken&&) = default; + CUDAToken& operator=(CUDAToken&&) = default; + + int device() const { return device_; } + const cuda::stream_t<>& stream() const { return *stream_; } + +private: + std::unique_ptr> stream_; + int device_ = -1; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/plugins/BuildFile.xml b/HeterogeneousCore/CUDACore/plugins/BuildFile.xml new file mode 100644 index 0000000000000..5749d3ccaf1ad --- /dev/null +++ b/HeterogeneousCore/CUDACore/plugins/BuildFile.xml @@ -0,0 +1,17 @@ +#Skip building plugins by dropping all files for none-AMD64 build + + + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc new file mode 100644 index 0000000000000..58411f749d60e --- /dev/null +++ b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc @@ -0,0 +1,89 @@ +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include + +#include + +namespace { + struct DeviceCache { + int device; + bool enabled; + }; +} + +class CUDADeviceChooser: public edm::global::EDProducer > { +public: + 
explicit CUDADeviceChooser(const edm::ParameterSet& iConfig); + ~CUDADeviceChooser() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + std::unique_ptr<::DeviceCache> beginStream(edm::StreamID id) const; + + void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const; + +private: + bool enabled_; +}; + +CUDADeviceChooser::CUDADeviceChooser(const edm::ParameterSet& iConfig): + enabled_(iConfig.getParameter("enabled")) +{ + produces(); +} + +void CUDADeviceChooser::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("enabled", true)->setComment("This parameter is intended for debugging purposes only. If disabling some CUDA chains is needed for production, it is better to remove the CUDA modules altogether from the configuration."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer chooses whether a chain of CUDA EDModules depending on it should run or not. The decision is communicated downstream by the existence of a 'CUDAToken' event product. Intended to be used with CUDADeviceFilter."); +} + +std::unique_ptr<::DeviceCache> CUDADeviceChooser::beginStream(edm::StreamID id) const { + auto ret = std::make_unique<::DeviceCache>(); + + edm::Service cudaService; + ret->enabled = (enabled_ && cudaService->enabled(id)); + if(!ret->enabled) { + return ret; + } + + // For startes we "statically" assign the device based on + // edm::Stream number. This is suboptimal if the number of + // edm::Streams is not a multiple of the number of CUDA devices + // (and even then there is no load balancing). + // + // TODO: improve. 
Possible ideas include + // - allocate M (< N(edm::Streams)) buffers per device per "chain of modules", choose dynamically which (buffer, device) to use + // - our own CUDA memory allocator + // * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load + // * would probably still need some buffer space/device to hold e.g. conditions data + // - for conditions, how to handle multiple lumis per job? + ret->device = id % cudaService->numberOfDevices(); + + LogDebug("CUDADeviceChooser") << "EDM stream " << id << " set to CUDA device " << ret->device; + + return ret; +} + +void CUDADeviceChooser::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + auto cache = streamCache(id); + if(!cache->enabled) { + return; + } + + auto ret = std::make_unique(cache->device); + LogDebug("CUDADeviceChooser") << "EDM stream " << id << " CUDA device " << ret->device() << " with CUDA stream " << ret->stream().id(); + iEvent.put(std::move(ret)); +} + + +DEFINE_FWK_MODULE(CUDADeviceChooser); diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc new file mode 100644 index 0000000000000..cf4a8093a94bd --- /dev/null +++ b/HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc @@ -0,0 +1,40 @@ +#include "FWCore/Framework/interface/global/EDFilter.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" + +class CUDADeviceFilter: public edm::global::EDFilter<> { +public: + explicit CUDADeviceFilter(const edm::ParameterSet& iConfig); + ~CUDADeviceFilter() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& 
descriptions); + + bool filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + +private: + edm::EDGetTokenT token_; +}; + +CUDADeviceFilter::CUDADeviceFilter(const edm::ParameterSet& iConfig): + token_(consumes(iConfig.getParameter("src"))) +{} + +void CUDADeviceFilter::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("cudaDeviceChooser"))->setComment("Source of the 'CUDAToken'."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDFilter filters based on the existence of a 'CUDAToken' event product. Intended to be used together with CUDADeviceChooser. Returns 'true' if the product exists, and 'false' if not."); +} + +bool CUDADeviceFilter::filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::Handle handle; + iEvent.getByToken(token_, handle); + return handle.isValid(); +} + +DEFINE_FWK_MODULE(CUDADeviceFilter); diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc new file mode 100644 index 0000000000000..639fc4aa2bfb8 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -0,0 +1,21 @@ +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" + +#include "FWCore/MessageLogger/interface/MessageLogger.h" + +CUDAScopedContext::~CUDAScopedContext() { + if(waitingTaskHolder_.has_value()) { + stream_.enqueue.callback([device=currentDevice_, + waitingTaskHolder=*waitingTaskHolder_] + (cuda::stream::id_t streamId, cuda::status_t status) mutable { + if(cuda::is_success(status)) { + LogTrace("CUDAScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream " << streamId; + waitingTaskHolder.doneWaiting(nullptr); + } + else { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + 
waitingTaskHolder.doneWaiting(std::make_exception_ptr(cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device << " error " << error << ": " << message)); + } + }); + } +} diff --git a/HeterogeneousCore/CUDACore/src/CUDAToken.cc b/HeterogeneousCore/CUDACore/src/CUDAToken.cc new file mode 100644 index 0000000000000..3493bd49a0fcd --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/CUDAToken.cc @@ -0,0 +1,23 @@ +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" + +namespace { + auto make_stream(int device) { + cuda::device::current::scoped_override_t<> setDeviceForThisScope(device); + auto current_device = cuda::device::current::get(); + return std::make_unique>(current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)); + } +} + +CUDAToken::CUDAToken(int device): + stream_(make_stream(device)), + device_(device) +{} + +CUDAToken::~CUDAToken() { + if(stream_) { + // The current memory allocation model (large blocks) requires the + // CUDA stream to be synchronized before moving on to the next + // event in the EDM stream in order to avoid race conditions. + stream_->synchronize(); + } +} diff --git a/HeterogeneousCore/CUDACore/src/classes.h b/HeterogeneousCore/CUDACore/src/classes.h new file mode 100644 index 0000000000000..8e7fcca0008f0 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/classes.h @@ -0,0 +1,12 @@ +#include "DataFormats/Common/interface/Wrapper.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" +#include "HeterogeneousCore/CUDACore/interface/CUDA.h" + +namespace { + struct dictionary { + CUDAToken ct; + + // These should really be placed elsewhere? 
+ CUDA cf; + }; +} diff --git a/HeterogeneousCore/CUDACore/src/classes_def.xml b/HeterogeneousCore/CUDACore/src/classes_def.xml new file mode 100644 index 0000000000000..b551c0c74d721 --- /dev/null +++ b/HeterogeneousCore/CUDACore/src/classes_def.xml @@ -0,0 +1,7 @@ + + + + + + + diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml new file mode 100644 index 0000000000000..97593098d272d --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + + + + + diff --git a/HeterogeneousCore/CUDACore/test/TestCUDA.h b/HeterogeneousCore/CUDACore/test/TestCUDA.h new file mode 100644 index 0000000000000..d1dd82df1e9ee --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/TestCUDA.h @@ -0,0 +1,23 @@ +#ifndef HeterogeneousCore_CUDACore_TestCUDA_h +#define HeterogeneousCore_CUDACore_TestCUDA_h + +#include "HeterogeneousCore/CUDACore/interface/CUDA.h" + +/** + * This class is intended only for testing purposes. It allows to + * construct CUDA and get the T from CUDA without CUDAScopedContext. + */ +class TestCUDA { +public: + template + static CUDA create(T data, Args&&... 
args) { + return CUDA(std::move(data), std::forward(args)...); + } + + template + static const T& get(const CUDA& data) { + return data.data_; + } +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerCPU.cc b/HeterogeneousCore/CUDACore/test/TestCUDAProducerCPU.cc new file mode 100644 index 0000000000000..0fe14abdfbd27 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/TestCUDAProducerCPU.cc @@ -0,0 +1,67 @@ +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include +#include +#include + +class TestCUDAProducerCPU: public edm::global::EDProducer<> { +public: + explicit TestCUDAProducerCPU(const edm::ParameterSet& iConfig); + ~TestCUDAProducerCPU() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const; +private: + std::string label_; + edm::EDGetTokenT srcToken_; +}; + +TestCUDAProducerCPU::TestCUDAProducerCPU(const edm::ParameterSet& iConfig): + label_(iConfig.getParameter("@module_label")) +{ + auto srcTag = iConfig.getParameter("src"); + if(!srcTag.label().empty()) { + srcToken_ = consumes(srcTag); + } + + produces(); +} + +void TestCUDAProducerCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag())->setComment("Optional source of another TestCUDAProducerCPU."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. 
It models a CPU algorithm."); +} + +void TestCUDAProducerCPU::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::LogPrint("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce begin event " << iEvent.id().event() << " stream " << id; + + int input = 0; + if(!srcToken_.isUninitialized()) { + edm::Handle hin; + iEvent.getByToken(srcToken_, hin); + input = *hin; + } + + std::random_device r; + std::mt19937 gen(r()); + auto dist = std::uniform_real_distribution<>(0.2, 1.5); + auto dur = dist(gen); + edm::LogPrint("TestCUDAProducerCPU") << " Task (CPU) for event " << iEvent.id().event() << " in stream " << id << " will take " << dur << " seconds"; + std::this_thread::sleep_for(std::chrono::seconds(1)*dur); + + const unsigned int output = input + id*100 + iEvent.id().event(); + + iEvent.put(std::make_unique(output)); + + edm::LogPrint("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce end event " << iEvent.id().event() << " stream " << id << " result " << output; +} + +DEFINE_FWK_MODULE(TestCUDAProducerCPU); diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerFallback.cc b/HeterogeneousCore/CUDACore/test/TestCUDAProducerFallback.cc new file mode 100644 index 0000000000000..fd2b6842c39df --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/TestCUDAProducerFallback.cc @@ -0,0 +1,56 @@ +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/transform.h" + +class TestCUDAProducerFallback: public edm::global::EDProducer<> { +public: + explicit TestCUDAProducerFallback(const edm::ParameterSet& iConfig); + ~TestCUDAProducerFallback() override = default; + + static void 
fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const; + +private: + std::string label_; + std::vector> tokens_; +}; + +TestCUDAProducerFallback::TestCUDAProducerFallback(const edm::ParameterSet& iConfig): + label_(iConfig.getParameter("@module_label")), + tokens_(edm::vector_transform(iConfig.getParameter >("src"), + [this](const edm::InputTag& tag) { + return consumes(tag); + })) +{ + produces(); +} + +void TestCUDAProducerFallback::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add>("src", std::vector{})->setComment("Ordered list of input 'int' inputs."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It acts as an enhanced EDAlias with a defined order of inputs. I.e. if first input is available, copy that. If not, try the next one etc. If no inputs are available, throw an exception. 
To be replaced with an EDAlias-style feature in the framework."); +} + +void TestCUDAProducerFallback::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::LogPrint("TestCUDAProducerFallback") << label_ << " TestCUDAProducerFallback::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + edm::Handle hin; + for(const auto& token: tokens_) { + edm::EDConsumerBase::Labels labels; + labelsForToken(token, labels); + if(iEvent.getByToken(token, hin)) { + edm::LogPrint("TestCUDAProducerFallback") << label_ << " input " << labels.module << " found"; + iEvent.put(std::make_unique(*hin)); + return; + } + edm::LogPrint("TestCUDAProducerFallback") << label_ << " input " << labels.module << " NOT found"; + } + throw cms::Exception("ProductNotFound") << "Unable to find product 'int' from any of the inputs"; +} + +DEFINE_FWK_MODULE(TestCUDAProducerFallback); diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPU.cc new file mode 100644 index 0000000000000..bf7b6d8563d66 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPU.cc @@ -0,0 +1,61 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/CUDA.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPU: public CUDAStreamEDProducer<> { +public: + explicit TestCUDAProducerGPU(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPU() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void 
beginStreamCUDA(edm::StreamID id) override; + + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup); +private: + std::string label_; + edm::EDGetTokenT> srcToken_; + std::unique_ptr gpuAlgo_; +}; + +TestCUDAProducerGPU::TestCUDAProducerGPU(const edm::ParameterSet& iConfig): + label_(iConfig.getParameter("@module_label")), + srcToken_(consumes>(iConfig.getParameter("src"))) +{ + produces>(); +} + +void TestCUDAProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag())->setComment("Source of CUDA."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this is not the first algorithm in the chain of the GPU EDProducers. Produces CUDA."); +} + +void TestCUDAProducerGPU::beginStreamCUDA(edm::StreamID id) { + // Allocate device memory via RAII + gpuAlgo_ = std::make_unique(); +} + +void TestCUDAProducerGPU::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + edm::LogPrint("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + edm::Handle > hin; + iEvent.getByToken(srcToken_, hin); + auto ctx = CUDAScopedContext(*hin); + const float *input = ctx.get(*hin); + + iEvent.put(ctx.wrap(gpuAlgo_->runAlgo(label_, input, ctx.stream()))); + + edm::LogPrint("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPU); diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUEW.cc new file mode 100644 index 0000000000000..b37a42ebcf653 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUEW.cc @@ -0,0 +1,83 @@ +#include "FWCore/Framework/interface/Event.h" +#include 
"FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/CUDA.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPUEW: public CUDAStreamEDProducer { +public: + explicit TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPUEW() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void beginStreamCUDA(edm::StreamID id) override; + + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; +private: + std::string label_; + edm::EDGetTokenT> srcToken_; + std::unique_ptr gpuAlgo_; + float *devicePtr_ = nullptr; + float hostData_ = 0.f; +}; + +TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig): + label_(iConfig.getParameter("@module_label")), + srcToken_(consumes>(iConfig.getParameter("src"))) +{ + produces>(); +} + +void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag()); + descriptions.addWithDefaultLabel(desc); +} + +void TestCUDAProducerGPUEW::beginStreamCUDA(edm::StreamID id) { + // Allocate device memory via RAII + gpuAlgo_ = std::make_unique(); +} + +void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire begin event " << iEvent.id().event() << " stream " << 
iEvent.streamID(); + + edm::Handle > hin; + iEvent.getByToken(srcToken_, hin); + auto ctx = CUDAScopedContext(*hin, std::move(waitingTaskHolder)); + const float *input = ctx.get(*hin); + + devicePtr_ = gpuAlgo_->runAlgo(label_, input, ctx.stream()); + // Mimick the need to transfer some of the GPU data back to CPU to + // be used for something within this module, or to be put in the + // event. + cuda::memory::async::copy(&hostData_, devicePtr_+10, sizeof(float), ctx.stream().id()); + + edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << hostData_; + + // It feels a bit stupid to read the input again here, but for + // anything else we'd need to somehow transfer the device+stream + // information from acquire. 
+ edm::Handle > hin; + iEvent.getByToken(srcToken_, hin); + auto ctx = CUDAScopedContext(*hin); + + iEvent.put(ctx.wrap(devicePtr_)); + devicePtr_ = nullptr; + + edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPUEW); diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUFirst.cc new file mode 100644 index 0000000000000..bf9496f93bf5f --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUFirst.cc @@ -0,0 +1,63 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" +#include "HeterogeneousCore/CUDACore/interface/CUDA.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPUFirst: public CUDAStreamEDProducer<> { +public: + explicit TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPUFirst() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + void beginStreamCUDA(edm::StreamID id) override; + + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup); +private: + std::string label_; + edm::EDGetTokenT srcToken_; + std::unique_ptr gpuAlgo_; +}; + +TestCUDAProducerGPUFirst::TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig): + label_(iConfig.getParameter("@module_label")), + srcToken_(consumes(iConfig.getParameter("src"))) +{ + produces>(); +} + +void 
TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag())->setComment("Source of CUDAToken."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this the first algorithm in the chain of the GPU EDProducers, so it reads a CUDAToken. Produces CUDA."); +} + +void TestCUDAProducerGPUFirst::beginStreamCUDA(edm::StreamID id) { + // Allocate device memory via RAII + gpuAlgo_ = std::make_unique(); +} + +void TestCUDAProducerGPUFirst::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + edm::LogPrint("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + edm::Handle htoken; + iEvent.getByToken(srcToken_, htoken); + + auto ctx = CUDAScopedContext(*htoken); + + float *output = gpuAlgo_->runAlgo(label_, ctx.stream()); + iEvent.put(ctx.wrap(output)); + + edm::LogPrint("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPUFirst); diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.cu new file mode 100644 index 0000000000000..aa22d330be374 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.cu @@ -0,0 +1,112 @@ +#include "TestCUDAProducerGPUKernel.h" + +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/Utilities/interface/Exception.h" + +namespace { + template + __global__ + void vectorAdd(const T *a, const T *b, T *c, int numElements) { + int i = blockIdx.x * blockDim.x + threadIdx.x; + if(i < numElements) { c[i] = a[i] + b[i]; } + } + + template + __global__ + void vectorProd(const T *a, const T *b, T 
*c, int numElements) { + int row = blockIdx.y*blockDim.y + threadIdx.y; + int col = blockIdx.x*blockDim.x + threadIdx.x; + + if(row < numElements && col < numElements) { + c[row*numElements + col] = a[row]*b[col]; + } + } + + template + __global__ + void matrixMul(const T *a, const T *b, T *c, int numElements) { + int row = blockIdx.y*blockDim.y + threadIdx.y; + int col = blockIdx.x*blockDim.x + threadIdx.x; + + if(row < numElements && col < numElements) { + T tmp = 0; + for(int i=0; i + __global__ + void matrixMulVector(const T *a, const T *b, T *c, int numElements) { + int row = blockIdx.y*blockDim.y + threadIdx.y; + + if(row < numElements) { + T tmp = 0; + for(int i=0; i(NUM_VALUES); + h_b = cuda::memory::host::make_unique(NUM_VALUES); + + auto current_device = cuda::device::current::get(); + d_a = cuda::memory::device::make_unique(current_device, NUM_VALUES); + d_b = cuda::memory::device::make_unique(current_device, NUM_VALUES); + d_c = cuda::memory::device::make_unique(current_device, NUM_VALUES); + + d_ma = cuda::memory::device::make_unique(current_device, NUM_VALUES*NUM_VALUES); + d_mb = cuda::memory::device::make_unique(current_device, NUM_VALUES*NUM_VALUES); + d_mc = cuda::memory::device::make_unique(current_device, NUM_VALUES*NUM_VALUES); +} + +float *TestCUDAProducerGPUKernel::runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) { + // First make the sanity check + if(d_input != nullptr) { + auto h_check = std::make_unique(NUM_VALUES); + cuda::memory::copy(h_check.get(), d_input, NUM_VALUES*sizeof(float)); + for(int i=0; i>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); + + dim3 threadsPerBlock3{NUM_VALUES, NUM_VALUES}; + dim3 blocksPerGrid3{1,1}; + if(NUM_VALUES*NUM_VALUES > 32) { + threadsPerBlock3.x = 32; + threadsPerBlock3.y = 32; + blocksPerGrid3.x = ceil(double(NUM_VALUES)/double(threadsPerBlock3.x)); + blocksPerGrid3.y = ceil(double(NUM_VALUES)/double(threadsPerBlock3.y)); + } + vectorProd<<>>(d_a.get(), d_b.get(), 
d_ma.get(), NUM_VALUES); + vectorProd<<>>(d_a.get(), d_c.get(), d_mb.get(), NUM_VALUES); + matrixMul<<>>(d_ma.get(), d_mb.get(), d_mc.get(), NUM_VALUES); + + matrixMulVector<<>>(d_mc.get(), d_b.get(), d_c.get(), NUM_VALUES); + + edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label << " GPU kernels launched, returning return pointer device " << current_device.id() << " CUDA stream " << stream.id(); + return d_a.get(); +} diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.h b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.h new file mode 100644 index 0000000000000..e462516716f70 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.h @@ -0,0 +1,43 @@ +#ifndef HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h +#define HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h + +#include + +/** + * This class models the actual CUDA implementation of an algorithm. + * It follows RAII, i.e. does all memory allocations in its + * constructor. + * + * The algorithm is intended to waste time with large matrix + * operations so that the asynchronous nature of the CUDA integration + * becomes visible with debug prints. 
+ */ +class TestCUDAProducerGPUKernel { +public: + static constexpr int NUM_VALUES = 4000; + + TestCUDAProducerGPUKernel(); + ~TestCUDAProducerGPUKernel() = default; + + // returns (non-owning) pointer to device memory + float *runAlgo(const std::string& label, cuda::stream_t<>& stream) { + return runAlgo(label, nullptr, stream); + } + float *runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream); + +private: + // stored for the job duration + cuda::memory::host::unique_ptr h_a; + cuda::memory::host::unique_ptr h_b; + cuda::memory::device::unique_ptr d_a; + cuda::memory::device::unique_ptr d_b; + cuda::memory::device::unique_ptr d_c; + cuda::memory::device::unique_ptr d_ma; + cuda::memory::device::unique_ptr d_mb; + cuda::memory::device::unique_ptr d_mc; + + // temporary storage, need to be somewhere to allow async execution + cuda::memory::device::unique_ptr d_d; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUtoCPU.cc new file mode 100644 index 0000000000000..7ab018eba6402 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUtoCPU.cc @@ -0,0 +1,78 @@ +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" + +#include "TestCUDAProducerGPUKernel.h" + +class TestCUDAProducerGPUtoCPU: public CUDAStreamEDProducer { +public: + explicit TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig); + ~TestCUDAProducerGPUtoCPU() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& 
descriptions); + + void beginStreamCUDA(edm::StreamID id) override; + + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; +private: + std::string label_; + edm::EDGetTokenT> srcToken_; + cuda::memory::host::unique_ptr buffer_; +}; + +TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig): + label_(iConfig.getParameter("@module_label")), + srcToken_(consumes>(iConfig.getParameter("src"))) +{ + produces(); +} + +void TestCUDAProducerGPUtoCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag())->setComment("Source for CUDA."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models the GPU->CPU data transfer and formatting of the data to legacy data format. 
Produces int, to be compatible with TestCUDAProducerCPU."); +} + +void TestCUDAProducerGPUtoCPU::beginStreamCUDA(edm::StreamID id) { + // Pinned host memory has to be allocated here as well so that it is + // not done when running on a non-GPU machine + buffer_ = cuda::memory::host::make_unique(TestCUDAProducerGPUKernel::NUM_VALUES); +} + +void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + edm::LogPrint("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + edm::Handle> hin; + iEvent.getByToken(srcToken_, hin); + auto ctx = CUDAScopedContext(*hin, std::move(waitingTaskHolder)); + const float *device = ctx.get(*hin); + + // Enqueue async copy, continue in produce once finished + cuda::memory::async::copy(buffer_.get(), device, TestCUDAProducerGPUKernel::NUM_VALUES*sizeof(float), ctx.stream().id()); + + edm::LogPrint("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); +} + +void TestCUDAProducerGPUtoCPU::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + edm::LogPrint("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + int counter = 0; + for(int i=0; i(counter)); + + edm::LogPrint("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " result " << counter; +} + +DEFINE_FWK_MODULE(TestCUDAProducerGPUtoCPU); diff --git a/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py b/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py new file mode 100644 index 0000000000000..28f9f2539b854 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py @@ -0,0 +1,121 @@ +import 
FWCore.ParameterSet.Config as cms + +process = cms.Process("Test") +process.load("FWCore.MessageService.MessageLogger_cfi") +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") + +process.source = cms.Source("EmptySource") + +process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(10) ) + +process.options = cms.untracked.PSet( +# numberOfThreads = cms.untracked.uint32(4), + numberOfStreams = cms.untracked.uint32(0) +) +#process.Tracer = cms.Service("Tracer") + +# Flow diagram of the modules +# +# 1 5 +# / \ +# 2 4 +# | +# 3 +# +# CPU producers +from HeterogeneousCore.CUDACore.testCUDAProducerCPU_cfi import testCUDAProducerCPU +process.prod1CPU = testCUDAProducerCPU.clone() +process.prod2CPU = testCUDAProducerCPU.clone(src = "prod1CPU") +process.prod3CPU = testCUDAProducerCPU.clone(src = "prod2CPU") +process.prod4CPU = testCUDAProducerCPU.clone(src = "prod1CPU") +process.prod5CPU = testCUDAProducerCPU.clone() + +# Module to decide whether the chain of CUDA modules are run +from HeterogeneousCore.CUDACore.cudaDeviceChooser_cfi import cudaDeviceChooser +process.prodCUDADevice = cudaDeviceChooser.clone() + +# Filter to disable a Path in case we don't run on CUDA +from HeterogeneousCore.CUDACore.cudaDeviceFilter_cfi import cudaDeviceFilter +process.prodCUDADeviceFilter = cudaDeviceFilter.clone(src = "prodCUDADevice") + +from HeterogeneousCore.CUDACore.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst +from HeterogeneousCore.CUDACore.testCUDAProducerGPU_cfi import testCUDAProducerGPU +from HeterogeneousCore.CUDACore.testCUDAProducerGPUEW_cfi import testCUDAProducerGPUEW +from HeterogeneousCore.CUDACore.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU + +# GPU producers +process.prod1CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADevice") +process.prod2CUDA = testCUDAProducerGPU.clone(src = "prod1CUDA") +process.prod3CUDA = testCUDAProducerGPU.clone(src = "prod2CUDA") +process.prod4CUDA = testCUDAProducerGPUEW.clone(src 
= "prod1CUDA") +process.prod5CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADevice") + +# Modules to copy data from GPU to CPU (as "on demand" as any other +# EDProducer, i.e. according to consumes() and prefetching) +process.prod1FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod1CUDA") +process.prod2FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod2CUDA") +process.prod3FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod3CUDA") +process.prod4FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod4CUDA") +process.prod5FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod5CUDA") + +# These ones are to provide backwards compatibility to the downstream +# clients. To be replaced with an enhanced version of EDAlias (with an +# ordered fallback mechanism). +from HeterogeneousCore.CUDACore.testCUDAProducerFallback_cfi import testCUDAProducerFallback +process.prod1 = testCUDAProducerFallback.clone(src = ["prod1FromCUDA", "prod1cpu"]) +process.prod2 = testCUDAProducerFallback.clone(src = ["prod2FromCUDA", "prod2cpu"]) +process.prod3 = testCUDAProducerFallback.clone(src = ["prod3FromCUDA", "prod3cpu"]) +process.prod4 = testCUDAProducerFallback.clone(src = ["prod4FromCUDA", "prod4cpu"]) +process.prod5 = testCUDAProducerFallback.clone(src = ["prod5FromCUDA", "prod5cpu"]) + +process.out = cms.OutputModule("AsciiOutputModule", + outputCommands = cms.untracked.vstring( + "keep *_prod3_*_*", + "keep *_prod4_*_*", + "keep *_prod5_*_*", + ), + verbosity = cms.untracked.uint32(0), +) + +process.prodCPU1 = cms.Path( + ~process.prodCUDADeviceFilter + + process.prod1CPU + + process.prod2CPU + + process.prod3CPU + + process.prod4CPU +) +process.prodCUDA1 = cms.Path( + process.prodCUDADeviceFilter + + process.prod1CUDA + + process.prod2CUDA + + process.prod3CUDA + + process.prod4CUDA +) + +process.prodCPU5 = cms.Path( + ~process.prodCUDADeviceFilter + + process.prod5CPU +) +process.prodCUDA5 = cms.Path( + process.prodCUDADeviceFilter + + process.prod5CUDA +) + +process.t = 
cms.Task( + process.prodCUDADevice, + + # Eventually the goal is to specify these as part of a Task, + # but (at least) as long as the fallback mechanism is implemented + # with an EDProducer, they must be in a Path. +# process.prod2CPU, process.prod3CPU, process.prod4CPU, +# process.prod2CUDA, process.prod3CUDA, process.prod4CUDA, + + process.prod1FromCUDA, process.prod2FromCUDA, process.prod3FromCUDA, process.prod4FromCUDA, process.prod5FromCUDA, + process.prod1, process.prod2, process.prod3, process.prod4, process.prod5, +) +process.p = cms.Path() +process.p.associate(process.t) +process.ep = cms.EndPath(process.out) + +# Example of limiting the number of EDM streams per device +#process.CUDAService.numberOfStreamsPerDevice = 1 diff --git a/HeterogeneousCore/CUDACore/test/test_CUDA.cc b/HeterogeneousCore/CUDACore/test/test_CUDA.cc new file mode 100644 index 0000000000000..8524f6d552bba --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDA.cc @@ -0,0 +1,59 @@ +#include "catch.hpp" + +#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "TestCUDA.h" + +#include + +TEST_CASE("Use of CUDA template", "[CUDACore]") { + SECTION("Default constructed") { + auto foo = CUDA(); + REQUIRE(!foo.isValid()); + + auto bar = std::move(foo); + } + + int deviceCount = 0; + auto ret = cudaGetDeviceCount( &deviceCount ); + if( ret != cudaSuccess ) { + WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" + << ret << ") " << cudaGetErrorString( ret ) + << ". 
Ignoring tests requiring device to be present."); + return; + } + + constexpr int defaultDevice = 0; + { + auto token = CUDAToken(defaultDevice); + CUDA data = TestCUDA::create(10, token); + + SECTION("Construct from CUDAToken") { + REQUIRE(data.isValid()); + REQUIRE(data.device() == defaultDevice); + REQUIRE(data.stream().id() == token.stream().id()); + REQUIRE(&data.event() != nullptr); + } + + SECTION("Move constructor") { + auto data2 = CUDA(std::move(data)); + REQUIRE(data2.isValid()); + REQUIRE(!data.isValid()); + } + + SECTION("Move assignment") { + CUDA data2; + data2 = std::move(data); + REQUIRE(data2.isValid()); + REQUIRE(!data.isValid()); + } + } + + // Destroy and clean up all resources so that the next test can + // assume to start from a clean state. + cudaCheck(cudaSetDevice(defaultDevice)); + cudaCheck(cudaDeviceSynchronize()); + cudaDeviceReset(); +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooser.cc b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooser.cc new file mode 100644 index 0000000000000..40518eea7a330 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooser.cc @@ -0,0 +1,96 @@ +#include "catch.hpp" +#include "FWCore/TestProcessor/interface/TestProcessor.h" +#include "FWCore/Utilities/interface/Exception.h" + +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" + +#include + +static constexpr auto s_tag = "[CUDADeviceChooser]"; + +TEST_CASE("Standard checks of CUDADeviceChooser", s_tag) { + const std::string baseConfig{ +R"_(from FWCore.TestProcessor.TestProcess import * +process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.toTest = cms.EDProducer("CUDADeviceChooser") +process.moduleToTest(process.toTest) +)_" + }; + + edm::test::TestProcessor::Config config{ baseConfig }; + SECTION("base configuration is OK") { + REQUIRE_NOTHROW(edm::test::TestProcessor(config)); + } + + SECTION("No event data") { + edm::test::TestProcessor tester(config); + + 
REQUIRE_NOTHROW(tester.test()); + } + + SECTION("beginJob and endJob only") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testBeginAndEndJobOnly()); + } + + SECTION("Run with no LuminosityBlocks") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testRunWithNoLuminosityBlocks()); + } + + SECTION("LuminosityBlock with no Events") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testLuminosityBlockWithNoEvents()); + } + +} + +TEST_CASE("CUDADeviceChooser enabled", s_tag) { + const std::string config{ +R"_(from FWCore.TestProcessor.TestProcess import * +process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.toTest = cms.EDProducer("CUDADeviceChooser") +process.moduleToTest(process.toTest) +)_" + }; + + int deviceCount = 0; + auto ret = cudaGetDeviceCount( &deviceCount ); + if( ret != cudaSuccess ) { + WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" + << ret << ") " << cudaGetErrorString( ret ) + << ". 
Ignoring tests requiring device to be present."); + return; + } + + SECTION("CUDAToken") { + edm::test::TestProcessor tester{config}; + auto event = tester.test(); + + REQUIRE(event.get()->device() >= 0); + REQUIRE(event.get()->stream().id() != nullptr); + } +} + +TEST_CASE("CUDADeviceChooser disabled", s_tag) { + const std::string config{ +R"_(from FWCore.TestProcessor.TestProcess import * +process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.toTest = cms.EDProducer("CUDADeviceChooser", enabled=cms.bool(False)) +process.moduleToTest(process.toTest) +)_" + }; + + SECTION("CUDAToken") { + edm::test::TestProcessor tester{config}; + auto event = tester.test(); + + REQUIRE_THROWS_AS(event.get()->device(), cms::Exception); + } +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc new file mode 100644 index 0000000000000..5cdde908ba69b --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -0,0 +1,103 @@ +#include "catch.hpp" + +#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" + +#include "TestCUDA.h" +#include "test_CUDAScopedContextKernels.h" + +namespace { + std::unique_ptr > produce(const CUDAToken& token, int *d, int *h) { + auto ctx = CUDAScopedContext(token); + + cuda::memory::async::copy(d, h, sizeof(int), ctx.stream().id()); + testCUDAScopedContextKernels_single(d, ctx.stream()); + return ctx.wrap(d); + } +} + +TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { + int deviceCount = 0; + auto ret = cudaGetDeviceCount( &deviceCount ); + if( ret != cudaSuccess ) { + WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" + << ret << ") " << cudaGetErrorString( ret ) + << ". 
Ignoring tests requiring device to be present."); + return; + } + + constexpr int defaultDevice = 0; + { + auto token = CUDAToken(defaultDevice); + + SECTION("From CUDAToken") { + auto ctx = CUDAScopedContext(token); + REQUIRE(cuda::device::current::get().id() == token.device()); + REQUIRE(ctx.stream().id() == token.stream().id()); + } + + SECTION("From CUDA") { + const CUDA data = TestCUDA::create(10, token); + + auto ctx = CUDAScopedContext(data); + REQUIRE(cuda::device::current::get().id() == data.device()); + REQUIRE(ctx.stream().id() == data.stream().id()); + } + + SECTION("Wrap T to CUDA") { + auto ctx = CUDAScopedContext(token); + + std::unique_ptr > dataPtr = ctx.wrap(10); + REQUIRE(dataPtr.get() != nullptr); + REQUIRE(dataPtr->device() == ctx.device()); + REQUIRE(dataPtr->stream().id() == ctx.stream().id()); + } + + SECTION("Joining multiple CUDA streams") { + cuda::device::current::scoped_override_t<> setDeviceForThisScope(defaultDevice); + auto current_device = cuda::device::current::get(); + + // Mimick a producer on the second CUDA stream + int h_a1 = 1; + auto d_a1 = cuda::memory::device::make_unique(current_device); + auto wprod1 = produce(token, d_a1.get(), &h_a1); + + // Mimick a producer on the second CUDA stream + auto token2 = CUDAToken(defaultDevice); + REQUIRE(token.stream().id() != token2.stream().id()); + int h_a2 = 2; + auto d_a2 = cuda::memory::device::make_unique(current_device); + auto wprod2 = produce(token2, d_a2.get(), &h_a2); + + // Mimick a third producer "joining" the two streams + auto ctx = CUDAScopedContext(token); + + auto prod1 = ctx.get(*wprod1); + auto prod2 = ctx.get(*wprod2); + + auto d_a3 = cuda::memory::device::make_unique(current_device); + testCUDAScopedContextKernels_join(prod1, prod2, d_a3.get(), ctx.stream()); + ctx.stream().synchronize(); + REQUIRE(wprod2->event().has_occurred()); + + h_a1 = 0; + h_a2 = 0; + int h_a3 = 0; + cuda::memory::async::copy(&h_a1, d_a1.get(), sizeof(int), ctx.stream().id()); + 
cuda::memory::async::copy(&h_a2, d_a2.get(), sizeof(int), ctx.stream().id()); + cuda::memory::async::copy(&h_a3, d_a3.get(), sizeof(int), ctx.stream().id()); + + REQUIRE(h_a1 == 2); + REQUIRE(h_a2 == 4); + REQUIRE(h_a3 == 6); + } + } + + // Destroy and clean up all resources so that the next test can + // assume to start from a clean state. + cudaCheck(cudaSetDevice(defaultDevice)); + cudaCheck(cudaDeviceSynchronize()); + cudaDeviceReset(); +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu new file mode 100644 index 0000000000000..18bdf50abeaa5 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.cu @@ -0,0 +1,24 @@ +#include "test_CUDAScopedContextKernels.h" + +#include +#include + +namespace { + __global__ + void single_mul(int *d) { + d[0] = d[0]*2; + } + + __global__ + void join_add(const int *d1, const int *d2, int *d3) { + d3[0] = d1[0] + d2[0]; + } +} + +void testCUDAScopedContextKernels_single(int *d, cuda::stream_t<>& stream) { + single_mul<<<1, 1, 0, stream.id()>>>(d); +} + +void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cuda::stream_t<>& stream) { + join_add<<<1, 1, 0, stream.id()>>>(d1, d2, d3); +} diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h new file mode 100644 index 0000000000000..9d3f9ce33bc97 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContextKernels.h @@ -0,0 +1,9 @@ +#ifndef HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h +#define HeterogeneousCore_CUDACore_test_CUDAScopedContextKernels_h + +#include + +void testCUDAScopedContextKernels_single(int *d, cuda::stream_t<>& stream); +void testCUDAScopedContextKernels_join(const int *d1, const int *d2, int *d3, cuda::stream_t<>& stream); + +#endif diff --git 
a/HeterogeneousCore/CUDACore/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDACore/test/test_TestCUDAProducerGPUFirst.cc new file mode 100644 index 0000000000000..c1a57db940018 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_TestCUDAProducerGPUFirst.cc @@ -0,0 +1,103 @@ +#include "catch.hpp" +#include "FWCore/TestProcessor/interface/TestProcessor.h" +#include "FWCore/Utilities/interface/Exception.h" + +#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" + +#include "TestCUDA.h" + +#include + +static constexpr auto s_tag = "[TestCUDAProducerGPUFirst]"; + +TEST_CASE("Standard checks of TestCUDAProducerGPUFirst", s_tag) { + const std::string baseConfig{ +R"_(from FWCore.TestProcessor.TestProcess import * +process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst") +process.moduleToTest(process.toTest) +)_" + }; + + edm::test::TestProcessor::Config config{ baseConfig }; + SECTION("base configuration is OK") { + REQUIRE_NOTHROW(edm::test::TestProcessor(config)); + } + + SECTION("No event data") { + edm::test::TestProcessor tester(config); + + REQUIRE_THROWS_AS(tester.test(), cms::Exception); + //If the module does not throw when given no data, substitute + //REQUIRE_NOTHROW for REQUIRE_THROWS_AS + } + + SECTION("beginJob and endJob only") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testBeginAndEndJobOnly()); + } + + SECTION("Run with no LuminosityBlocks") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testRunWithNoLuminosityBlocks()); + } + + SECTION("LuminosityBlock with no Events") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testLuminosityBlockWithNoEvents()); + } + +} + +TEST_CASE("TestCUDAProducerGPUFirst operation", s_tag) { + const std::string baseConfig{ +R"_(from FWCore.TestProcessor.TestProcess import * 
+process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst", + src = cms.InputTag("deviceChooser") +) +process.moduleToTest(process.toTest) +)_" + }; + edm::test::TestProcessor::Config config{ baseConfig }; + + int deviceCount = 0; + auto ret = cudaGetDeviceCount( &deviceCount ); + if( ret != cudaSuccess ) { + WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" + << ret << ") " << cudaGetErrorString( ret ) + << ". Ignoring tests requiring device to be present."); + return; + } + + auto putToken = config.produces("deviceChooser"); + + constexpr int defaultDevice = 0; + + SECTION("Produce") { + edm::test::TestProcessor tester{config}; + auto tokenPtr = std::make_unique(defaultDevice); + auto event = tester.test(std::make_pair(putToken, std::move(tokenPtr))); + auto prod = event.get >(); + REQUIRE(prod->device() == defaultDevice); + const float *data = TestCUDA::get(*prod); + REQUIRE(data != nullptr); + + float firstElements[10]; + cuda::memory::async::copy(firstElements, data, sizeof(float)*10, prod->stream().id()); + + std::cout << "Synchronizing with CUDA stream" << std::endl; + auto stream = prod->stream(); + stream.synchronize(); + std::cout << "Synchronized" << std::endl; + REQUIRE(firstElements[0] == 0.f); + REQUIRE(firstElements[1] == 1.f); + REQUIRE(firstElements[9] == 9.f); + } +}; diff --git a/HeterogeneousCore/CUDACore/test/test_main.cc b/HeterogeneousCore/CUDACore/test/test_main.cc new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_main.cc @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" From c186d16cf23cb456108103fcbd91be6a8987e68f Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 3 Aug 2018 15:21:18 +0200 Subject: [PATCH 02/49] Merge CUDADeviceChooser and CUDADeviceFilter to CUDADeviceChooserFilter, add CUDADeviceChooserProducer Developments Fix unit test 
Unit test for CUDADeviceChooserProducer --- HeterogeneousCore/CUDACore/README.md | 50 +++++---- .../CUDACore/plugins/CUDADeviceChooser.cc | 89 --------------- .../plugins/CUDADeviceChooserFilter.cc | 76 +++++++++++++ .../plugins/CUDADeviceChooserProducer.cc | 68 ++++++++++++ .../CUDACore/plugins/CUDADeviceFilter.cc | 40 ------- .../CUDACore/plugins/chooseCUDADevice.cc | 17 +++ .../CUDACore/plugins/chooseCUDADevice.h | 10 ++ .../CUDACore/test/testCUDA_cfg.py | 16 +-- ...ser.cc => test_CUDADeviceChooserFilter.cc} | 14 +-- .../test/test_CUDADeviceChooserProducer.cc | 103 ++++++++++++++++++ 10 files changed, 313 insertions(+), 170 deletions(-) delete mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc create mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc create mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc delete mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc create mode 100644 HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.cc create mode 100644 HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.h rename HeterogeneousCore/CUDACore/test/{test_CUDADeviceChooser.cc => test_CUDADeviceChooserFilter.cc} (84%) create mode 100644 HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md index 05e524a08a87f..28e24f648e7eb 100644 --- a/HeterogeneousCore/CUDACore/README.md +++ b/HeterogeneousCore/CUDACore/README.md @@ -21,38 +21,42 @@ deployed and `HeterogeneousEDProducer` retired. ## Choosing device -The device choosing logic is split to an EDProducer, an EDFilter, and -use of Paths in the configuration. - -First, a `CUDADeviceChooser` EDProducer is run. It has the logic to -device whether the following chain of EDModules should run on a CUDA -device or not, and if yes, on which CUDA device. 
If it decides "yes", -it produces a `CUDAToken`, which contains the device id and a CUDA -stream. If it decides "no", it does not produce anything. - -Next step is a `CUDADeviceFilter` EDFilter. It checks whether the -`CUDADeviceChooser` produced a product or not. If "yes", it returns -`true`, and if "no", it returns `false`. - -Finally, the pieces need to be put together in the configuration. The -`CUDADeviceChooser` can be "anywhere", but the `CUDADeviceFilter` -should be the first module on a `cms.Path`, followed by the CUDA -EDProducers (in the future it may become sufficient to have only the -first EDProducer of a chain in the `Path`). +### Dynamically between GPU and CPU + +The device choosing (CPU vs. GPU, which GPU) logic is done by an +EDFilter and using Paths in the configuration. + +First, a `CUDADeviceChooserFilter` EDFilter is run. It has the logic +to device whether the following chain of EDModules should run on a +CUDA device or not, and if yes, on which CUDA device. If it decides +"yes", it returns `true` and produces a `CUDAToken`, which contains +the device id and a CUDA stream. If it decides "no", it returns +`false` and does not produce anything. + +Then, the pieces need to be put together in the configuration. The +`CUDADeviceChooserFilter` should be put as the first module on a +`cms.Path`, followed by the CUDA EDProducers (in the future it may +become sufficient to have only the first EDProducer of a chain in the +`Path`). 
```python -process.fooCUDADevice = cms.EDProducer("CUDADeviceChooser") -process.fooCUDADeviceFilter = cms.EDFilter("CUDADeviceFilter", +process.fooCUDADeviceFilter = cms.EDFilter("CUDADeviceChooserFilter", src = cms.InputTag("fooCUDADevice") ) process.fooCUDA = cms.EDProducer("FooProducerCUDA") process.fooPathCUDA = cms.Path( process.fooCUDADeviceFilter + process.fooCUDA ) -process.fooTask = cms.Task( - process.fooDevice -) ``` +### Always on GPU + +In case the chain of modules should always be run on a GPU, the +EDFilter and Paths are not needed. In this case, a +`CUDADeviceChooserProducer` should be used to produce the `CUDAToken`. +If the machine has no GPUs or `CUDAService` is disabled, the producer +throws an exception. + + ## Data model The GPU data can be a single pointer to device data, or a class/struct diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc deleted file mode 100644 index 58411f749d60e..0000000000000 --- a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc +++ /dev/null @@ -1,89 +0,0 @@ -#include "FWCore/Framework/interface/global/EDProducer.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/Frameworkfwd.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" - -#include - -#include - -namespace { - struct DeviceCache { - int device; - bool enabled; - }; -} - -class CUDADeviceChooser: public edm::global::EDProducer > { -public: - explicit CUDADeviceChooser(const edm::ParameterSet& iConfig); - ~CUDADeviceChooser() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - - 
std::unique_ptr<::DeviceCache> beginStream(edm::StreamID id) const; - - void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const; - -private: - bool enabled_; -}; - -CUDADeviceChooser::CUDADeviceChooser(const edm::ParameterSet& iConfig): - enabled_(iConfig.getParameter("enabled")) -{ - produces(); -} - -void CUDADeviceChooser::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - desc.add("enabled", true)->setComment("This parameter is intended for debugging purposes only. If disabling some CUDA chains is needed for production, it is better to remove the CUDA modules altogether from the configuration."); - descriptions.addWithDefaultLabel(desc); - descriptions.setComment("This EDProducer chooses whether a chain of CUDA EDModules depending on it should run or not. The decision is communicated downstream by the existence of a 'CUDAToken' event product. Intended to be used with CUDADeviceFilter."); -} - -std::unique_ptr<::DeviceCache> CUDADeviceChooser::beginStream(edm::StreamID id) const { - auto ret = std::make_unique<::DeviceCache>(); - - edm::Service cudaService; - ret->enabled = (enabled_ && cudaService->enabled(id)); - if(!ret->enabled) { - return ret; - } - - // For startes we "statically" assign the device based on - // edm::Stream number. This is suboptimal if the number of - // edm::Streams is not a multiple of the number of CUDA devices - // (and even then there is no load balancing). - // - // TODO: improve. Possible ideas include - // - allocate M (< N(edm::Streams)) buffers per device per "chain of modules", choose dynamically which (buffer, device) to use - // - our own CUDA memory allocator - // * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load - // * would probably still need some buffer space/device to hold e.g. conditions data - // - for conditions, how to handle multiple lumis per job? 
- ret->device = id % cudaService->numberOfDevices(); - - LogDebug("CUDADeviceChooser") << "EDM stream " << id << " set to CUDA device " << ret->device; - - return ret; -} - -void CUDADeviceChooser::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto cache = streamCache(id); - if(!cache->enabled) { - return; - } - - auto ret = std::make_unique(cache->device); - LogDebug("CUDADeviceChooser") << "EDM stream " << id << " CUDA device " << ret->device() << " with CUDA stream " << ret->stream().id(); - iEvent.put(std::move(ret)); -} - - -DEFINE_FWK_MODULE(CUDADeviceChooser); diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc new file mode 100644 index 0000000000000..15216edb020c5 --- /dev/null +++ b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc @@ -0,0 +1,76 @@ +#include "FWCore/Framework/interface/global/EDFilter.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "chooseCUDADevice.h" + +namespace { + struct DeviceCache { + int device; + bool enabled; + }; +} + +class CUDADeviceChooserFilter: public edm::global::EDFilter> { +public: + explicit CUDADeviceChooserFilter(const edm::ParameterSet& iConfig); + ~CUDADeviceChooserFilter() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + std::unique_ptr<::DeviceCache> beginStream(edm::StreamID id) const; + + bool filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + +private: + 
bool enabled_; +}; + +CUDADeviceChooserFilter::CUDADeviceChooserFilter(const edm::ParameterSet& iConfig): + enabled_(iConfig.getParameter("enabled")) +{ + produces(); +} + +void CUDADeviceChooserFilter::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("enabled", true)->setComment("This parameter is intended for debugging purposes only. If disabling some CUDA chains is needed for production, it is better to remove the CUDA modules altogether from the configuration."); + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDFilter chooses whether a chain of CUDA EDModules depending on it should run or not, and on which CUDA device they should run. The decision is communicated downstream with the filter decision. In addition, if the filter returns true, a 'CUDAToken' is produced into the event (for false nothing is produced)."); +} + +std::unique_ptr<::DeviceCache> CUDADeviceChooserFilter::beginStream(edm::StreamID id) const { + auto ret = std::make_unique<::DeviceCache>(); + + edm::Service cudaService; + ret->enabled = (enabled_ && cudaService->enabled(id)); + if(!ret->enabled) { + return ret; + } + + ret->device = cudacore::chooseCUDADevice(id); + + LogDebug("CUDADeviceChooserFilter") << "EDM stream " << id << " set to CUDA device " << ret->device; + + return ret; +} + +bool CUDADeviceChooserFilter::filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + auto cache = streamCache(id); + if(!cache->enabled) { + return false; + } + + auto ret = std::make_unique(cache->device); + LogDebug("CUDADeviceChooserFilter") << "EDM stream " << id << " CUDA device " << ret->device() << " with CUDA stream " << ret->stream().id(); + iEvent.put(std::move(ret)); + return true; +} + +DEFINE_FWK_MODULE(CUDADeviceChooserFilter); diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc new 
file mode 100644 index 0000000000000..13a4d6b34e521 --- /dev/null +++ b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc @@ -0,0 +1,68 @@ +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/Frameworkfwd.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +#include "chooseCUDADevice.h" + +#include + +namespace { + struct DeviceCache { + int device; + }; +} + +class CUDADeviceChooserProducer: public edm::global::EDProducer> { +public: + explicit CUDADeviceChooserProducer(const edm::ParameterSet& iConfig); + ~CUDADeviceChooserProducer() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + + std::unique_ptr<::DeviceCache> beginStream(edm::StreamID id) const; + + void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const; +}; + +CUDADeviceChooserProducer::CUDADeviceChooserProducer(const edm::ParameterSet& iConfig) { + edm::Service cudaService; + if(!cudaService->enabled()) { + throw cms::Exception("Configuration") << "CUDAService is disabled so CUDADeviceChooserProducer is unable to make decisions on which CUDA device to run. 
If you need to run without CUDA devices, please use CUDADeviceChooserFilter for conditional execution, or remove all CUDA modules from your configuration."; + } + produces(); +} + +void CUDADeviceChooserProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + descriptions.addWithDefaultLabel(desc); + descriptions.setComment("This EDProducer chooses on which CUDA device the chain of CUDA EDModules depending on it should run. The decision is communicated downstream with the 'CUDAToken' event product. It is an error if there are no CUDA devices, or CUDAService is disabled."); +} + +std::unique_ptr<::DeviceCache> CUDADeviceChooserProducer::beginStream(edm::StreamID id) const { + auto ret = std::make_unique<::DeviceCache>(); + + edm::Service cudaService; + if(!cudaService->enabled(id)) { + throw cms::Exception("LogicError") << "CUDA is disabled for EDM stream " << id << " in CUDAService, so CUDADeviceChooser is unable to decide the CUDA device for this EDM stream. 
If you need to dynamically decide whether a chain of CUDA EDModules is run or not, please use CUDADeviceChooserFilter instead."; + } + ret->device = cudacore::chooseCUDADevice(id); + + LogDebug("CUDADeviceChooserProducer") << "EDM stream " << id << " set to CUDA device " << ret->device; + + return ret; +} + +void CUDADeviceChooserProducer::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + auto ret = std::make_unique(streamCache(id)->device); + LogDebug("CUDADeviceChooserProducer") << "EDM stream " << id << " CUDA device " << ret->device() << " with CUDA stream " << ret->stream().id(); + iEvent.put(std::move(ret)); +} + + +DEFINE_FWK_MODULE(CUDADeviceChooserProducer); diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc deleted file mode 100644 index cf4a8093a94bd..0000000000000 --- a/HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc +++ /dev/null @@ -1,40 +0,0 @@ -#include "FWCore/Framework/interface/global/EDFilter.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/Frameworkfwd.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" - -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" - -class CUDADeviceFilter: public edm::global::EDFilter<> { -public: - explicit CUDADeviceFilter(const edm::ParameterSet& iConfig); - ~CUDADeviceFilter() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - - bool filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; - -private: - edm::EDGetTokenT token_; -}; - -CUDADeviceFilter::CUDADeviceFilter(const edm::ParameterSet& iConfig): - token_(consumes(iConfig.getParameter("src"))) -{} - -void CUDADeviceFilter::fillDescriptions(edm::ConfigurationDescriptions& 
descriptions) { - edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag("cudaDeviceChooser"))->setComment("Source of the 'CUDAToken'."); - descriptions.addWithDefaultLabel(desc); - descriptions.setComment("This EDFilter filters based on the existence of a 'CUDAToken' event product. Intended to be used together with CUDADeviceChooser. Returns 'true' if the product exists, and 'false' if not."); -} - -bool CUDADeviceFilter::filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::Handle handle; - iEvent.getByToken(token_, handle); - return handle.isValid(); -} - -DEFINE_FWK_MODULE(CUDADeviceFilter); diff --git a/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.cc new file mode 100644 index 0000000000000..b17158626a4e8 --- /dev/null +++ b/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.cc @@ -0,0 +1,17 @@ +#include "chooseCUDADevice.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + +namespace cudacore { + int chooseCUDADevice(edm::StreamID id) { + edm::Service cudaService; + + // For startes we "statically" assign the device based on + // edm::Stream number. This is suboptimal if the number of + // edm::Streams is not a multiple of the number of CUDA devices + // (and even then there is no load balancing). 
+ // + // TODO: improve the "assignment" logic + return id % cudaService->numberOfDevices(); + } +} diff --git a/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.h b/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.h new file mode 100644 index 0000000000000..bb09c302af7f5 --- /dev/null +++ b/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.h @@ -0,0 +1,10 @@ +#ifndef HeterogeneousCore_CUDACore_chooseCUDADevice_h +#define HeterogeneousCore_CUDACore_chooseCUDADevice_h + +#include "FWCore/Utilities/interface/StreamID.h" + +namespace cudacore { + int chooseCUDADevice(edm::StreamID id); +} + +#endif diff --git a/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py b/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py index 28f9f2539b854..6ea678259f245 100644 --- a/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py +++ b/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py @@ -30,13 +30,9 @@ process.prod4CPU = testCUDAProducerCPU.clone(src = "prod1CPU") process.prod5CPU = testCUDAProducerCPU.clone() -# Module to decide whether the chain of CUDA modules are run -from HeterogeneousCore.CUDACore.cudaDeviceChooser_cfi import cudaDeviceChooser -process.prodCUDADevice = cudaDeviceChooser.clone() - -# Filter to disable a Path in case we don't run on CUDA -from HeterogeneousCore.CUDACore.cudaDeviceFilter_cfi import cudaDeviceFilter -process.prodCUDADeviceFilter = cudaDeviceFilter.clone(src = "prodCUDADevice") +# Module to decide whether the chain of CUDA modules are run, and to disable a Path in case we don't run on CUDA +from HeterogeneousCore.CUDACore.cudaDeviceChooserFilter_cfi import cudaDeviceChooserFilter +process.prodCUDADeviceFilter = cudaDeviceChooserFilter.clone() from HeterogeneousCore.CUDACore.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst from HeterogeneousCore.CUDACore.testCUDAProducerGPU_cfi import testCUDAProducerGPU @@ -44,11 +40,11 @@ from HeterogeneousCore.CUDACore.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU # GPU producers 
-process.prod1CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADevice") +process.prod1CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADeviceFilter") process.prod2CUDA = testCUDAProducerGPU.clone(src = "prod1CUDA") process.prod3CUDA = testCUDAProducerGPU.clone(src = "prod2CUDA") process.prod4CUDA = testCUDAProducerGPUEW.clone(src = "prod1CUDA") -process.prod5CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADevice") +process.prod5CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADeviceFilter") # Modules to copy data from GPU to CPU (as "on demand" as any other # EDProducer, i.e. according to consumes() and prefetching) @@ -102,8 +98,6 @@ ) process.t = cms.Task( - process.prodCUDADevice, - # Eventually the goal is to specify these as part of a Task, # but (at least) as long as the fallback mechanism is implemented # with an EDProducer, they must be in a Path. diff --git a/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooser.cc b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserFilter.cc similarity index 84% rename from HeterogeneousCore/CUDACore/test/test_CUDADeviceChooser.cc rename to HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserFilter.cc index 40518eea7a330..09f419c33357e 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooser.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserFilter.cc @@ -6,14 +6,14 @@ #include -static constexpr auto s_tag = "[CUDADeviceChooser]"; +static constexpr auto s_tag = "[CUDADeviceChooserFilter]"; -TEST_CASE("Standard checks of CUDADeviceChooser", s_tag) { +TEST_CASE("Standard checks of CUDADeviceChooserFilter", s_tag) { const std::string baseConfig{ R"_(from FWCore.TestProcessor.TestProcess import * process = TestProcess() process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") -process.toTest = cms.EDProducer("CUDADeviceChooser") +process.toTest = cms.EDFilter("CUDADeviceChooserFilter") process.moduleToTest(process.toTest) )_" }; @@ -49,12 +49,12 @@ 
process.moduleToTest(process.toTest) } -TEST_CASE("CUDADeviceChooser enabled", s_tag) { +TEST_CASE("CUDADeviceChooserFilter enabled", s_tag) { const std::string config{ R"_(from FWCore.TestProcessor.TestProcess import * process = TestProcess() process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") -process.toTest = cms.EDProducer("CUDADeviceChooser") +process.toTest = cms.EDFilter("CUDADeviceChooserFilter") process.moduleToTest(process.toTest) )_" }; @@ -77,12 +77,12 @@ process.moduleToTest(process.toTest) } } -TEST_CASE("CUDADeviceChooser disabled", s_tag) { +TEST_CASE("CUDADeviceChooserFilter disabled", s_tag) { const std::string config{ R"_(from FWCore.TestProcessor.TestProcess import * process = TestProcess() process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") -process.toTest = cms.EDProducer("CUDADeviceChooser", enabled=cms.bool(False)) +process.toTest = cms.EDFilter("CUDADeviceChooserFilter", enabled=cms.bool(False)) process.moduleToTest(process.toTest) )_" }; diff --git a/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc new file mode 100644 index 0000000000000..f567838730c41 --- /dev/null +++ b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc @@ -0,0 +1,103 @@ +#include "catch.hpp" +#include "FWCore/TestProcessor/interface/TestProcessor.h" +#include "FWCore/Utilities/interface/Exception.h" + +#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" + +#include + +static constexpr auto s_tag = "[CUDADeviceChooserFilter]"; + +TEST_CASE("Standard checks of CUDADeviceProducer", s_tag) { + const std::string baseConfig{ +R"_(from FWCore.TestProcessor.TestProcess import * +process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.toTest = cms.EDProducer("CUDADeviceChooserProducer") +process.moduleToTest(process.toTest) +)_" + }; + + int deviceCount = 0; + auto ret = cudaGetDeviceCount( &deviceCount ); + 
if( ret != cudaSuccess ) { + WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" + << ret << ") " << cudaGetErrorString( ret ) + << ". Ignoring tests requiring device to be present."); + return; + } + + edm::test::TestProcessor::Config config{ baseConfig }; + SECTION("base configuration is OK") { + REQUIRE_NOTHROW(edm::test::TestProcessor(config)); + } + + SECTION("No event data") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.test()); + } + + SECTION("beginJob and endJob only") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testBeginAndEndJobOnly()); + } + + SECTION("Run with no LuminosityBlocks") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testRunWithNoLuminosityBlocks()); + } + + SECTION("LuminosityBlock with no Events") { + edm::test::TestProcessor tester(config); + + REQUIRE_NOTHROW(tester.testLuminosityBlockWithNoEvents()); + } + +} + +TEST_CASE("CUDAService enabled", s_tag) { + const std::string config{ +R"_(from FWCore.TestProcessor.TestProcess import * +process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.toTest = cms.EDProducer("CUDADeviceChooserProducer") +process.moduleToTest(process.toTest) +)_" + }; + + int deviceCount = 0; + auto ret = cudaGetDeviceCount( &deviceCount ); + if( ret != cudaSuccess ) { + WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" + << ret << ") " << cudaGetErrorString( ret ) + << ". 
Ignoring tests requiring device to be present."); + return; + } + + SECTION("CUDAToken") { + edm::test::TestProcessor tester{config}; + auto event = tester.test(); + + REQUIRE(event.get()->device() >= 0); + REQUIRE(event.get()->stream().id() != nullptr); + } +} + +TEST_CASE("CUDAService disabled", s_tag) { + const std::string config{ +R"_(from FWCore.TestProcessor.TestProcess import * +process = TestProcess() +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") +process.CUDAService.enabled = False +process.toTest = cms.EDProducer("CUDADeviceChooserProducer") +process.moduleToTest(process.toTest) +)_" + }; + + SECTION("Construction") { + REQUIRE_THROWS_AS(edm::test::TestProcessor{config}, cms::Exception); + } +} From d156534595df4b6b46250e6d4de2e0b294190b01 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 31 Jul 2018 15:16:53 +0200 Subject: [PATCH 03/49] Move the CUDA test plugins to a CUDATest package to test better the configuration building --- HeterogeneousCore/CUDACore/test/BuildFile.xml | 10 --------- .../CUDATest/plugins/BuildFile.xml | 9 ++++++++ .../plugins}/TestCUDAProducerCPU.cc | 0 .../plugins}/TestCUDAProducerFallback.cc | 0 .../plugins}/TestCUDAProducerGPU.cc | 0 .../plugins}/TestCUDAProducerGPUEW.cc | 0 .../plugins}/TestCUDAProducerGPUFirst.cc | 0 .../plugins}/TestCUDAProducerGPUKernel.cu | 0 .../plugins}/TestCUDAProducerGPUKernel.h | 0 .../plugins}/TestCUDAProducerGPUtoCPU.cc | 0 HeterogeneousCore/CUDATest/test/BuildFile.xml | 5 +++++ .../test/testCUDA_cfg.py | 22 +++++++++---------- .../test/test_TestCUDAProducerGPUFirst.cc | 6 ++--- HeterogeneousCore/CUDATest/test/test_main.cc | 2 ++ 14 files changed, 30 insertions(+), 24 deletions(-) create mode 100644 HeterogeneousCore/CUDATest/plugins/BuildFile.xml rename HeterogeneousCore/{CUDACore/test => CUDATest/plugins}/TestCUDAProducerCPU.cc (100%) rename HeterogeneousCore/{CUDACore/test => CUDATest/plugins}/TestCUDAProducerFallback.cc (100%) rename HeterogeneousCore/{CUDACore/test => 
CUDATest/plugins}/TestCUDAProducerGPU.cc (100%) rename HeterogeneousCore/{CUDACore/test => CUDATest/plugins}/TestCUDAProducerGPUEW.cc (100%) rename HeterogeneousCore/{CUDACore/test => CUDATest/plugins}/TestCUDAProducerGPUFirst.cc (100%) rename HeterogeneousCore/{CUDACore/test => CUDATest/plugins}/TestCUDAProducerGPUKernel.cu (100%) rename HeterogeneousCore/{CUDACore/test => CUDATest/plugins}/TestCUDAProducerGPUKernel.h (100%) rename HeterogeneousCore/{CUDACore/test => CUDATest/plugins}/TestCUDAProducerGPUtoCPU.cc (100%) create mode 100644 HeterogeneousCore/CUDATest/test/BuildFile.xml rename HeterogeneousCore/{CUDACore => CUDATest}/test/testCUDA_cfg.py (89%) rename HeterogeneousCore/{CUDACore => CUDATest}/test/test_TestCUDAProducerGPUFirst.cc (97%) create mode 100644 HeterogeneousCore/CUDATest/test/test_main.cc diff --git a/HeterogeneousCore/CUDACore/test/BuildFile.xml b/HeterogeneousCore/CUDACore/test/BuildFile.xml index 97593098d272d..cd2c3b094243c 100644 --- a/HeterogeneousCore/CUDACore/test/BuildFile.xml +++ b/HeterogeneousCore/CUDACore/test/BuildFile.xml @@ -4,13 +4,3 @@ - - - - - - - - - - diff --git a/HeterogeneousCore/CUDATest/plugins/BuildFile.xml b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml new file mode 100644 index 0000000000000..09a8fb844d4c9 --- /dev/null +++ b/HeterogeneousCore/CUDATest/plugins/BuildFile.xml @@ -0,0 +1,9 @@ + + + + + + + + + diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc similarity index 100% rename from HeterogeneousCore/CUDACore/test/TestCUDAProducerCPU.cc rename to HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerFallback.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerFallback.cc similarity index 100% rename from HeterogeneousCore/CUDACore/test/TestCUDAProducerFallback.cc rename to HeterogeneousCore/CUDATest/plugins/TestCUDAProducerFallback.cc diff --git 
a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc similarity index 100% rename from HeterogeneousCore/CUDACore/test/TestCUDAProducerGPU.cc rename to HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc similarity index 100% rename from HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUEW.cc rename to HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc similarity index 100% rename from HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUFirst.cc rename to HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu similarity index 100% rename from HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.cu rename to HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h similarity index 100% rename from HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUKernel.h rename to HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h diff --git a/HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc similarity index 100% rename from HeterogeneousCore/CUDACore/test/TestCUDAProducerGPUtoCPU.cc rename to HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc diff --git a/HeterogeneousCore/CUDATest/test/BuildFile.xml b/HeterogeneousCore/CUDATest/test/BuildFile.xml new file mode 100644 index 0000000000000..f73f9e7b7c5a6 --- /dev/null +++ 
b/HeterogeneousCore/CUDATest/test/BuildFile.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py similarity index 89% rename from HeterogeneousCore/CUDACore/test/testCUDA_cfg.py rename to HeterogeneousCore/CUDATest/test/testCUDA_cfg.py index 6ea678259f245..0c54569d2b201 100644 --- a/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py +++ b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py @@ -23,7 +23,7 @@ # 3 # # CPU producers -from HeterogeneousCore.CUDACore.testCUDAProducerCPU_cfi import testCUDAProducerCPU +from HeterogeneousCore.CUDATest.testCUDAProducerCPU_cfi import testCUDAProducerCPU process.prod1CPU = testCUDAProducerCPU.clone() process.prod2CPU = testCUDAProducerCPU.clone(src = "prod1CPU") process.prod3CPU = testCUDAProducerCPU.clone(src = "prod2CPU") @@ -34,10 +34,10 @@ from HeterogeneousCore.CUDACore.cudaDeviceChooserFilter_cfi import cudaDeviceChooserFilter process.prodCUDADeviceFilter = cudaDeviceChooserFilter.clone() -from HeterogeneousCore.CUDACore.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst -from HeterogeneousCore.CUDACore.testCUDAProducerGPU_cfi import testCUDAProducerGPU -from HeterogeneousCore.CUDACore.testCUDAProducerGPUEW_cfi import testCUDAProducerGPUEW -from HeterogeneousCore.CUDACore.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU +from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst +from HeterogeneousCore.CUDATest.testCUDAProducerGPU_cfi import testCUDAProducerGPU +from HeterogeneousCore.CUDATest.testCUDAProducerGPUEW_cfi import testCUDAProducerGPUEW +from HeterogeneousCore.CUDATest.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU # GPU producers process.prod1CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADeviceFilter") @@ -57,12 +57,12 @@ # These ones are to provide backwards compatibility to the downstream # clients. 
To be replaced with an enhanced version of EDAlias (with an # ordered fallback mechanism). -from HeterogeneousCore.CUDACore.testCUDAProducerFallback_cfi import testCUDAProducerFallback -process.prod1 = testCUDAProducerFallback.clone(src = ["prod1FromCUDA", "prod1cpu"]) -process.prod2 = testCUDAProducerFallback.clone(src = ["prod2FromCUDA", "prod2cpu"]) -process.prod3 = testCUDAProducerFallback.clone(src = ["prod3FromCUDA", "prod3cpu"]) -process.prod4 = testCUDAProducerFallback.clone(src = ["prod4FromCUDA", "prod4cpu"]) -process.prod5 = testCUDAProducerFallback.clone(src = ["prod5FromCUDA", "prod5cpu"]) +from HeterogeneousCore.CUDATest.testCUDAProducerFallback_cfi import testCUDAProducerFallback +process.prod1 = testCUDAProducerFallback.clone(src = ["prod1FromCUDA", "prod1CPU"]) +process.prod2 = testCUDAProducerFallback.clone(src = ["prod2FromCUDA", "prod2CPU"]) +process.prod3 = testCUDAProducerFallback.clone(src = ["prod3FromCUDA", "prod3CPU"]) +process.prod4 = testCUDAProducerFallback.clone(src = ["prod4FromCUDA", "prod4CPU"]) +process.prod5 = testCUDAProducerFallback.clone(src = ["prod5FromCUDA", "prod5CPU"]) process.out = cms.OutputModule("AsciiOutputModule", outputCommands = cms.untracked.vstring( diff --git a/HeterogeneousCore/CUDACore/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc similarity index 97% rename from HeterogeneousCore/CUDACore/test/test_TestCUDAProducerGPUFirst.cc rename to HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index c1a57db940018..3a25491418446 100644 --- a/HeterogeneousCore/CUDACore/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -5,7 +5,7 @@ #include "HeterogeneousCore/CUDACore/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" -#include "TestCUDA.h" +#include "HeterogeneousCore/CUDACore/test/TestCUDA.h" // ugly... 
#include @@ -28,7 +28,7 @@ process.moduleToTest(process.toTest) SECTION("No event data") { edm::test::TestProcessor tester(config); - + REQUIRE_THROWS_AS(tester.test(), cms::Exception); //If the module does not throw when given no data, substitute //REQUIRE_NOTHROW for REQUIRE_THROWS_AS @@ -36,7 +36,7 @@ process.moduleToTest(process.toTest) SECTION("beginJob and endJob only") { edm::test::TestProcessor tester(config); - + REQUIRE_NOTHROW(tester.testBeginAndEndJobOnly()); } diff --git a/HeterogeneousCore/CUDATest/test/test_main.cc b/HeterogeneousCore/CUDATest/test/test_main.cc new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/test_main.cc @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" From bf032800dae057f44c1bf366bee61a0cf0e66445 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 31 Jul 2018 16:07:02 +0200 Subject: [PATCH 04/49] Prototype of a helper function --- .../CUDATest/python/prod1CPU_cfi.py | 4 + .../python/prod1CUDADeviceFilter_cfi.py | 4 + .../CUDATest/python/prod1CUDA_cfi.py | 4 + .../CUDATest/python/prod1FromCUDA_cfi.py | 4 + .../CUDATest/python/prod1_cff.py | 23 ++++ .../CUDATest/python/prod5CPU_cfi.py | 4 + .../python/prod5CUDADeviceFilter_cfi.py | 4 + .../CUDATest/python/prod5CUDA_cfi.py | 4 + .../CUDATest/python/prod5Fallback_cfi.py | 4 + .../CUDATest/python/prod5FromCUDA_cfi.py | 4 + .../CUDATest/python/prod5_cff.py | 9 ++ .../CUDATest/python/prod6CPU_cfi.py | 4 + .../CUDATest/python/prod6CUDA_cfi.py | 4 + .../CUDATest/python/prod6Fallback_cfi.py | 4 + .../CUDATest/python/prod6FromCUDA_cfi.py | 4 + .../CUDATest/python/prod6_cff.py | 9 ++ .../CUDATest/python/setupHeterogeneous.py | 101 ++++++++++++++++++ .../CUDATest/test/testCUDA_cfg.py | 42 +++----- 18 files changed, 206 insertions(+), 30 deletions(-) create mode 100644 HeterogeneousCore/CUDATest/python/prod1CPU_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod1CUDADeviceFilter_cfi.py create mode 
100644 HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod1FromCUDA_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod1_cff.py create mode 100644 HeterogeneousCore/CUDATest/python/prod5CPU_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod5CUDADeviceFilter_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod5Fallback_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod5FromCUDA_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod5_cff.py create mode 100644 HeterogeneousCore/CUDATest/python/prod6CPU_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod6CUDA_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod6Fallback_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod6FromCUDA_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod6_cff.py create mode 100644 HeterogeneousCore/CUDATest/python/setupHeterogeneous.py diff --git a/HeterogeneousCore/CUDATest/python/prod1CPU_cfi.py b/HeterogeneousCore/CUDATest/python/prod1CPU_cfi.py new file mode 100644 index 0000000000000..421d01c07ea46 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod1CPU_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.testCUDAProducerCPU_cfi import testCUDAProducerCPU as _testCUDAProducerCPU +prod1CPU = _testCUDAProducerCPU.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod1CUDADeviceFilter_cfi.py b/HeterogeneousCore/CUDATest/python/prod1CUDADeviceFilter_cfi.py new file mode 100644 index 0000000000000..c8f9c3574c561 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod1CUDADeviceFilter_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDACore.cudaDeviceChooserFilter_cfi import cudaDeviceChooserFilter as _cudaDeviceChooserFilter +prod1CUDADeviceFilter = 
_cudaDeviceChooserFilter.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py b/HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py new file mode 100644 index 0000000000000..b333aa78562cb --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst as _testCUDAProducerGPUFirst +prod1CUDA = _testCUDAProducerGPUFirst.clone(src = "prod1CUDADeviceFilter") diff --git a/HeterogeneousCore/CUDATest/python/prod1FromCUDA_cfi.py b/HeterogeneousCore/CUDATest/python/prod1FromCUDA_cfi.py new file mode 100644 index 0000000000000..de6ed9b0ff179 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod1FromCUDA_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU as _testCUDAProducerGPUtoCPU +prod1FromCUDA = _testCUDAProducerGPUtoCPU.clone(src = "prod1CUDA") diff --git a/HeterogeneousCore/CUDATest/python/prod1_cff.py b/HeterogeneousCore/CUDATest/python/prod1_cff.py new file mode 100644 index 0000000000000..734609b373ac4 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod1_cff.py @@ -0,0 +1,23 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.prod1CUDADeviceFilter_cfi import prod1CUDADeviceFilter +from HeterogeneousCore.CUDATest.prod1CPU_cfi import prod1CPU +from HeterogeneousCore.CUDATest.prod1CUDA_cfi import prod1CUDA +from HeterogeneousCore.CUDATest.prod1FromCUDA_cfi import prod1FromCUDA + +from HeterogeneousCore.CUDATest.testCUDAProducerFallback_cfi import testCUDAProducerFallback as _testCUDAProducerFallback + +prod1 = _testCUDAProducerFallback.clone(src = ["prod1CUDA", "prod1CPU"]) + +prod1PathCUDA = cms.Path( + prod1CUDADeviceFilter + + prod1CUDA +) +prod1PathCPU = cms.Path( + ~prod1CUDADeviceFilter + + prod1CPU +) + +prod1Task = cms.Task( + 
prod1FromCUDA, prod1 +) diff --git a/HeterogeneousCore/CUDATest/python/prod5CPU_cfi.py b/HeterogeneousCore/CUDATest/python/prod5CPU_cfi.py new file mode 100644 index 0000000000000..3353c774f8f27 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod5CPU_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.testCUDAProducerCPU_cfi import testCUDAProducerCPU as _testCUDAProducerCPU +prod5CPU = _testCUDAProducerCPU.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod5CUDADeviceFilter_cfi.py b/HeterogeneousCore/CUDATest/python/prod5CUDADeviceFilter_cfi.py new file mode 100644 index 0000000000000..cfe4f4048b8f3 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod5CUDADeviceFilter_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDACore.cudaDeviceChooserFilter_cfi import cudaDeviceChooserFilter as _cudaDeviceChooserFilter +prod5CUDADeviceFilter = _cudaDeviceChooserFilter.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py b/HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py new file mode 100644 index 0000000000000..ae7f605d07975 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst as _testCUDAProducerGPUFirst +prod5CUDA = _testCUDAProducerGPUFirst.clone(src = "prod5CUDADeviceFilter") diff --git a/HeterogeneousCore/CUDATest/python/prod5Fallback_cfi.py b/HeterogeneousCore/CUDATest/python/prod5Fallback_cfi.py new file mode 100644 index 0000000000000..4ad84bdeee4a7 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod5Fallback_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.testCUDAProducerFallback_cfi import testCUDAProducerFallback as _testCUDAProducerFallback +prod5Fallback = _testCUDAProducerFallback.clone() diff --git 
a/HeterogeneousCore/CUDATest/python/prod5FromCUDA_cfi.py b/HeterogeneousCore/CUDATest/python/prod5FromCUDA_cfi.py new file mode 100644 index 0000000000000..c25aa2c5a1043 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod5FromCUDA_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU as _testCUDAProducerGPUtoCPU +prod5FromCUDA = _testCUDAProducerGPUtoCPU.clone(src = "prod5CUDA") diff --git a/HeterogeneousCore/CUDATest/python/prod5_cff.py b/HeterogeneousCore/CUDATest/python/prod5_cff.py new file mode 100644 index 0000000000000..2b8948551733c --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod5_cff.py @@ -0,0 +1,9 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.prod5CUDADeviceFilter_cfi import prod5CUDADeviceFilter + +# The prod6 is the final, (legacy) CPU-only product name, and the +# prod6Task is the Task containing all modules. The function itself +# sets up everything else. 
+from HeterogeneousCore.CUDATest.setupHeterogeneous import setupCUDA +(prod5, prod5Task) = setupCUDA("prod5", prod5CUDADeviceFilter, globals()) diff --git a/HeterogeneousCore/CUDATest/python/prod6CPU_cfi.py b/HeterogeneousCore/CUDATest/python/prod6CPU_cfi.py new file mode 100644 index 0000000000000..1cb1bba3f12b1 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod6CPU_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.testCUDAProducerCPU_cfi import testCUDAProducerCPU as _testCUDAProducerCPU +prod6CPU = _testCUDAProducerCPU.clone(src = "prod5CPU") diff --git a/HeterogeneousCore/CUDATest/python/prod6CUDA_cfi.py b/HeterogeneousCore/CUDATest/python/prod6CUDA_cfi.py new file mode 100644 index 0000000000000..0e1ea6ebbca0f --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod6CUDA_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.testCUDAProducerGPU_cfi import testCUDAProducerGPU as _testCUDAProducerGPU +prod6CUDA = _testCUDAProducerGPU.clone(src = "prod5CUDA") diff --git a/HeterogeneousCore/CUDATest/python/prod6Fallback_cfi.py b/HeterogeneousCore/CUDATest/python/prod6Fallback_cfi.py new file mode 100644 index 0000000000000..60e9f1910188e --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod6Fallback_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.testCUDAProducerFallback_cfi import testCUDAProducerFallback as _testCUDAProducerFallback +prod6Fallback = _testCUDAProducerFallback.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod6FromCUDA_cfi.py b/HeterogeneousCore/CUDATest/python/prod6FromCUDA_cfi.py new file mode 100644 index 0000000000000..de894a5f32617 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod6FromCUDA_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU as 
_testCUDAProducerGPUtoCPU +prod6FromCUDA = _testCUDAProducerGPUtoCPU.clone(src = "prod6CUDA") diff --git a/HeterogeneousCore/CUDATest/python/prod6_cff.py b/HeterogeneousCore/CUDATest/python/prod6_cff.py new file mode 100644 index 0000000000000..7df34426ab67f --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod6_cff.py @@ -0,0 +1,9 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.prod5CUDADeviceFilter_cfi import prod5CUDADeviceFilter + +# The prod6 is the final, (legacy) CPU-only product name, and the +# prod6Task is the Task containing all modules. The function itself +# sets up everything else. +from HeterogeneousCore.CUDATest.setupHeterogeneous import setupHeterogeneous +(prod6, prod6Task) = setupHeterogeneous("prod6", ["CUDA", "CPU"], {"CUDA": prod5CUDADeviceFilter}, globals()) diff --git a/HeterogeneousCore/CUDATest/python/setupHeterogeneous.py b/HeterogeneousCore/CUDATest/python/setupHeterogeneous.py new file mode 100644 index 0000000000000..0fc5d7980ceeb --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/setupHeterogeneous.py @@ -0,0 +1,101 @@ +import FWCore.ParameterSet.Config as cms + +# Prototype of the function +import importlib +def setupHeterogeneous(prefix, deviceTypes, deviceFilters, modDict, + package=None, transferModuleNames={}, fallbackModuleName=None): + """ + Mandatory parameters: + prefix -- common prefix of the CPU, CUDA, etc producers + deviceTypes -- list of strings for the device types + deviceFilters -- dict of non-CPU device types to device filter modules + modDict -- globals() + + Optional parameters: + package -- Package of the modules (default None signals to use the current package) + transferModuleName -- Dictionary for names of the device->CPU modules to be loaded and inserted in modDict (if the dictionary does not contain a key 'prefix', a default value of 'prefix+'From' will be used) + fallbackModuleName -- Name of the devices+CPU product fallback producer to be loaded (default None means 
prefix+'Fallback') + + Returns a pair of + - something which looks like an EDProducer picking the product from devices+CPU + - Task containing all the added modules + """ + path = "" + if package is None: + pkgs = __name__.split(".") + if len(pkgs) > 1: + path = ".".join(pkgs[:-1])+"." + else: + path = package+"." + + # Per-device producers + for dt in deviceTypes: + modName = prefix+dt + pkg = importlib.import_module(path+modName+"_cfi") + mod = getattr(pkg, modName) + modDict[modName] = mod + + # device->CPU + for dt in deviceTypes: + if dt == "CPU": + continue + transferModName = transferModuleNames.get(dt, prefix+"From"+dt) + + transferModPath = path+transferModName+"_cfi" + transferModPkg = importlib.import_module(transferModPath) + transferMod = getattr(transferModPkg, transferModName).clone(src=prefix+dt) + modDict[transferModName] = transferMod + + # Fallback + if fallbackModuleName is None: + fallbackModName = prefix+"Fallback" + else: + fallbackModName = fallbackModuleName + fallbackModPath = path+fallbackModName+"_cfi" + fallbackModPkg = importlib.import_module(fallbackModPath) + def _from(s): + if s == "CPU": + return s + return "From"+s + fallback = getattr(fallbackModPkg, fallbackModName).clone(src=[prefix+_from(dt) for dt in deviceTypes]) + + # Paths + tmp = {} + for dt in deviceTypes: + tmp[dt] = cms.Path() + + for dt in deviceTypes: + p = cms.Path() + + # Add inverted filters until the current device type is found, then insert filter and stop + # For [CUDA, FPGA, CPU] results in + # CUDA: CUDAFilter + # FPGA: ~CUDAFilter + FPGAFilter + # CPU: ~CUDAFilter + ~FPGAFilter + for dt2 in deviceTypes: + if dt2 == "CPU": + continue + filt = deviceFilters[dt2] + if dt2 == dt: + p += filt + break + else: + p += ~filt + + # Finally add the producer of the type + p += modDict[prefix+dt] + + modDict[prefix+"Path"+dt] = p + + # Task + task = cms.Task(transferMod, fallback) + + return (fallback, task) + +def setupCUDA(prefix, deviceFilter, modDict, + package=None, 
transferModule=None, **kwargs): + transfer = {} + if transferModule is not None: + transfer["CUDA"] = transferModule + return setupHeterogeneous(prefix, ["CUDA", "CPU"], {"CUDA": deviceFilter}, modDict, + package, transfer, **kwargs) diff --git a/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py index 0c54569d2b201..e9ddebfaa35b5 100644 --- a/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py +++ b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py @@ -17,22 +17,20 @@ # Flow diagram of the modules # # 1 5 -# / \ -# 2 4 +# / \ | +# 2 4 6 # | # 3 -# + +process.load("HeterogeneousCore.CUDATest.prod1_cff") +process.load("HeterogeneousCore.CUDATest.prod5_cff") +process.load("HeterogeneousCore.CUDATest.prod6_cff") + # CPU producers from HeterogeneousCore.CUDATest.testCUDAProducerCPU_cfi import testCUDAProducerCPU -process.prod1CPU = testCUDAProducerCPU.clone() process.prod2CPU = testCUDAProducerCPU.clone(src = "prod1CPU") process.prod3CPU = testCUDAProducerCPU.clone(src = "prod2CPU") process.prod4CPU = testCUDAProducerCPU.clone(src = "prod1CPU") -process.prod5CPU = testCUDAProducerCPU.clone() - -# Module to decide whether the chain of CUDA modules are run, and to disable a Path in case we don't run on CUDA -from HeterogeneousCore.CUDACore.cudaDeviceChooserFilter_cfi import cudaDeviceChooserFilter -process.prodCUDADeviceFilter = cudaDeviceChooserFilter.clone() from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst from HeterogeneousCore.CUDATest.testCUDAProducerGPU_cfi import testCUDAProducerGPU @@ -40,29 +38,23 @@ from HeterogeneousCore.CUDATest.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU # GPU producers -process.prod1CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADeviceFilter") process.prod2CUDA = testCUDAProducerGPU.clone(src = "prod1CUDA") process.prod3CUDA = testCUDAProducerGPU.clone(src = "prod2CUDA") process.prod4CUDA = testCUDAProducerGPUEW.clone(src = 
"prod1CUDA") -process.prod5CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADeviceFilter") # Modules to copy data from GPU to CPU (as "on demand" as any other # EDProducer, i.e. according to consumes() and prefetching) -process.prod1FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod1CUDA") process.prod2FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod2CUDA") process.prod3FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod3CUDA") process.prod4FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod4CUDA") -process.prod5FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod5CUDA") # These ones are to provide backwards compatibility to the downstream # clients. To be replaced with an enhanced version of EDAlias (with an # ordered fallback mechanism). from HeterogeneousCore.CUDATest.testCUDAProducerFallback_cfi import testCUDAProducerFallback -process.prod1 = testCUDAProducerFallback.clone(src = ["prod1FromCUDA", "prod1CPU"]) process.prod2 = testCUDAProducerFallback.clone(src = ["prod2FromCUDA", "prod2CPU"]) process.prod3 = testCUDAProducerFallback.clone(src = ["prod3FromCUDA", "prod3CPU"]) process.prod4 = testCUDAProducerFallback.clone(src = ["prod4FromCUDA", "prod4CPU"]) -process.prod5 = testCUDAProducerFallback.clone(src = ["prod5FromCUDA", "prod5CPU"]) process.out = cms.OutputModule("AsciiOutputModule", outputCommands = cms.untracked.vstring( @@ -74,29 +66,18 @@ ) process.prodCPU1 = cms.Path( - ~process.prodCUDADeviceFilter + - process.prod1CPU + + ~process.prod1CUDADeviceFilter + process.prod2CPU + process.prod3CPU + process.prod4CPU ) process.prodCUDA1 = cms.Path( - process.prodCUDADeviceFilter + - process.prod1CUDA + + process.prod1CUDADeviceFilter + process.prod2CUDA + process.prod3CUDA + process.prod4CUDA ) -process.prodCPU5 = cms.Path( - ~process.prodCUDADeviceFilter + - process.prod5CPU -) -process.prodCUDA5 = cms.Path( - process.prodCUDADeviceFilter + - process.prod5CUDA -) - process.t = cms.Task( # Eventually the goal is to specify these as part 
of a Task, # but (at least) as long as the fallback mechanism is implemented @@ -104,8 +85,9 @@ # process.prod2CPU, process.prod3CPU, process.prod4CPU, # process.prod2CUDA, process.prod3CUDA, process.prod4CUDA, - process.prod1FromCUDA, process.prod2FromCUDA, process.prod3FromCUDA, process.prod4FromCUDA, process.prod5FromCUDA, - process.prod1, process.prod2, process.prod3, process.prod4, process.prod5, + process.prod2FromCUDA, process.prod3FromCUDA, process.prod4FromCUDA, + process.prod2, process.prod3, process.prod4, + process.prod1Task, process.prod5Task, process.prod6Task ) process.p = cms.Path() process.p.associate(process.t) From abdc7a6c1bedbeff548db3a644d90dfb0d2744ed Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 4 Sep 2018 16:36:32 +0200 Subject: [PATCH 05/49] Add a mechanism to deliver the CUDAScopedContext from ExternalWork acquire() to produce() --- .../CUDACore/interface/CUDAContextToken.h | 40 +++++++++++++++++++ .../CUDACore/interface/CUDAScopedContext.h | 12 ++++++ .../CUDACore/test/test_CUDAScopedContext.cc | 14 +++++++ .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 11 +++-- 4 files changed, 71 insertions(+), 6 deletions(-) create mode 100644 HeterogeneousCore/CUDACore/interface/CUDAContextToken.h diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h new file mode 100644 index 0000000000000..e4c214d8e983b --- /dev/null +++ b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h @@ -0,0 +1,40 @@ +#ifndef HeterogeneousCore_CUDACore_CUDAContextToken_h +#define HeterogeneousCore_CUDACore_CUDAContextToken_h + +#include + +/** + * The purpose of this class is to deliver the device and CUDA stream + * information from ExternalWork's acquire() to producer() via a + * member/StreamCache variable. 
+ */ +class CUDAContextToken { +public: + CUDAContextToken() = default; + ~CUDAContextToken() = default; + + CUDAContextToken(const CUDAContextToken&) = delete; + CUDAContextToken& operator=(const CUDAToken&) = delete; + CUDAContextToken(CUDAContextToken&&) = default; + CUDAContextToken& operator=(CUDAContextToken&& other) = default; + +private: + friend class CUDAScopedContext; + + explicit CUDAContextToken(int device, cuda::stream_t<>&& stream): + stream_(std::make_unique>(std::move(stream))), + device_(device) + {} + + int device() { return device_; } + cuda::stream_t<>&& stream() { + auto ret = std::move(*stream_); + stream_.reset(); + return std::move(ret); + } + + std::unique_ptr> stream_; + int device_; +}; + +#endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index 226ade47deecd..e70ae581d8014 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -5,6 +5,7 @@ #include "FWCore/Utilities/interface/Exception.h" #include "HeterogeneousCore/CUDACore/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" #include @@ -23,6 +24,12 @@ class CUDAScopedContext { stream_(token.stream()) {} + explicit CUDAScopedContext(CUDAContextToken&& token): + currentDevice_(token.device()), + setDeviceForThisScope_(currentDevice_), + stream_(std::move(token.stream())) + {} + template explicit CUDAScopedContext(const CUDA& data): currentDevice_(data.device()), @@ -50,6 +57,11 @@ class CUDAScopedContext { cuda::stream_t<>& stream() { return stream_; } const cuda::stream_t<>& stream() const { return stream_; } + CUDAContextToken toToken() { + // TODO: should we add a flag to check whether the CUDAScopedContext is valid or not? 
+ return CUDAContextToken(currentDevice_, std::move(stream_)); + } + template const T& get(const CUDA& data) { if(data.device() != currentDevice_) { diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 5cdde908ba69b..da05951f9aad0 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -55,6 +55,20 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { REQUIRE(dataPtr->stream().id() == ctx.stream().id()); } + SECTION("Storing state as CUDAContextToken") { + CUDAContextToken ctxtok; + { // acquire + auto ctx = CUDAScopedContext(token); + ctxtok = ctx.toToken(); + } + + { // produce + auto ctx = CUDAScopedContext(std::move(ctxtok)); + REQUIRE(cuda::device::current::get().id() == token.device()); + REQUIRE(ctx.stream().id() == token.stream().id()); + } + } + SECTION("Joining multiple CUDA streams") { cuda::device::current::scoped_override_t<> setDeviceForThisScope(defaultDevice); auto current_device = cuda::device::current::get(); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index b37a42ebcf653..cc20789f3451b 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -6,6 +6,7 @@ #include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" #include "HeterogeneousCore/CUDACore/interface/CUDA.h" #include "TestCUDAProducerGPUKernel.h" @@ -25,6 +26,7 @@ class TestCUDAProducerGPUEW: public CUDAStreamEDProducer { std::string label_; edm::EDGetTokenT> srcToken_; std::unique_ptr gpuAlgo_; + CUDAContextToken ctxTmp_; float *devicePtr_ = nullptr; float hostData_ = 0.f; }; @@ -62,17 +64,14 @@ void 
TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSe cuda::memory::async::copy(&hostData_, devicePtr_+10, sizeof(float), ctx.stream().id()); edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); + + ctxTmp_ = ctx.toToken(); } void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << hostData_; - // It feels a bit stupid to read the input again here, but for - // anything else we'd need to somehow transfer the device+stream - // information from acquire. - edm::Handle > hin; - iEvent.getByToken(srcToken_, hin); - auto ctx = CUDAScopedContext(*hin); + auto ctx = CUDAScopedContext(std::move(ctxTmp_)); iEvent.put(ctx.wrap(devicePtr_)); devicePtr_ = nullptr; From 22d33d957193138fe421184a970aec3fb03c461b Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 17 Oct 2018 23:11:55 +0200 Subject: [PATCH 06/49] Fix the test configuration for non-GPU case --- HeterogeneousCore/CUDATest/python/setupHeterogeneous.py | 7 ++++++- HeterogeneousCore/CUDATest/test/testCUDA_cfg.py | 7 +++++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/HeterogeneousCore/CUDATest/python/setupHeterogeneous.py b/HeterogeneousCore/CUDATest/python/setupHeterogeneous.py index 0fc5d7980ceeb..630817f27d2b4 100644 --- a/HeterogeneousCore/CUDATest/python/setupHeterogeneous.py +++ b/HeterogeneousCore/CUDATest/python/setupHeterogeneous.py @@ -85,10 +85,15 @@ def _from(s): # Finally add the producer of the type p += modDict[prefix+dt] + # Add (until we get the proper fallback mechanism) the transfer module to the path + if dt != "CPU": + transferModName = transferModuleNames.get(dt, prefix+"From"+dt) + p += modDict[transferModName] + modDict[prefix+"Path"+dt] 
= p # Task - task = cms.Task(transferMod, fallback) + task = cms.Task(fallback) return (fallback, task) diff --git a/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py index e9ddebfaa35b5..cd5b7da2d3103 100644 --- a/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py +++ b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py @@ -74,8 +74,11 @@ process.prodCUDA1 = cms.Path( process.prod1CUDADeviceFilter + process.prod2CUDA + + process.prod2FromCUDA + process.prod3CUDA + - process.prod4CUDA + process.prod3FromCUDA + + process.prod4CUDA + + process.prod4FromCUDA ) process.t = cms.Task( @@ -84,8 +87,8 @@ # with an EDProducer, they must be in a Path. # process.prod2CPU, process.prod3CPU, process.prod4CPU, # process.prod2CUDA, process.prod3CUDA, process.prod4CUDA, +# process.prod2FromCUDA, process.prod3FromCUDA, process.prod4FromCUDA, - process.prod2FromCUDA, process.prod3FromCUDA, process.prod4FromCUDA, process.prod2, process.prod3, process.prod4, process.prod1Task, process.prod5Task, process.prod6Task ) From b5f84fe50a9154d88f8db69bc7faf294354ebb6e Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Sat, 1 Dec 2018 00:05:21 +0100 Subject: [PATCH 07/49] Remove CUDADeviceChooserFilter, go to configuration-time device type selection --- .../plugins/CUDADeviceChooserFilter.cc | 76 ------------- .../test/test_CUDADeviceChooserFilter.cc | 96 ---------------- .../plugins/TestCUDAProducerFallback.cc | 56 --------- .../python/prod1CUDADeviceFilter_cfi.py | 4 - .../python/prod1CUDADeviceProducer_cfi.py | 4 + .../CUDATest/python/prod1CUDA_cfi.py | 2 +- .../CUDATest/python/prod1_cff.py | 24 ++-- .../python/prod5CUDADeviceFilter_cfi.py | 4 - .../python/prod5CUDADeviceProducer_cfi.py | 4 + .../CUDATest/python/prod5CUDA_cfi.py | 2 +- .../CUDATest/python/prod5Fallback_cfi.py | 4 - .../CUDATest/python/prod5_cff.py | 20 +++- .../CUDATest/python/prod6Fallback_cfi.py | 4 - .../CUDATest/python/prod6_cff.py | 19 +++- 
.../CUDATest/python/setupHeterogeneous.py | 106 ------------------ .../CUDATest/test/testCUDA_cfg.py | 62 ++++------ 16 files changed, 68 insertions(+), 419 deletions(-) delete mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc delete mode 100644 HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserFilter.cc delete mode 100644 HeterogeneousCore/CUDATest/plugins/TestCUDAProducerFallback.cc delete mode 100644 HeterogeneousCore/CUDATest/python/prod1CUDADeviceFilter_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod1CUDADeviceProducer_cfi.py delete mode 100644 HeterogeneousCore/CUDATest/python/prod5CUDADeviceFilter_cfi.py create mode 100644 HeterogeneousCore/CUDATest/python/prod5CUDADeviceProducer_cfi.py delete mode 100644 HeterogeneousCore/CUDATest/python/prod5Fallback_cfi.py delete mode 100644 HeterogeneousCore/CUDATest/python/prod6Fallback_cfi.py delete mode 100644 HeterogeneousCore/CUDATest/python/setupHeterogeneous.py diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc deleted file mode 100644 index 15216edb020c5..0000000000000 --- a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc +++ /dev/null @@ -1,76 +0,0 @@ -#include "FWCore/Framework/interface/global/EDFilter.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/Frameworkfwd.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" - -#include "chooseCUDADevice.h" - -namespace { - struct DeviceCache { - int device; - bool enabled; - }; -} - -class CUDADeviceChooserFilter: public edm::global::EDFilter> { -public: - explicit 
CUDADeviceChooserFilter(const edm::ParameterSet& iConfig); - ~CUDADeviceChooserFilter() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - - std::unique_ptr<::DeviceCache> beginStream(edm::StreamID id) const; - - bool filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; - -private: - bool enabled_; -}; - -CUDADeviceChooserFilter::CUDADeviceChooserFilter(const edm::ParameterSet& iConfig): - enabled_(iConfig.getParameter("enabled")) -{ - produces(); -} - -void CUDADeviceChooserFilter::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - desc.add("enabled", true)->setComment("This parameter is intended for debugging purposes only. If disabling some CUDA chains is needed for production, it is better to remove the CUDA modules altogether from the configuration."); - descriptions.addWithDefaultLabel(desc); - descriptions.setComment("This EDFilter chooses whether a chain of CUDA EDModules depending on it should run or not, and on which CUDA device they should run. The decision is communicated downstream with the filter decision. 
In addition, if the filter returns true, a 'CUDAToken' is produced into the event (for false nothing is produced)."); -} - -std::unique_ptr<::DeviceCache> CUDADeviceChooserFilter::beginStream(edm::StreamID id) const { - auto ret = std::make_unique<::DeviceCache>(); - - edm::Service cudaService; - ret->enabled = (enabled_ && cudaService->enabled(id)); - if(!ret->enabled) { - return ret; - } - - ret->device = cudacore::chooseCUDADevice(id); - - LogDebug("CUDADeviceChooserFilter") << "EDM stream " << id << " set to CUDA device " << ret->device; - - return ret; -} - -bool CUDADeviceChooserFilter::filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto cache = streamCache(id); - if(!cache->enabled) { - return false; - } - - auto ret = std::make_unique(cache->device); - LogDebug("CUDADeviceChooserFilter") << "EDM stream " << id << " CUDA device " << ret->device() << " with CUDA stream " << ret->stream().id(); - iEvent.put(std::move(ret)); - return true; -} - -DEFINE_FWK_MODULE(CUDADeviceChooserFilter); diff --git a/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserFilter.cc b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserFilter.cc deleted file mode 100644 index 09f419c33357e..0000000000000 --- a/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserFilter.cc +++ /dev/null @@ -1,96 +0,0 @@ -#include "catch.hpp" -#include "FWCore/TestProcessor/interface/TestProcessor.h" -#include "FWCore/Utilities/interface/Exception.h" - -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" - -#include - -static constexpr auto s_tag = "[CUDADeviceChooserFilter]"; - -TEST_CASE("Standard checks of CUDADeviceChooserFilter", s_tag) { - const std::string baseConfig{ -R"_(from FWCore.TestProcessor.TestProcess import * -process = TestProcess() -process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") -process.toTest = cms.EDFilter("CUDADeviceChooserFilter") -process.moduleToTest(process.toTest) -)_" - }; - - 
edm::test::TestProcessor::Config config{ baseConfig }; - SECTION("base configuration is OK") { - REQUIRE_NOTHROW(edm::test::TestProcessor(config)); - } - - SECTION("No event data") { - edm::test::TestProcessor tester(config); - - REQUIRE_NOTHROW(tester.test()); - } - - SECTION("beginJob and endJob only") { - edm::test::TestProcessor tester(config); - - REQUIRE_NOTHROW(tester.testBeginAndEndJobOnly()); - } - - SECTION("Run with no LuminosityBlocks") { - edm::test::TestProcessor tester(config); - - REQUIRE_NOTHROW(tester.testRunWithNoLuminosityBlocks()); - } - - SECTION("LuminosityBlock with no Events") { - edm::test::TestProcessor tester(config); - - REQUIRE_NOTHROW(tester.testLuminosityBlockWithNoEvents()); - } - -} - -TEST_CASE("CUDADeviceChooserFilter enabled", s_tag) { - const std::string config{ -R"_(from FWCore.TestProcessor.TestProcess import * -process = TestProcess() -process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") -process.toTest = cms.EDFilter("CUDADeviceChooserFilter") -process.moduleToTest(process.toTest) -)_" - }; - - int deviceCount = 0; - auto ret = cudaGetDeviceCount( &deviceCount ); - if( ret != cudaSuccess ) { - WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" - << ret << ") " << cudaGetErrorString( ret ) - << ". 
Ignoring tests requiring device to be present."); - return; - } - - SECTION("CUDAToken") { - edm::test::TestProcessor tester{config}; - auto event = tester.test(); - - REQUIRE(event.get()->device() >= 0); - REQUIRE(event.get()->stream().id() != nullptr); - } -} - -TEST_CASE("CUDADeviceChooserFilter disabled", s_tag) { - const std::string config{ -R"_(from FWCore.TestProcessor.TestProcess import * -process = TestProcess() -process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") -process.toTest = cms.EDFilter("CUDADeviceChooserFilter", enabled=cms.bool(False)) -process.moduleToTest(process.toTest) -)_" - }; - - SECTION("CUDAToken") { - edm::test::TestProcessor tester{config}; - auto event = tester.test(); - - REQUIRE_THROWS_AS(event.get()->device(), cms::Exception); - } -} diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerFallback.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerFallback.cc deleted file mode 100644 index fd2b6842c39df..0000000000000 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerFallback.cc +++ /dev/null @@ -1,56 +0,0 @@ -#include "FWCore/Framework/interface/global/EDProducer.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/Frameworkfwd.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/Utilities/interface/transform.h" - -class TestCUDAProducerFallback: public edm::global::EDProducer<> { -public: - explicit TestCUDAProducerFallback(const edm::ParameterSet& iConfig); - ~TestCUDAProducerFallback() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - - void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const; - -private: - std::string label_; - std::vector> tokens_; -}; - -TestCUDAProducerFallback::TestCUDAProducerFallback(const edm::ParameterSet& 
iConfig): - label_(iConfig.getParameter("@module_label")), - tokens_(edm::vector_transform(iConfig.getParameter >("src"), - [this](const edm::InputTag& tag) { - return consumes(tag); - })) -{ - produces(); -} - -void TestCUDAProducerFallback::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - desc.add>("src", std::vector{})->setComment("Ordered list of input 'int' inputs."); - descriptions.addWithDefaultLabel(desc); - descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It acts as an enhanced EDAlias with a defined order of inputs. I.e. if first input is available, copy that. If not, try the next one etc. If no inputs are available, throw an exception. To be replaced with an EDAlias-style feature in the framework."); -} - -void TestCUDAProducerFallback::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::LogPrint("TestCUDAProducerFallback") << label_ << " TestCUDAProducerFallback::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - edm::Handle hin; - for(const auto& token: tokens_) { - edm::EDConsumerBase::Labels labels; - labelsForToken(token, labels); - if(iEvent.getByToken(token, hin)) { - edm::LogPrint("TestCUDAProducerFallback") << label_ << " input " << labels.module << " found"; - iEvent.put(std::make_unique(*hin)); - return; - } - edm::LogPrint("TestCUDAProducerFallback") << label_ << " input " << labels.module << " NOT found"; - } - throw cms::Exception("ProductNotFound") << "Unable to find product 'int' from any of the inputs"; -} - -DEFINE_FWK_MODULE(TestCUDAProducerFallback); diff --git a/HeterogeneousCore/CUDATest/python/prod1CUDADeviceFilter_cfi.py b/HeterogeneousCore/CUDATest/python/prod1CUDADeviceFilter_cfi.py deleted file mode 100644 index c8f9c3574c561..0000000000000 --- a/HeterogeneousCore/CUDATest/python/prod1CUDADeviceFilter_cfi.py +++ /dev/null @@ -1,4 +0,0 @@ -import 
FWCore.ParameterSet.Config as cms - -from HeterogeneousCore.CUDACore.cudaDeviceChooserFilter_cfi import cudaDeviceChooserFilter as _cudaDeviceChooserFilter -prod1CUDADeviceFilter = _cudaDeviceChooserFilter.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod1CUDADeviceProducer_cfi.py b/HeterogeneousCore/CUDATest/python/prod1CUDADeviceProducer_cfi.py new file mode 100644 index 0000000000000..23f9ac24ca16c --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod1CUDADeviceProducer_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDACore.cudaDeviceChooserProducer_cfi import cudaDeviceChooserProducer as _cudaDeviceChooserProducer +prod1CUDADeviceProducer = _cudaDeviceChooserProducer.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py b/HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py index b333aa78562cb..a77f8faee8605 100644 --- a/HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py +++ b/HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py @@ -1,4 +1,4 @@ import FWCore.ParameterSet.Config as cms from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst as _testCUDAProducerGPUFirst -prod1CUDA = _testCUDAProducerGPUFirst.clone(src = "prod1CUDADeviceFilter") +prod1CUDA = _testCUDAProducerGPUFirst.clone(src = "prod1CUDADeviceProducer") diff --git a/HeterogeneousCore/CUDATest/python/prod1_cff.py b/HeterogeneousCore/CUDATest/python/prod1_cff.py index 734609b373ac4..8b08cd94f56b6 100644 --- a/HeterogeneousCore/CUDATest/python/prod1_cff.py +++ b/HeterogeneousCore/CUDATest/python/prod1_cff.py @@ -1,23 +1,17 @@ import FWCore.ParameterSet.Config as cms -from HeterogeneousCore.CUDATest.prod1CUDADeviceFilter_cfi import prod1CUDADeviceFilter -from HeterogeneousCore.CUDATest.prod1CPU_cfi import prod1CPU +from HeterogeneousCore.CUDATest.prod1CUDADeviceProducer_cfi import prod1CUDADeviceProducer +from HeterogeneousCore.CUDATest.prod1CPU_cfi import prod1CPU as _prod1CPU from 
HeterogeneousCore.CUDATest.prod1CUDA_cfi import prod1CUDA -from HeterogeneousCore.CUDATest.prod1FromCUDA_cfi import prod1FromCUDA +from HeterogeneousCore.CUDATest.prod1FromCUDA_cfi import prod1FromCUDA as _prod1FromCUDA -from HeterogeneousCore.CUDATest.testCUDAProducerFallback_cfi import testCUDAProducerFallback as _testCUDAProducerFallback +from Configuration.ProcessModifiers.gpu_cff import gpu -prod1 = _testCUDAProducerFallback.clone(src = ["prod1CUDA", "prod1CPU"]) - -prod1PathCUDA = cms.Path( - prod1CUDADeviceFilter + - prod1CUDA -) -prod1PathCPU = cms.Path( - ~prod1CUDADeviceFilter + - prod1CPU -) +prod1 = _prod1CPU.clone() +gpu.toReplaceWith(prod1, _prod1FromCUDA) prod1Task = cms.Task( - prod1FromCUDA, prod1 + prod1CUDADeviceProducer, + prod1CUDA, + prod1 ) diff --git a/HeterogeneousCore/CUDATest/python/prod5CUDADeviceFilter_cfi.py b/HeterogeneousCore/CUDATest/python/prod5CUDADeviceFilter_cfi.py deleted file mode 100644 index cfe4f4048b8f3..0000000000000 --- a/HeterogeneousCore/CUDATest/python/prod5CUDADeviceFilter_cfi.py +++ /dev/null @@ -1,4 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -from HeterogeneousCore.CUDACore.cudaDeviceChooserFilter_cfi import cudaDeviceChooserFilter as _cudaDeviceChooserFilter -prod5CUDADeviceFilter = _cudaDeviceChooserFilter.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod5CUDADeviceProducer_cfi.py b/HeterogeneousCore/CUDATest/python/prod5CUDADeviceProducer_cfi.py new file mode 100644 index 0000000000000..b94357d967039 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod5CUDADeviceProducer_cfi.py @@ -0,0 +1,4 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDACore.cudaDeviceChooserProducer_cfi import cudaDeviceChooserProducer as _cudaDeviceChooserProducer +prod5CUDADeviceProducer = _cudaDeviceChooserProducer.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py b/HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py index ae7f605d07975..18080f60f4ff2 100644 --- 
a/HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py +++ b/HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py @@ -1,4 +1,4 @@ import FWCore.ParameterSet.Config as cms from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst as _testCUDAProducerGPUFirst -prod5CUDA = _testCUDAProducerGPUFirst.clone(src = "prod5CUDADeviceFilter") +prod5CUDA = _testCUDAProducerGPUFirst.clone(src = "prod5CUDADeviceProducer") diff --git a/HeterogeneousCore/CUDATest/python/prod5Fallback_cfi.py b/HeterogeneousCore/CUDATest/python/prod5Fallback_cfi.py deleted file mode 100644 index 4ad84bdeee4a7..0000000000000 --- a/HeterogeneousCore/CUDATest/python/prod5Fallback_cfi.py +++ /dev/null @@ -1,4 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -from HeterogeneousCore.CUDATest.testCUDAProducerFallback_cfi import testCUDAProducerFallback as _testCUDAProducerFallback -prod5Fallback = _testCUDAProducerFallback.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod5_cff.py b/HeterogeneousCore/CUDATest/python/prod5_cff.py index 2b8948551733c..49609f9a8062e 100644 --- a/HeterogeneousCore/CUDATest/python/prod5_cff.py +++ b/HeterogeneousCore/CUDATest/python/prod5_cff.py @@ -1,9 +1,17 @@ import FWCore.ParameterSet.Config as cms -from HeterogeneousCore.CUDATest.prod5CUDADeviceFilter_cfi import prod5CUDADeviceFilter +from HeterogeneousCore.CUDATest.prod5CUDADeviceProducer_cfi import prod5CUDADeviceProducer +from HeterogeneousCore.CUDATest.prod5CPU_cfi import prod5CPU as _prod5CPU +from HeterogeneousCore.CUDATest.prod5CUDA_cfi import prod5CUDA +from HeterogeneousCore.CUDATest.prod5FromCUDA_cfi import prod5FromCUDA as _prod5FromCUDA -# The prod6 is the final, (legacy) CPU-only product name, and the -# prod6Task is the Task containing all modules. The function itself -# sets up everything else. 
-from HeterogeneousCore.CUDATest.setupHeterogeneous import setupCUDA -(prod5, prod5Task) = setupCUDA("prod5", prod5CUDADeviceFilter, globals()) +from Configuration.ProcessModifiers.gpu_cff import gpu + +prod5 = _prod5CPU.clone() +gpu.toReplaceWith(prod5, _prod5FromCUDA) + +prod5Task = cms.Task( + prod5CUDADeviceProducer, + prod5CUDA, + prod5 +) diff --git a/HeterogeneousCore/CUDATest/python/prod6Fallback_cfi.py b/HeterogeneousCore/CUDATest/python/prod6Fallback_cfi.py deleted file mode 100644 index 60e9f1910188e..0000000000000 --- a/HeterogeneousCore/CUDATest/python/prod6Fallback_cfi.py +++ /dev/null @@ -1,4 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -from HeterogeneousCore.CUDATest.testCUDAProducerFallback_cfi import testCUDAProducerFallback as _testCUDAProducerFallback -prod6Fallback = _testCUDAProducerFallback.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod6_cff.py b/HeterogeneousCore/CUDATest/python/prod6_cff.py index 7df34426ab67f..9847cd896ce5b 100644 --- a/HeterogeneousCore/CUDATest/python/prod6_cff.py +++ b/HeterogeneousCore/CUDATest/python/prod6_cff.py @@ -1,9 +1,16 @@ import FWCore.ParameterSet.Config as cms -from HeterogeneousCore.CUDATest.prod5CUDADeviceFilter_cfi import prod5CUDADeviceFilter +from HeterogeneousCore.CUDATest.prod6CPU_cfi import prod6CPU as _prod6CPU +from HeterogeneousCore.CUDATest.prod6CUDA_cfi import prod6CUDA +from HeterogeneousCore.CUDATest.prod6FromCUDA_cfi import prod6FromCUDA as _prod6FromCUDA + +from Configuration.ProcessModifiers.gpu_cff import gpu + +prod6 = _prod6CPU.clone() +gpu.toReplaceWith(prod6, _prod6FromCUDA) + +prod6Task = cms.Task( + prod6CUDA, + prod6 +) -# The prod6 is the final, (legacy) CPU-only product name, and the -# prod6Task is the Task containing all modules. The function itself -# sets up everything else. 
-from HeterogeneousCore.CUDATest.setupHeterogeneous import setupHeterogeneous -(prod6, prod6Task) = setupHeterogeneous("prod6", ["CUDA", "CPU"], {"CUDA": prod5CUDADeviceFilter}, globals()) diff --git a/HeterogeneousCore/CUDATest/python/setupHeterogeneous.py b/HeterogeneousCore/CUDATest/python/setupHeterogeneous.py deleted file mode 100644 index 630817f27d2b4..0000000000000 --- a/HeterogeneousCore/CUDATest/python/setupHeterogeneous.py +++ /dev/null @@ -1,106 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -# Prototype of the function -import importlib -def setupHeterogeneous(prefix, deviceTypes, deviceFilters, modDict, - package=None, transferModuleNames={}, fallbackModuleName=None): - """ - Mandatory parameters: - prefix -- common prefix of the CPU, CUDA, etc producers - deviceTypes -- list of strings for the device types - deviceFilters -- dict of non-CPU device types to device filter modules - modDict -- globals() - - Optional parameters: - package -- Package of the modules (default None signals to use the current package) - transferModuleName -- Dictionary for names of the device->CPU modules to be loaded and inserted in modDict (if the dictionary does not contain a key 'prefix', a default value of 'prefix+'From' will be used) - fallbackModuleName -- Name of the devices+CPU product fallback producer to be loaded (default None means prefix+'Fallback') - - Returns a pair of - - something which looks like an EDProducer picking the product from devices+CPU - - Task containing all the added modules - """ - path = "" - if package is None: - pkgs = __name__.split(".") - if len(pkgs) > 1: - path = ".".join(pkgs[:-1])+"." - else: - path = package+"." 
- - # Per-device producers - for dt in deviceTypes: - modName = prefix+dt - pkg = importlib.import_module(path+modName+"_cfi") - mod = getattr(pkg, modName) - modDict[modName] = mod - - # device->CPU - for dt in deviceTypes: - if dt == "CPU": - continue - transferModName = transferModuleNames.get(dt, prefix+"From"+dt) - - transferModPath = path+transferModName+"_cfi" - transferModPkg = importlib.import_module(transferModPath) - transferMod = getattr(transferModPkg, transferModName).clone(src=prefix+dt) - modDict[transferModName] = transferMod - - # Fallback - if fallbackModuleName is None: - fallbackModName = prefix+"Fallback" - else: - fallbackModName = fallbackModuleName - fallbackModPath = path+fallbackModName+"_cfi" - fallbackModPkg = importlib.import_module(fallbackModPath) - def _from(s): - if s == "CPU": - return s - return "From"+s - fallback = getattr(fallbackModPkg, fallbackModName).clone(src=[prefix+_from(dt) for dt in deviceTypes]) - - # Paths - tmp = {} - for dt in deviceTypes: - tmp[dt] = cms.Path() - - for dt in deviceTypes: - p = cms.Path() - - # Add inverted filters until the current device type is found, then insert filter and stop - # For [CUDA, FPGA, CPU] results in - # CUDA: CUDAFilter - # FPGA: ~CUDAFilter + FPGAFilter - # CPU: ~CUDAFilter + ~FPGAFilter - for dt2 in deviceTypes: - if dt2 == "CPU": - continue - filt = deviceFilters[dt2] - if dt2 == dt: - p += filt - break - else: - p += ~filt - - # Finally add the producer of the type - p += modDict[prefix+dt] - - # Add (until we get the proper fallback mechanism) the transfer module to the path - if dt != "CPU": - transferModName = transferModuleNames.get(dt, prefix+"From"+dt) - p += modDict[transferModName] - - modDict[prefix+"Path"+dt] = p - - # Task - task = cms.Task(fallback) - - return (fallback, task) - -def setupCUDA(prefix, deviceFilter, modDict, - package=None, transferModule=None, **kwargs): - transfer = {} - if transferModule is not None: - transfer["CUDA"] = transferModule - return 
setupHeterogeneous(prefix, ["CUDA", "CPU"], {"CUDA": deviceFilter}, modDict, - package, transfer, **kwargs) diff --git a/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py index cd5b7da2d3103..626eef7b207e1 100644 --- a/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py +++ b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py @@ -1,6 +1,9 @@ import FWCore.ParameterSet.Config as cms -process = cms.Process("Test") +enableGPU = True + +from Configuration.ProcessModifiers.gpu_cff import gpu +process = cms.Process("Test", gpu) if enableGPU else cms.Process("Test") process.load("FWCore.MessageService.MessageLogger_cfi") process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") @@ -28,9 +31,9 @@ # CPU producers from HeterogeneousCore.CUDATest.testCUDAProducerCPU_cfi import testCUDAProducerCPU -process.prod2CPU = testCUDAProducerCPU.clone(src = "prod1CPU") -process.prod3CPU = testCUDAProducerCPU.clone(src = "prod2CPU") -process.prod4CPU = testCUDAProducerCPU.clone(src = "prod1CPU") +process.prod2 = testCUDAProducerCPU.clone(src = "prod1") +process.prod3 = testCUDAProducerCPU.clone(src = "prod2") +process.prod4 = testCUDAProducerCPU.clone(src = "prod1") from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst from HeterogeneousCore.CUDATest.testCUDAProducerGPU_cfi import testCUDAProducerGPU @@ -43,18 +46,12 @@ process.prod4CUDA = testCUDAProducerGPUEW.clone(src = "prod1CUDA") # Modules to copy data from GPU to CPU (as "on demand" as any other -# EDProducer, i.e. according to consumes() and prefetching) -process.prod2FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod2CUDA") -process.prod3FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod3CUDA") -process.prod4FromCUDA = testCUDAProducerGPUtoCPU.clone(src = "prod4CUDA") - -# These ones are to provide backwards compatibility to the downstream -# clients. 
To be replaced with an enhanced version of EDAlias (with an -# ordered fallback mechanism). -from HeterogeneousCore.CUDATest.testCUDAProducerFallback_cfi import testCUDAProducerFallback -process.prod2 = testCUDAProducerFallback.clone(src = ["prod2FromCUDA", "prod2CPU"]) -process.prod3 = testCUDAProducerFallback.clone(src = ["prod3FromCUDA", "prod3CPU"]) -process.prod4 = testCUDAProducerFallback.clone(src = ["prod4FromCUDA", "prod4CPU"]) +# EDProducer, i.e. according to consumes() and prefetching). If a +# separate conversion step is needed to get the same data formats as +# the CPU modules, those are the ones that should be replaced-with here. +gpu.toReplaceWith(process.prod2, testCUDAProducerGPUtoCPU.clone(src = "prod2CUDA")) +gpu.toReplaceWith(process.prod3, testCUDAProducerGPUtoCPU.clone(src = "prod3CUDA")) +gpu.toReplaceWith(process.prod4, testCUDAProducerGPUtoCPU.clone(src = "prod4CUDA")) process.out = cms.OutputModule("AsciiOutputModule", outputCommands = cms.untracked.vstring( @@ -65,32 +62,17 @@ verbosity = cms.untracked.uint32(0), ) -process.prodCPU1 = cms.Path( - ~process.prod1CUDADeviceFilter + - process.prod2CPU + - process.prod3CPU + - process.prod4CPU -) -process.prodCUDA1 = cms.Path( - process.prod1CUDADeviceFilter + - process.prod2CUDA + - process.prod2FromCUDA + - process.prod3CUDA + - process.prod3FromCUDA + - process.prod4CUDA + - process.prod4FromCUDA -) +process.prod2Task = cms.Task(process.prod2, process.prod2CUDA) +process.prod3Task = cms.Task(process.prod3, process.prod3CUDA) +process.prod4Task = cms.Task(process.prod4, process.prod4CUDA) process.t = cms.Task( - # Eventually the goal is to specify these as part of a Task, - # but (at least) as long as the fallback mechanism is implemented - # with an EDProducer, they must be in a Path.
-# process.prod2CPU, process.prod3CPU, process.prod4CPU, -# process.prod2CUDA, process.prod3CUDA, process.prod4CUDA, -# process.prod2FromCUDA, process.prod3FromCUDA, process.prod4FromCUDA, - - process.prod2, process.prod3, process.prod4, - process.prod1Task, process.prod5Task, process.prod6Task + process.prod1Task, + process.prod2Task, + process.prod3Task, + process.prod4Task, + process.prod5Task, + process.prod6Task ) process.p = cms.Path() process.p.associate(process.t) From 67fa0165bd5aad2814f31dbf51b772f40b48b8e1 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 7 Dec 2018 18:33:07 +0100 Subject: [PATCH 08/49] Moving device selection and stream creation to CUDAScopedContext --- HeterogeneousCore/CUDACore/interface/CUDA.h | 48 +++++++--------- .../CUDACore/interface/CUDAContextToken.h | 14 ++--- .../CUDACore/interface/CUDAScopedContext.h | 38 +++++++------ .../{plugins => interface}/chooseCUDADevice.h | 0 .../plugins/CUDADeviceChooserProducer.cc | 2 +- .../CUDACore/src/CUDAScopedContext.cc | 37 ++++++++----- .../{plugins => src}/chooseCUDADevice.cc | 2 +- HeterogeneousCore/CUDACore/test/TestCUDA.h | 23 -------- HeterogeneousCore/CUDACore/test/test_CUDA.cc | 13 ++--- .../CUDACore/test/test_CUDAScopedContext.cc | 55 +++++++++---------- .../plugins/TestCUDAProducerGPUFirst.cc | 13 +---- .../python/prod1CUDADeviceProducer_cfi.py | 4 -- .../CUDATest/python/prod1CUDA_cfi.py | 2 +- .../CUDATest/python/prod1_cff.py | 2 - .../python/prod5CUDADeviceProducer_cfi.py | 4 -- .../CUDATest/python/prod5CUDA_cfi.py | 2 +- .../CUDATest/python/prod5_cff.py | 2 - .../test/test_TestCUDAProducerGPUFirst.cc | 20 ++----- 18 files changed, 116 insertions(+), 165 deletions(-) rename HeterogeneousCore/CUDACore/{plugins => interface}/chooseCUDADevice.h (100%) rename HeterogeneousCore/CUDACore/{plugins => src}/chooseCUDADevice.cc (89%) delete mode 100644 HeterogeneousCore/CUDACore/test/TestCUDA.h delete mode 100644 HeterogeneousCore/CUDATest/python/prod1CUDADeviceProducer_cfi.py 
delete mode 100644 HeterogeneousCore/CUDATest/python/prod5CUDADeviceProducer_cfi.py diff --git a/HeterogeneousCore/CUDACore/interface/CUDA.h b/HeterogeneousCore/CUDACore/interface/CUDA.h index 6008836aebcff..bda3f4e2a0cd9 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDA.h +++ b/HeterogeneousCore/CUDACore/interface/CUDA.h @@ -30,46 +30,40 @@ class CUDA { CUDA(CUDA&&) = default; CUDA& operator=(CUDA&&) = default; - bool isValid() const { return streamEvent_.get() != nullptr; } + bool isValid() const { return stream_.get() != nullptr; } int device() const { return device_; } - const cuda::stream_t<>& stream() const { return streamEvent_->stream; } - cuda::stream_t<>& stream() { return streamEvent_->stream; } + const cuda::stream_t<>& stream() const { return *stream_; } + cuda::stream_t<>& stream() { return *stream_; } + const std::shared_ptr>& streamPtr() const { return stream_; } - const cuda::event_t& event() const { return streamEvent_->event; } - cuda::event_t& event() { return streamEvent_->event; } + const cuda::event_t& event() const { return *event_; } + cuda::event_t& event() { return *event_; } private: friend class CUDAScopedContext; - friend class TestCUDA; - template - explicit CUDA(T data, const TokenOrContext& token): - streamEvent_(std::make_unique(token)), + // Using template to break circular dependency + template + explicit CUDA(T data, const Context& ctx): + stream_(ctx.streamPtr()), + event_(std::make_unique(cuda::event::create(ctx.device(), + cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much? + cuda::event::dont_record_timings))), // it should be a bit faster to ignore timings data_(std::move(data)), - device_(token.device()) + device_(ctx.device()) {} +private: + // The cuda::stream_t is really shared among edm::Event products, so + // using shared_ptr also here + std::shared_ptr> stream_; // Using unique_ptr to support the default constructor. 
Tried - // std::optional, but cuda::stream_t and cuda::event_t have their - // move assignment operators deleted. Use a struct to save one - // memory allocation. -public: // need to be public for ROOT dicrionary generation? - struct StreamEvent { - template - explicit StreamEvent(const TokenOrContext& token): - stream(token.stream()), - event(cuda::event::create(token.device(), - cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much? - cuda::event::dont_record_timings)) // it should be a bit faster to ignore timings - {} + // std::optional, but cuda::event_t has its move assignment + // operators deleted. + std::unique_ptr event_; - cuda::stream_t<> stream; // stream_t is just a handle, the real CUDA stream is owned by CUDAToken (with long-enough life time) - cuda::event_t event; - }; -private: - std::unique_ptr streamEvent_; T data_; int device_ = -1; }; diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h index e4c214d8e983b..1a599132d13f1 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h @@ -14,26 +14,24 @@ class CUDAContextToken { ~CUDAContextToken() = default; CUDAContextToken(const CUDAContextToken&) = delete; - CUDAContextToken& operator=(const CUDAToken&) = delete; + CUDAContextToken& operator=(const CUDAContextToken&) = delete; CUDAContextToken(CUDAContextToken&&) = default; CUDAContextToken& operator=(CUDAContextToken&& other) = default; private: friend class CUDAScopedContext; - explicit CUDAContextToken(int device, cuda::stream_t<>&& stream): - stream_(std::make_unique>(std::move(stream))), + explicit CUDAContextToken(int device, std::shared_ptr> stream): + stream_(std::move(stream)), device_(device) {} int device() { return device_; } - cuda::stream_t<>&& stream() { - auto ret = std::move(*stream_); - stream_.reset(); - 
return std::move(ret); + std::shared_ptr>&& streamPtr() { + return std::move(stream_); } - std::unique_ptr> stream_; + std::shared_ptr> stream_; int device_; }; diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index e70ae581d8014..6900d3f63442a 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -3,8 +3,8 @@ #include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" #include "FWCore/Utilities/interface/Exception.h" +#include "FWCore/Utilities/interface/StreamID.h" #include "HeterogeneousCore/CUDACore/interface/CUDA.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" #include @@ -18,27 +18,29 @@ */ class CUDAScopedContext { public: - explicit CUDAScopedContext(const CUDAToken& token): - currentDevice_(token.device()), - setDeviceForThisScope_(currentDevice_), - stream_(token.stream()) - {} + explicit CUDAScopedContext(edm::StreamID streamID); + + // This constructor takes the device as a parameter. It is mainly + // intended for testing, but can be used for special cases if you + // really know what you're doing. Please use the StreamID overload + // if at all possible.
+ explicit CUDAScopedContext(int device); explicit CUDAScopedContext(CUDAContextToken&& token): currentDevice_(token.device()), setDeviceForThisScope_(currentDevice_), - stream_(std::move(token.stream())) + stream_(std::move(token.streamPtr())) {} template explicit CUDAScopedContext(const CUDA& data): currentDevice_(data.device()), setDeviceForThisScope_(currentDevice_), - stream_(data.stream()) + stream_(data.streamPtr()) {} - explicit CUDAScopedContext(const CUDAToken& token, edm::WaitingTaskWithArenaHolder waitingTaskHolder): - CUDAScopedContext(token) + explicit CUDAScopedContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder): + CUDAScopedContext(streamID) { waitingTaskHolder_ = waitingTaskHolder; } @@ -54,12 +56,12 @@ class CUDAScopedContext { int device() const { return currentDevice_; } - cuda::stream_t<>& stream() { return stream_; } - const cuda::stream_t<>& stream() const { return stream_; } + cuda::stream_t<>& stream() { return *stream_; } + const cuda::stream_t<>& stream() const { return *stream_; } + const std::shared_ptr> streamPtr() const { return stream_; } CUDAContextToken toToken() { - // TODO: should we add a flag to check whether the CUDAScopedContext is valid or not? - return CUDAContextToken(currentDevice_, std::move(stream_)); + return CUDAContextToken(currentDevice_, stream_); } template @@ -70,7 +72,7 @@ class CUDAScopedContext { throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported"; } - if(data.stream().id() != stream_.id()) { + if(data.stream().id() != stream_->id()) { // Different streams, need to synchronize if(!data.event().has_occurred()) { // Event not yet occurred, so need to add synchronization @@ -78,7 +80,7 @@ class CUDAScopedContext { // wait for an event, so all subsequent work in the stream // will run only after the event has "occurred" (i.e. data // product became available). 
- auto ret = cudaStreamWaitEvent(stream_.id(), data.event().id(), 0); + auto ret = cudaStreamWaitEvent(stream_->id(), data.event().id(), 0); cuda::throw_if_error(ret, "Failed to make a stream to wait for an event"); } } @@ -93,7 +95,7 @@ class CUDAScopedContext { // Record CUDA event to the CUDA stream. The event will become // "occurred" after all work queued to the stream before this // point has been finished. - ret->event().record(stream_.id()); + ret->event().record(stream_->id()); return ret; } @@ -101,7 +103,7 @@ class CUDAScopedContext { int currentDevice_; std::optional waitingTaskHolder_; cuda::device::current::scoped_override_t<> setDeviceForThisScope_; - cuda::stream_t<> stream_; + std::shared_ptr> stream_; }; #endif diff --git a/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.h b/HeterogeneousCore/CUDACore/interface/chooseCUDADevice.h similarity index 100% rename from HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.h rename to HeterogeneousCore/CUDACore/interface/chooseCUDADevice.h diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc index 13a4d6b34e521..c1c3db48413ac 100644 --- a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc +++ b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc @@ -6,9 +6,9 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" +#include "HeterogeneousCore/CUDACore/interface/chooseCUDADevice.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "chooseCUDADevice.h" #include diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc index 639fc4aa2bfb8..a0bcba27c5936 100644 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -1,21 
+1,32 @@ #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "HeterogeneousCore/CUDACore/interface/chooseCUDADevice.h" + + +CUDAScopedContext::CUDAScopedContext(edm::StreamID streamID): CUDAScopedContext(cudacore::chooseCUDADevice(streamID)) {} +CUDAScopedContext::CUDAScopedContext(int device): + currentDevice_(device), + setDeviceForThisScope_(device) +{ + auto current_device = cuda::device::current::get(); + stream_ = std::make_shared>(current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)); +} CUDAScopedContext::~CUDAScopedContext() { if(waitingTaskHolder_.has_value()) { - stream_.enqueue.callback([device=currentDevice_, - waitingTaskHolder=*waitingTaskHolder_] - (cuda::stream::id_t streamId, cuda::status_t status) mutable { - if(cuda::is_success(status)) { - LogTrace("CUDAScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream " << streamId; - waitingTaskHolder.doneWaiting(nullptr); - } - else { - auto error = cudaGetErrorName(status); - auto message = cudaGetErrorString(status); - waitingTaskHolder.doneWaiting(std::make_exception_ptr(cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device << " error " << error << ": " << message)); - } - }); + stream_->enqueue.callback([device=currentDevice_, + waitingTaskHolder=*waitingTaskHolder_] + (cuda::stream::id_t streamId, cuda::status_t status) mutable { + if(cuda::is_success(status)) { + LogTrace("CUDAScopedContext") << " GPU kernel finished (in callback) device " << device << " CUDA stream " << streamId; + waitingTaskHolder.doneWaiting(nullptr); + } + else { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + waitingTaskHolder.doneWaiting(std::make_exception_ptr(cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device << " error " << error << ": " << 
message)); + } + }); } } diff --git a/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc similarity index 89% rename from HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.cc rename to HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc index b17158626a4e8..55764b1137d1f 100644 --- a/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.cc +++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc @@ -1,5 +1,5 @@ -#include "chooseCUDADevice.h" #include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDACore/interface/chooseCUDADevice.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" namespace cudacore { diff --git a/HeterogeneousCore/CUDACore/test/TestCUDA.h b/HeterogeneousCore/CUDACore/test/TestCUDA.h deleted file mode 100644 index d1dd82df1e9ee..0000000000000 --- a/HeterogeneousCore/CUDACore/test/TestCUDA.h +++ /dev/null @@ -1,23 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_TestCUDA_h -#define HeterogeneousCore_CUDACore_TestCUDA_h - -#include "HeterogeneousCore/CUDACore/interface/CUDA.h" - -/** - * This class is intended only for testing purposes. It allows to - * construct CUDA and get the T from CUDA without CUDAScopedContext. - */ -class TestCUDA { -public: - template - static CUDA create(T data, Args&&... 
args) { - return CUDA(std::move(data), std::forward(args)...); - } - - template - static const T& get(const CUDA& data) { - return data.data_; - } -}; - -#endif diff --git a/HeterogeneousCore/CUDACore/test/test_CUDA.cc b/HeterogeneousCore/CUDACore/test/test_CUDA.cc index 8524f6d552bba..6572ffd9d258d 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDA.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDA.cc @@ -1,11 +1,9 @@ #include "catch.hpp" #include "HeterogeneousCore/CUDACore/interface/CUDA.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "TestCUDA.h" - #include TEST_CASE("Use of CUDA template", "[CUDACore]") { @@ -27,13 +25,14 @@ TEST_CASE("Use of CUDA template", "[CUDACore]") { constexpr int defaultDevice = 0; { - auto token = CUDAToken(defaultDevice); - CUDA data = TestCUDA::create(10, token); + auto ctx = CUDAScopedContext(defaultDevice); + std::unique_ptr> dataPtr = ctx.wrap(10); + auto& data = *dataPtr; - SECTION("Construct from CUDAToken") { + SECTION("Construct from CUDAScopedContext") { REQUIRE(data.isValid()); REQUIRE(data.device() == defaultDevice); - REQUIRE(data.stream().id() == token.stream().id()); + REQUIRE(data.stream().id() == ctx.stream().id()); REQUIRE(&data.event() != nullptr); } diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index da05951f9aad0..632e4d7cb493b 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -2,15 +2,13 @@ #include "HeterogeneousCore/CUDACore/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "TestCUDA.h" #include 
"test_CUDAScopedContextKernels.h" namespace { - std::unique_ptr > produce(const CUDAToken& token, int *d, int *h) { - auto ctx = CUDAScopedContext(token); + std::unique_ptr > produce(int device, int *d, int *h) { + auto ctx = CUDAScopedContext(device); cuda::memory::async::copy(d, h, sizeof(int), ctx.stream().id()); testCUDAScopedContextKernels_single(d, ctx.stream()); @@ -30,42 +28,41 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { constexpr int defaultDevice = 0; { - auto token = CUDAToken(defaultDevice); + auto ctx = CUDAScopedContext(defaultDevice); - SECTION("From CUDAToken") { - auto ctx = CUDAScopedContext(token); - REQUIRE(cuda::device::current::get().id() == token.device()); - REQUIRE(ctx.stream().id() == token.stream().id()); - } - - SECTION("From CUDA") { - const CUDA data = TestCUDA::create(10, token); - - auto ctx = CUDAScopedContext(data); - REQUIRE(cuda::device::current::get().id() == data.device()); - REQUIRE(ctx.stream().id() == data.stream().id()); + SECTION("Construct from device ID") { + REQUIRE(cuda::device::current::get().id() == defaultDevice); } SECTION("Wrap T to CUDA") { - auto ctx = CUDAScopedContext(token); - std::unique_ptr > dataPtr = ctx.wrap(10); REQUIRE(dataPtr.get() != nullptr); REQUIRE(dataPtr->device() == ctx.device()); REQUIRE(dataPtr->stream().id() == ctx.stream().id()); } + SECTION("Construct from from CUDA") { + std::unique_ptr> dataPtr = ctx.wrap(10); + const auto& data = *dataPtr; + + auto ctx2 = CUDAScopedContext(data); + REQUIRE(cuda::device::current::get().id() == data.device()); + REQUIRE(ctx2.stream().id() == data.stream().id()); + } + SECTION("Storing state as CUDAContextToken") { CUDAContextToken ctxtok; { // acquire - auto ctx = CUDAScopedContext(token); - ctxtok = ctx.toToken(); + std::unique_ptr> dataPtr = ctx.wrap(10); + const auto& data = *dataPtr; + auto ctx2 = CUDAScopedContext(data); + ctxtok = ctx2.toToken(); } { // produce - auto ctx = CUDAScopedContext(std::move(ctxtok)); - 
REQUIRE(cuda::device::current::get().id() == token.device()); - REQUIRE(ctx.stream().id() == token.stream().id()); + auto ctx2 = CUDAScopedContext(std::move(ctxtok)); + REQUIRE(cuda::device::current::get().id() == ctx.device()); + REQUIRE(ctx2.stream().id() == ctx.stream().id()); } } @@ -76,17 +73,17 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { // Mimick a producer on the second CUDA stream int h_a1 = 1; auto d_a1 = cuda::memory::device::make_unique(current_device); - auto wprod1 = produce(token, d_a1.get(), &h_a1); + auto wprod1 = produce(defaultDevice, d_a1.get(), &h_a1); // Mimick a producer on the second CUDA stream - auto token2 = CUDAToken(defaultDevice); - REQUIRE(token.stream().id() != token2.stream().id()); int h_a2 = 2; auto d_a2 = cuda::memory::device::make_unique(current_device); - auto wprod2 = produce(token2, d_a2.get(), &h_a2); + auto wprod2 = produce(defaultDevice, d_a2.get(), &h_a2); + + REQUIRE(wprod1->stream().id() != wprod2->stream().id()); // Mimick a third producer "joining" the two streams - auto ctx = CUDAScopedContext(token); + auto ctx2 = CUDAScopedContext(*wprod1); auto prod1 = ctx.get(*wprod1); auto prod2 = ctx.get(*wprod2); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc index bf9496f93bf5f..7167f3b895aaf 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -6,7 +6,6 @@ #include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" #include "HeterogeneousCore/CUDACore/interface/CUDA.h" #include "TestCUDAProducerGPUKernel.h" @@ -23,22 +22,19 @@ class TestCUDAProducerGPUFirst: public CUDAStreamEDProducer<> { void produce(edm::Event& iEvent, const edm::EventSetup& iSetup); private: std::string label_; - 
edm::EDGetTokenT srcToken_; std::unique_ptr gpuAlgo_; }; TestCUDAProducerGPUFirst::TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig): - label_(iConfig.getParameter("@module_label")), - srcToken_(consumes(iConfig.getParameter("src"))) + label_(iConfig.getParameter("@module_label")) { produces>(); } void TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag())->setComment("Source of CUDAToken."); descriptions.addWithDefaultLabel(desc); - descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this the first algorithm in the chain of the GPU EDProducers, so it reads a CUDAToken. Produces CUDA."); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this the first algorithm in the chain of the GPU EDProducers. Produces CUDA."); } void TestCUDAProducerGPUFirst::beginStreamCUDA(edm::StreamID id) { @@ -49,10 +45,7 @@ void TestCUDAProducerGPUFirst::beginStreamCUDA(edm::StreamID id) { void TestCUDAProducerGPUFirst::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { edm::LogPrint("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - edm::Handle htoken; - iEvent.getByToken(srcToken_, htoken); - - auto ctx = CUDAScopedContext(*htoken); + auto ctx = CUDAScopedContext(iEvent.streamID()); float *output = gpuAlgo_->runAlgo(label_, ctx.stream()); iEvent.put(ctx.wrap(output)); diff --git a/HeterogeneousCore/CUDATest/python/prod1CUDADeviceProducer_cfi.py b/HeterogeneousCore/CUDATest/python/prod1CUDADeviceProducer_cfi.py deleted file mode 100644 index 23f9ac24ca16c..0000000000000 --- a/HeterogeneousCore/CUDATest/python/prod1CUDADeviceProducer_cfi.py +++ /dev/null @@ -1,4 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -from 
HeterogeneousCore.CUDACore.cudaDeviceChooserProducer_cfi import cudaDeviceChooserProducer as _cudaDeviceChooserProducer -prod1CUDADeviceProducer = _cudaDeviceChooserProducer.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py b/HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py index a77f8faee8605..cf50287bcb15e 100644 --- a/HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py +++ b/HeterogeneousCore/CUDATest/python/prod1CUDA_cfi.py @@ -1,4 +1,4 @@ import FWCore.ParameterSet.Config as cms from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst as _testCUDAProducerGPUFirst -prod1CUDA = _testCUDAProducerGPUFirst.clone(src = "prod1CUDADeviceProducer") +prod1CUDA = _testCUDAProducerGPUFirst.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod1_cff.py b/HeterogeneousCore/CUDATest/python/prod1_cff.py index 8b08cd94f56b6..b15847fb3d935 100644 --- a/HeterogeneousCore/CUDATest/python/prod1_cff.py +++ b/HeterogeneousCore/CUDATest/python/prod1_cff.py @@ -1,6 +1,5 @@ import FWCore.ParameterSet.Config as cms -from HeterogeneousCore.CUDATest.prod1CUDADeviceProducer_cfi import prod1CUDADeviceProducer from HeterogeneousCore.CUDATest.prod1CPU_cfi import prod1CPU as _prod1CPU from HeterogeneousCore.CUDATest.prod1CUDA_cfi import prod1CUDA from HeterogeneousCore.CUDATest.prod1FromCUDA_cfi import prod1FromCUDA as _prod1FromCUDA @@ -11,7 +10,6 @@ gpu.toReplaceWith(prod1, _prod1FromCUDA) prod1Task = cms.Task( - prod1CUDADeviceProducer, prod1CUDA, prod1 ) diff --git a/HeterogeneousCore/CUDATest/python/prod5CUDADeviceProducer_cfi.py b/HeterogeneousCore/CUDATest/python/prod5CUDADeviceProducer_cfi.py deleted file mode 100644 index b94357d967039..0000000000000 --- a/HeterogeneousCore/CUDATest/python/prod5CUDADeviceProducer_cfi.py +++ /dev/null @@ -1,4 +0,0 @@ -import FWCore.ParameterSet.Config as cms - -from HeterogeneousCore.CUDACore.cudaDeviceChooserProducer_cfi import cudaDeviceChooserProducer as _cudaDeviceChooserProducer 
-prod5CUDADeviceProducer = _cudaDeviceChooserProducer.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py b/HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py index 18080f60f4ff2..2eb97358a1b4c 100644 --- a/HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py +++ b/HeterogeneousCore/CUDATest/python/prod5CUDA_cfi.py @@ -1,4 +1,4 @@ import FWCore.ParameterSet.Config as cms from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst as _testCUDAProducerGPUFirst -prod5CUDA = _testCUDAProducerGPUFirst.clone(src = "prod5CUDADeviceProducer") +prod5CUDA = _testCUDAProducerGPUFirst.clone() diff --git a/HeterogeneousCore/CUDATest/python/prod5_cff.py b/HeterogeneousCore/CUDATest/python/prod5_cff.py index 49609f9a8062e..2ced616802f9c 100644 --- a/HeterogeneousCore/CUDATest/python/prod5_cff.py +++ b/HeterogeneousCore/CUDATest/python/prod5_cff.py @@ -1,6 +1,5 @@ import FWCore.ParameterSet.Config as cms -from HeterogeneousCore.CUDATest.prod5CUDADeviceProducer_cfi import prod5CUDADeviceProducer from HeterogeneousCore.CUDATest.prod5CPU_cfi import prod5CPU as _prod5CPU from HeterogeneousCore.CUDATest.prod5CUDA_cfi import prod5CUDA from HeterogeneousCore.CUDATest.prod5FromCUDA_cfi import prod5FromCUDA as _prod5FromCUDA @@ -11,7 +10,6 @@ gpu.toReplaceWith(prod5, _prod5FromCUDA) prod5Task = cms.Task( - prod5CUDADeviceProducer, prod5CUDA, prod5 ) diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index 3a25491418446..9cc28aec8469c 100644 --- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -3,9 +3,7 @@ #include "FWCore/Utilities/interface/Exception.h" #include "HeterogeneousCore/CUDACore/interface/CUDA.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" - -#include "HeterogeneousCore/CUDACore/test/TestCUDA.h" // ugly... 
+#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include @@ -29,9 +27,7 @@ process.moduleToTest(process.toTest) SECTION("No event data") { edm::test::TestProcessor tester(config); - REQUIRE_THROWS_AS(tester.test(), cms::Exception); - //If the module does not throw when given no data, substitute - //REQUIRE_NOTHROW for REQUIRE_THROWS_AS + REQUIRE_NOTHROW(tester.test()); } SECTION("beginJob and endJob only") { @@ -59,9 +55,7 @@ TEST_CASE("TestCUDAProducerGPUFirst operation", s_tag) { R"_(from FWCore.TestProcessor.TestProcess import * process = TestProcess() process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") -process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst", - src = cms.InputTag("deviceChooser") -) +process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst") process.moduleToTest(process.toTest) )_" }; @@ -76,17 +70,15 @@ process.moduleToTest(process.toTest) return; } - auto putToken = config.produces("deviceChooser"); - constexpr int defaultDevice = 0; SECTION("Produce") { edm::test::TestProcessor tester{config}; - auto tokenPtr = std::make_unique(defaultDevice); - auto event = tester.test(std::make_pair(putToken, std::move(tokenPtr))); + auto event = tester.test(); auto prod = event.get >(); REQUIRE(prod->device() == defaultDevice); - const float *data = TestCUDA::get(*prod); + auto ctx = CUDAScopedContext(*prod); + const float *data = ctx.get(*prod); REQUIRE(data != nullptr); float firstElements[10]; From 997cda58eefd55cd58690e57709d6e019c96ff58 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 10 Dec 2018 22:30:42 +0100 Subject: [PATCH 09/49] Remove CUDADeviceChooserProducer as obsolete --- .../CUDACore/plugins/BuildFile.xml | 17 --- .../plugins/CUDADeviceChooserProducer.cc | 68 ------------ .../CUDACore/src/CUDAScopedContext.cc | 3 +- .../CUDACore/src/chooseCUDADevice.cc | 3 +- .../{interface => src}/chooseCUDADevice.h | 0 .../test/test_CUDADeviceChooserProducer.cc | 103 ------------------ 6 files changed, 4 
insertions(+), 190 deletions(-) delete mode 100644 HeterogeneousCore/CUDACore/plugins/BuildFile.xml delete mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc rename HeterogeneousCore/CUDACore/{interface => src}/chooseCUDADevice.h (100%) delete mode 100644 HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc diff --git a/HeterogeneousCore/CUDACore/plugins/BuildFile.xml b/HeterogeneousCore/CUDACore/plugins/BuildFile.xml deleted file mode 100644 index 5749d3ccaf1ad..0000000000000 --- a/HeterogeneousCore/CUDACore/plugins/BuildFile.xml +++ /dev/null @@ -1,17 +0,0 @@ -#Skip building plugins by dropping all files for none-AMD64 build - - - - - - - - - - - - - - - - diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc deleted file mode 100644 index c1c3db48413ac..0000000000000 --- a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc +++ /dev/null @@ -1,68 +0,0 @@ -#include "FWCore/Framework/interface/global/EDProducer.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/Frameworkfwd.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" -#include "HeterogeneousCore/CUDACore/interface/chooseCUDADevice.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" - - -#include - -namespace { - struct DeviceCache { - int device; - }; -} - -class CUDADeviceChooserProducer: public edm::global::EDProducer> { -public: - explicit CUDADeviceChooserProducer(const edm::ParameterSet& iConfig); - ~CUDADeviceChooserProducer() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - - std::unique_ptr<::DeviceCache> 
beginStream(edm::StreamID id) const; - - void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const; -}; - -CUDADeviceChooserProducer::CUDADeviceChooserProducer(const edm::ParameterSet& iConfig) { - edm::Service cudaService; - if(!cudaService->enabled()) { - throw cms::Exception("Configuration") << "CUDAService is disabled so CUDADeviceChooserProducer is unable to make decisions on which CUDA device to run. If you need to run without CUDA devices, please use CUDADeviceChooserFilter for conditional execution, or remove all CUDA modules from your configuration."; - } - produces(); -} - -void CUDADeviceChooserProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - descriptions.addWithDefaultLabel(desc); - descriptions.setComment("This EDProducer chooses on which CUDA device the chain of CUDA EDModules depending on it should run. The decision is communicated downstream with the 'CUDAToken' event product. It is an error if there are no CUDA devices, or CUDAService is disabled."); -} - -std::unique_ptr<::DeviceCache> CUDADeviceChooserProducer::beginStream(edm::StreamID id) const { - auto ret = std::make_unique<::DeviceCache>(); - - edm::Service cudaService; - if(!cudaService->enabled(id)) { - throw cms::Exception("LogicError") << "CUDA is disabled for EDM stream " << id << " in CUDAService, so CUDADeviceChooser is unable to decide the CUDA device for this EDM stream. 
If you need to dynamically decide whether a chain of CUDA EDModules is run or not, please use CUDADeviceChooserFilter instead."; - } - ret->device = cudacore::chooseCUDADevice(id); - - LogDebug("CUDADeviceChooserProducer") << "EDM stream " << id << " set to CUDA device " << ret->device; - - return ret; -} - -void CUDADeviceChooserProducer::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - auto ret = std::make_unique(streamCache(id)->device); - LogDebug("CUDADeviceChooserProducer") << "EDM stream " << id << " CUDA device " << ret->device() << " with CUDA stream " << ret->stream().id(); - iEvent.put(std::move(ret)); -} - - -DEFINE_FWK_MODULE(CUDADeviceChooserProducer); diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc index a0bcba27c5936..702c01448db69 100644 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -1,7 +1,8 @@ #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" -#include "HeterogeneousCore/CUDACore/interface/chooseCUDADevice.h" + +#include "chooseCUDADevice.h" CUDAScopedContext::CUDAScopedContext(edm::StreamID streamID): CUDAScopedContext(cudacore::chooseCUDADevice(streamID)) {} diff --git a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc index 55764b1137d1f..a582ed2f72866 100644 --- a/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc +++ b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.cc @@ -1,7 +1,8 @@ #include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDACore/interface/chooseCUDADevice.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "chooseCUDADevice.h" + namespace cudacore { int chooseCUDADevice(edm::StreamID id) { edm::Service cudaService; diff --git 
a/HeterogeneousCore/CUDACore/interface/chooseCUDADevice.h b/HeterogeneousCore/CUDACore/src/chooseCUDADevice.h similarity index 100% rename from HeterogeneousCore/CUDACore/interface/chooseCUDADevice.h rename to HeterogeneousCore/CUDACore/src/chooseCUDADevice.h diff --git a/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc deleted file mode 100644 index f567838730c41..0000000000000 --- a/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc +++ /dev/null @@ -1,103 +0,0 @@ -#include "catch.hpp" -#include "FWCore/TestProcessor/interface/TestProcessor.h" -#include "FWCore/Utilities/interface/Exception.h" - -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" - -#include - -static constexpr auto s_tag = "[CUDADeviceChooserFilter]"; - -TEST_CASE("Standard checks of CUDADeviceProducer", s_tag) { - const std::string baseConfig{ -R"_(from FWCore.TestProcessor.TestProcess import * -process = TestProcess() -process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") -process.toTest = cms.EDProducer("CUDADeviceChooserProducer") -process.moduleToTest(process.toTest) -)_" - }; - - int deviceCount = 0; - auto ret = cudaGetDeviceCount( &deviceCount ); - if( ret != cudaSuccess ) { - WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" - << ret << ") " << cudaGetErrorString( ret ) - << ". 
Ignoring tests requiring device to be present."); - return; - } - - edm::test::TestProcessor::Config config{ baseConfig }; - SECTION("base configuration is OK") { - REQUIRE_NOTHROW(edm::test::TestProcessor(config)); - } - - SECTION("No event data") { - edm::test::TestProcessor tester(config); - - REQUIRE_NOTHROW(tester.test()); - } - - SECTION("beginJob and endJob only") { - edm::test::TestProcessor tester(config); - - REQUIRE_NOTHROW(tester.testBeginAndEndJobOnly()); - } - - SECTION("Run with no LuminosityBlocks") { - edm::test::TestProcessor tester(config); - - REQUIRE_NOTHROW(tester.testRunWithNoLuminosityBlocks()); - } - - SECTION("LuminosityBlock with no Events") { - edm::test::TestProcessor tester(config); - - REQUIRE_NOTHROW(tester.testLuminosityBlockWithNoEvents()); - } - -} - -TEST_CASE("CUDAService enabled", s_tag) { - const std::string config{ -R"_(from FWCore.TestProcessor.TestProcess import * -process = TestProcess() -process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") -process.toTest = cms.EDProducer("CUDADeviceChooserProducer") -process.moduleToTest(process.toTest) -)_" - }; - - int deviceCount = 0; - auto ret = cudaGetDeviceCount( &deviceCount ); - if( ret != cudaSuccess ) { - WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" - << ret << ") " << cudaGetErrorString( ret ) - << ". 
Ignoring tests requiring device to be present."); - return; - } - - SECTION("CUDAToken") { - edm::test::TestProcessor tester{config}; - auto event = tester.test(); - - REQUIRE(event.get()->device() >= 0); - REQUIRE(event.get()->stream().id() != nullptr); - } -} - -TEST_CASE("CUDAService disabled", s_tag) { - const std::string config{ -R"_(from FWCore.TestProcessor.TestProcess import * -process = TestProcess() -process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") -process.CUDAService.enabled = False -process.toTest = cms.EDProducer("CUDADeviceChooserProducer") -process.moduleToTest(process.toTest) -)_" - }; - - SECTION("Construction") { - REQUIRE_THROWS_AS(edm::test::TestProcessor{config}, cms::Exception); - } -} From e62f9dcf0e4f16162a4da69f2438c4ed7112df7c Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 10 Dec 2018 23:43:39 +0100 Subject: [PATCH 10/49] Remove CUDAToken as obsolete --- .../CUDACore/interface/CUDAToken.h | 43 ------------------- HeterogeneousCore/CUDACore/src/CUDAToken.cc | 23 ---------- HeterogeneousCore/CUDACore/src/classes.h | 3 -- .../CUDACore/src/classes_def.xml | 3 -- 4 files changed, 72 deletions(-) delete mode 100644 HeterogeneousCore/CUDACore/interface/CUDAToken.h delete mode 100644 HeterogeneousCore/CUDACore/src/CUDAToken.cc diff --git a/HeterogeneousCore/CUDACore/interface/CUDAToken.h b/HeterogeneousCore/CUDACore/interface/CUDAToken.h deleted file mode 100644 index ef879677c720b..0000000000000 --- a/HeterogeneousCore/CUDACore/interface/CUDAToken.h +++ /dev/null @@ -1,43 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_CUDAToken_h -#define HeterogeneousCore_CUDACore_CUDAToken_h - -#include - -#include - -/** - * The purpose of this class is to deliver the device and CUDA stream - * information from CUDADeviceChooser to the EDModules with CUDA - * implementation. 
- * - * Currently the class is declared as transient in the dictionary, but - * in principle (for debugging purposes) it could be possible to - * persist it by marking only the CUDA stream as transient. - * - * Note that the CUDA stream is returned only as a const reference. - * Various methods (e.g. cuda::stream_t<>::synchronize()) are - * non-const, but on the other hand cuda:stream_t is just a handle - * wrapping the real CUDA stream, and can thus be cheaply copied as a - * non-owning non-const handle. - */ -class CUDAToken { -public: - CUDAToken() = default; - explicit CUDAToken(int device); - - ~CUDAToken(); - - CUDAToken(const CUDAToken&) = delete; - CUDAToken& operator=(const CUDAToken&) = delete; - CUDAToken(CUDAToken&&) = default; - CUDAToken& operator=(CUDAToken&&) = default; - - int device() const { return device_; } - const cuda::stream_t<>& stream() const { return *stream_; } - -private: - std::unique_ptr> stream_; - int device_ = -1; -}; - -#endif diff --git a/HeterogeneousCore/CUDACore/src/CUDAToken.cc b/HeterogeneousCore/CUDACore/src/CUDAToken.cc deleted file mode 100644 index 3493bd49a0fcd..0000000000000 --- a/HeterogeneousCore/CUDACore/src/CUDAToken.cc +++ /dev/null @@ -1,23 +0,0 @@ -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" - -namespace { - auto make_stream(int device) { - cuda::device::current::scoped_override_t<> setDeviceForThisScope(device); - auto current_device = cuda::device::current::get(); - return std::make_unique>(current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)); - } -} - -CUDAToken::CUDAToken(int device): - stream_(make_stream(device)), - device_(device) -{} - -CUDAToken::~CUDAToken() { - if(stream_) { - // The current memory allocation model (large blocks) requires the - // CUDA stream to be synchronized before moving on to the next - // event in the EDM stream in order to avoid race conditions. 
- stream_->synchronize(); - } -} diff --git a/HeterogeneousCore/CUDACore/src/classes.h b/HeterogeneousCore/CUDACore/src/classes.h index 8e7fcca0008f0..0cf6aaf0cb393 100644 --- a/HeterogeneousCore/CUDACore/src/classes.h +++ b/HeterogeneousCore/CUDACore/src/classes.h @@ -1,11 +1,8 @@ #include "DataFormats/Common/interface/Wrapper.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h" #include "HeterogeneousCore/CUDACore/interface/CUDA.h" namespace { struct dictionary { - CUDAToken ct; - // These should really be placed elsewhere? CUDA cf; }; diff --git a/HeterogeneousCore/CUDACore/src/classes_def.xml b/HeterogeneousCore/CUDACore/src/classes_def.xml index b551c0c74d721..7884ba28175d5 100644 --- a/HeterogeneousCore/CUDACore/src/classes_def.xml +++ b/HeterogeneousCore/CUDACore/src/classes_def.xml @@ -1,7 +1,4 @@ - - - From e8c9f89e0f8c6e6163bc4cfc08a643c631ae4a43 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 11 Dec 2018 00:35:18 +0100 Subject: [PATCH 11/49] Normal EDProducers with dynamic memory management --- HeterogeneousCore/CUDACore/BuildFile.xml | 1 - HeterogeneousCore/CUDACore/interface/CUDA.h | 8 ++--- .../CUDACore/interface/CUDAScopedContext.h | 2 ++ HeterogeneousCore/CUDACore/src/classes.h | 9 ----- .../CUDACore/src/classes_def.xml | 4 --- HeterogeneousCore/CUDATest/BuildFile.xml | 3 ++ .../CUDATest/interface/CUDAThing.h | 19 ++++++++++ .../CUDATest/plugins/TestCUDAProducerGPU.cc | 35 ++++++++----------- .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 33 +++++++---------- .../plugins/TestCUDAProducerGPUFirst.cc | 28 ++++++--------- .../plugins/TestCUDAProducerGPUKernel.cu | 32 ++++++++--------- .../plugins/TestCUDAProducerGPUKernel.h | 28 +++++---------- .../plugins/TestCUDAProducerGPUtoCPU.cc | 32 ++++++++--------- HeterogeneousCore/CUDATest/src/classes.h | 3 ++ .../CUDATest/src/classes_def.xml | 4 +++ .../test/test_TestCUDAProducerGPUFirst.cc | 6 ++-- 16 files changed, 116 insertions(+), 131 deletions(-) delete mode 100644 
HeterogeneousCore/CUDACore/src/classes.h delete mode 100644 HeterogeneousCore/CUDACore/src/classes_def.xml create mode 100644 HeterogeneousCore/CUDATest/BuildFile.xml create mode 100644 HeterogeneousCore/CUDATest/interface/CUDAThing.h create mode 100644 HeterogeneousCore/CUDATest/src/classes.h create mode 100644 HeterogeneousCore/CUDATest/src/classes_def.xml diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml index 1ebe999f9746c..9f4b814ad644a 100644 --- a/HeterogeneousCore/CUDACore/BuildFile.xml +++ b/HeterogeneousCore/CUDACore/BuildFile.xml @@ -6,7 +6,6 @@ - diff --git a/HeterogeneousCore/CUDACore/interface/CUDA.h b/HeterogeneousCore/CUDACore/interface/CUDA.h index bda3f4e2a0cd9..6fc3a93aacde1 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDA.h +++ b/HeterogeneousCore/CUDACore/interface/CUDA.h @@ -58,14 +58,14 @@ class CUDA { private: // The cuda::stream_t is really shared among edm::Event products, so // using shared_ptr also here - std::shared_ptr> stream_; + std::shared_ptr> stream_; //! // Using unique_ptr to support the default constructor. Tried // std::optional, but cuda::event_t has its move assignment // operators deleted. - std::unique_ptr event_; + std::unique_ptr event_; //! - T data_; - int device_ = -1; + T data_; //! + int device_ = -1; //! 
}; #endif diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index 6900d3f63442a..5c01e9a8f7fdc 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -7,6 +7,8 @@ #include "HeterogeneousCore/CUDACore/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" +#include + #include /** diff --git a/HeterogeneousCore/CUDACore/src/classes.h b/HeterogeneousCore/CUDACore/src/classes.h deleted file mode 100644 index 0cf6aaf0cb393..0000000000000 --- a/HeterogeneousCore/CUDACore/src/classes.h +++ /dev/null @@ -1,9 +0,0 @@ -#include "DataFormats/Common/interface/Wrapper.h" -#include "HeterogeneousCore/CUDACore/interface/CUDA.h" - -namespace { - struct dictionary { - // These should really be placed elsewhere? - CUDA cf; - }; -} diff --git a/HeterogeneousCore/CUDACore/src/classes_def.xml b/HeterogeneousCore/CUDACore/src/classes_def.xml deleted file mode 100644 index 7884ba28175d5..0000000000000 --- a/HeterogeneousCore/CUDACore/src/classes_def.xml +++ /dev/null @@ -1,4 +0,0 @@ - - - - diff --git a/HeterogeneousCore/CUDATest/BuildFile.xml b/HeterogeneousCore/CUDATest/BuildFile.xml new file mode 100644 index 0000000000000..112c200812d98 --- /dev/null +++ b/HeterogeneousCore/CUDATest/BuildFile.xml @@ -0,0 +1,3 @@ + + + diff --git a/HeterogeneousCore/CUDATest/interface/CUDAThing.h b/HeterogeneousCore/CUDATest/interface/CUDAThing.h new file mode 100644 index 0000000000000..294374bba1ce5 --- /dev/null +++ b/HeterogeneousCore/CUDATest/interface/CUDAThing.h @@ -0,0 +1,19 @@ +#ifndef HeterogeneousCore_CUDATest_CUDAThing_H +#define HeterogeneousCore_CUDATest_CUDAThing_H + +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" + +class CUDAThing { +public: + CUDAThing() = default; + CUDAThing(edm::cuda::device::unique_ptr ptr): + ptr_(std::move(ptr)) + {} + + const float *get() const { return 
ptr_.get(); } + +private: + edm::cuda::device::unique_ptr ptr_;; +}; + +#endif diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc index bf7b6d8563d66..56e0b3601c8d1 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc @@ -1,59 +1,52 @@ #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/Frameworkfwd.h" #include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" - -#include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" #include "TestCUDAProducerGPUKernel.h" -class TestCUDAProducerGPU: public CUDAStreamEDProducer<> { +class TestCUDAProducerGPU: public edm::global::EDProducer<> { public: explicit TestCUDAProducerGPU(const edm::ParameterSet& iConfig); ~TestCUDAProducerGPU() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - void beginStreamCUDA(edm::StreamID id) override; - - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup); + void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; private: std::string label_; - edm::EDGetTokenT> srcToken_; - std::unique_ptr gpuAlgo_; + edm::EDGetTokenT> srcToken_; + TestCUDAProducerGPUKernel gpuAlgo_; }; TestCUDAProducerGPU::TestCUDAProducerGPU(const edm::ParameterSet& iConfig): label_(iConfig.getParameter("@module_label")), - srcToken_(consumes>(iConfig.getParameter("src"))) + srcToken_(consumes>(iConfig.getParameter("src"))) { - produces>(); + produces>(); } void 
TestCUDAProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag())->setComment("Source of CUDA."); + desc.add("src", edm::InputTag())->setComment("Source of CUDA."); descriptions.addWithDefaultLabel(desc); - descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this is not the first algorithm in the chain of the GPU EDProducers. Produces CUDA."); -} - -void TestCUDAProducerGPU::beginStreamCUDA(edm::StreamID id) { - // Allocate device memory via RAII - gpuAlgo_ = std::make_unique(); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this is not the first algorithm in the chain of the GPU EDProducers. Produces CUDA."); } -void TestCUDAProducerGPU::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { edm::LogPrint("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - edm::Handle > hin; + edm::Handle> hin; iEvent.getByToken(srcToken_, hin); auto ctx = CUDAScopedContext(*hin); - const float *input = ctx.get(*hin); + const CUDAThing& input = ctx.get(*hin); - iEvent.put(ctx.wrap(gpuAlgo_->runAlgo(label_, input, ctx.stream()))); + iEvent.put(ctx.wrap(CUDAThing(gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())))); edm::LogPrint("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index cc20789f3451b..3a87d99e4c08d 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ 
b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -1,41 +1,40 @@ #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/Frameworkfwd.h" #include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" #include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" #include "TestCUDAProducerGPUKernel.h" -class TestCUDAProducerGPUEW: public CUDAStreamEDProducer { +class TestCUDAProducerGPUEW: public edm::stream::EDProducer { public: explicit TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig); ~TestCUDAProducerGPUEW() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - void beginStreamCUDA(edm::StreamID id) override; - void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; private: std::string label_; - edm::EDGetTokenT> srcToken_; - std::unique_ptr gpuAlgo_; + edm::EDGetTokenT> srcToken_; + TestCUDAProducerGPUKernel gpuAlgo_; CUDAContextToken ctxTmp_; - float *devicePtr_ = nullptr; + edm::cuda::device::unique_ptr devicePtr_; float hostData_ = 0.f; }; TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig): label_(iConfig.getParameter("@module_label")), - srcToken_(consumes>(iConfig.getParameter("src"))) + srcToken_(consumes>(iConfig.getParameter("src"))) { - produces>(); + produces>(); } void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ 
-44,24 +43,19 @@ void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& des descriptions.addWithDefaultLabel(desc); } -void TestCUDAProducerGPUEW::beginStreamCUDA(edm::StreamID id) { - // Allocate device memory via RAII - gpuAlgo_ = std::make_unique(); -} - void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - edm::Handle > hin; + edm::Handle> hin; iEvent.getByToken(srcToken_, hin); auto ctx = CUDAScopedContext(*hin, std::move(waitingTaskHolder)); - const float *input = ctx.get(*hin); + const CUDAThing& input = ctx.get(*hin); - devicePtr_ = gpuAlgo_->runAlgo(label_, input, ctx.stream()); + devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream()); // Mimick the need to transfer some of the GPU data back to CPU to // be used for something within this module, or to be put in the // event. 
- cuda::memory::async::copy(&hostData_, devicePtr_+10, sizeof(float), ctx.stream().id()); + cuda::memory::async::copy(&hostData_, devicePtr_.get()+10, sizeof(float), ctx.stream().id()); edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); @@ -73,8 +67,7 @@ void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, const edm::EventSetup& i auto ctx = CUDAScopedContext(std::move(ctxTmp_)); - iEvent.put(ctx.wrap(devicePtr_)); - devicePtr_ = nullptr; + iEvent.put(ctx.wrap(CUDAThing(std::move(devicePtr_)))); edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc index 7167f3b895aaf..9798aac2d24b3 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -1,54 +1,48 @@ #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/Frameworkfwd.h" #include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" #include "TestCUDAProducerGPUKernel.h" -class TestCUDAProducerGPUFirst: public CUDAStreamEDProducer<> { +class TestCUDAProducerGPUFirst: public edm::global::EDProducer<> { public: explicit TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig); ~TestCUDAProducerGPUFirst() override = default; static 
void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - void beginStreamCUDA(edm::StreamID id) override; - - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup); + void produce(edm::StreamID stream, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; private: std::string label_; - std::unique_ptr gpuAlgo_; + TestCUDAProducerGPUKernel gpuAlgo_; }; TestCUDAProducerGPUFirst::TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig): label_(iConfig.getParameter("@module_label")) { - produces>(); + produces>(); } void TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; descriptions.addWithDefaultLabel(desc); - descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this the first algorithm in the chain of the GPU EDProducers. Produces CUDA."); -} - -void TestCUDAProducerGPUFirst::beginStreamCUDA(edm::StreamID id) { - // Allocate device memory via RAII - gpuAlgo_ = std::make_unique(); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this the first algorithm in the chain of the GPU EDProducers. 
Produces CUDA."); } -void TestCUDAProducerGPUFirst::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { +void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { edm::LogPrint("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - auto ctx = CUDAScopedContext(iEvent.streamID()); + auto ctx = CUDAScopedContext(streamID); - float *output = gpuAlgo_->runAlgo(label_, ctx.stream()); - iEvent.put(ctx.wrap(output)); + edm::cuda::device::unique_ptr output = gpuAlgo_.runAlgo(label_, ctx.stream()); + iEvent.put(ctx.wrap(CUDAThing(std::move(output)))); edm::LogPrint("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu index aa22d330be374..933e5deea1266 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu @@ -2,6 +2,8 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/Utilities/interface/Exception.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" namespace { template @@ -52,21 +54,7 @@ namespace { } } -TestCUDAProducerGPUKernel::TestCUDAProducerGPUKernel() { - h_a = cuda::memory::host::make_unique(NUM_VALUES); - h_b = cuda::memory::host::make_unique(NUM_VALUES); - - auto current_device = cuda::device::current::get(); - d_a = cuda::memory::device::make_unique(current_device, NUM_VALUES); - d_b = cuda::memory::device::make_unique(current_device, NUM_VALUES); - d_c = cuda::memory::device::make_unique(current_device, NUM_VALUES); - - d_ma = 
cuda::memory::device::make_unique(current_device, NUM_VALUES*NUM_VALUES); - d_mb = cuda::memory::device::make_unique(current_device, NUM_VALUES*NUM_VALUES); - d_mc = cuda::memory::device::make_unique(current_device, NUM_VALUES*NUM_VALUES); -} - -float *TestCUDAProducerGPUKernel::runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) { +edm::cuda::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) const { // First make the sanity check if(d_input != nullptr) { auto h_check = std::make_unique(NUM_VALUES); @@ -78,21 +66,33 @@ float *TestCUDAProducerGPUKernel::runAlgo(const std::string& label, const float } } + edm::Service cs; + + auto h_a = cs->make_host_unique(NUM_VALUES, stream); + auto h_b = cs->make_host_unique(NUM_VALUES, stream); + for (auto i=0; imake_device_unique(NUM_VALUES, stream); + auto d_b = cs->make_device_unique(NUM_VALUES, stream); + cuda::memory::async::copy(d_a.get(), h_a.get(), NUM_VALUES*sizeof(float), stream.id()); cuda::memory::async::copy(d_b.get(), h_b.get(), NUM_VALUES*sizeof(float), stream.id()); int threadsPerBlock {32}; int blocksPerGrid = (NUM_VALUES + threadsPerBlock - 1) / threadsPerBlock; + auto d_c = cs->make_device_unique(NUM_VALUES, stream); auto current_device = cuda::device::current::get(); edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label << " GPU launching kernels device " << current_device.id() << " CUDA stream " << stream.id(); vectorAdd<<>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); + auto d_ma = cs->make_device_unique(NUM_VALUES*NUM_VALUES, stream); + auto d_mb = cs->make_device_unique(NUM_VALUES*NUM_VALUES, stream); + auto d_mc = cs->make_device_unique(NUM_VALUES*NUM_VALUES, stream); dim3 threadsPerBlock3{NUM_VALUES, NUM_VALUES}; dim3 blocksPerGrid3{1,1}; if(NUM_VALUES*NUM_VALUES > 32) { @@ -108,5 +108,5 @@ float *TestCUDAProducerGPUKernel::runAlgo(const std::string& label, const float 
matrixMulVector<<>>(d_mc.get(), d_b.get(), d_c.get(), NUM_VALUES); edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label << " GPU kernels launched, returning return pointer device " << current_device.id() << " CUDA stream " << stream.id(); - return d_a.get(); + return d_a; } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h index e462516716f70..c8fe63fe2b2f0 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h @@ -1,12 +1,14 @@ #ifndef HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h #define HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" + #include /** * This class models the actual CUDA implementation of an algorithm. - * It follows RAII, i.e. does all memory allocations in its - * constructor. + * + * Memory is allocated dynamically with the allocator in CUDAService * * The algorithm is intended to waste time with large matrix * operations so that the asynchronous nature of the CUDA integration @@ -16,28 +18,14 @@ class TestCUDAProducerGPUKernel { public: static constexpr int NUM_VALUES = 4000; - TestCUDAProducerGPUKernel(); + TestCUDAProducerGPUKernel() = default; ~TestCUDAProducerGPUKernel() = default; - // returns (non-owning) pointer to device memory - float *runAlgo(const std::string& label, cuda::stream_t<>& stream) { + // returns (owning) pointer to device memory + edm::cuda::device::unique_ptr runAlgo(const std::string& label, cuda::stream_t<>& stream) const { return runAlgo(label, nullptr, stream); } - float *runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream); - -private: - // stored for the job duration - cuda::memory::host::unique_ptr h_a; - cuda::memory::host::unique_ptr h_b; - cuda::memory::device::unique_ptr d_a; - cuda::memory::device::unique_ptr d_b; 
- cuda::memory::device::unique_ptr d_c; - cuda::memory::device::unique_ptr d_ma; - cuda::memory::device::unique_ptr d_mb; - cuda::memory::device::unique_ptr d_mc; - - // temporary storage, need to be somewhere to allow async execution - cuda::memory::device::unique_ptr d_d; + edm::cuda::device::unique_ptr runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) const; }; #endif diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index 7ab018eba6402..840ff8253c353 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -1,63 +1,60 @@ #include "FWCore/Framework/interface/Event.h" #include "FWCore/Framework/interface/Frameworkfwd.h" #include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDACore/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" #include "TestCUDAProducerGPUKernel.h" -class TestCUDAProducerGPUtoCPU: public CUDAStreamEDProducer { +class TestCUDAProducerGPUtoCPU: public edm::stream::EDProducer { public: explicit TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig); ~TestCUDAProducerGPUtoCPU() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - void beginStreamCUDA(edm::StreamID id) override; - void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) 
override; void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; private: std::string label_; - edm::EDGetTokenT> srcToken_; - cuda::memory::host::unique_ptr buffer_; + edm::EDGetTokenT> srcToken_; + edm::cuda::host::unique_ptr buffer_; }; TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig): label_(iConfig.getParameter("@module_label")), - srcToken_(consumes>(iConfig.getParameter("src"))) + srcToken_(consumes>(iConfig.getParameter("src"))) { produces(); } void TestCUDAProducerGPUtoCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag())->setComment("Source for CUDA."); + desc.add("src", edm::InputTag())->setComment("Source for CUDA."); descriptions.addWithDefaultLabel(desc); descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models the GPU->CPU data transfer and formatting of the data to legacy data format. Produces int, to be compatible with TestCUDAProducerCPU."); } -void TestCUDAProducerGPUtoCPU::beginStreamCUDA(edm::StreamID id) { - // Pinned host memory has to be allocated here as well so that it is - // not done when running on a non-GPU machine - buffer_ = cuda::memory::host::make_unique(TestCUDAProducerGPUKernel::NUM_VALUES); -} - void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { edm::LogPrint("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - edm::Handle> hin; + edm::Handle> hin; iEvent.getByToken(srcToken_, hin); auto ctx = CUDAScopedContext(*hin, std::move(waitingTaskHolder)); - const float *device = ctx.get(*hin); + const CUDAThing& device = ctx.get(*hin); + edm::Service cs; + buffer_ = cs->make_host_unique(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); // Enqueue async copy, continue 
in produce once finished - cuda::memory::async::copy(buffer_.get(), device, TestCUDAProducerGPUKernel::NUM_VALUES*sizeof(float), ctx.stream().id()); + cuda::memory::async::copy(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES*sizeof(float), ctx.stream().id()); edm::LogPrint("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } @@ -69,6 +66,7 @@ void TestCUDAProducerGPUtoCPU::produce(edm::Event& iEvent, const edm::EventSetup for(int i=0; i(counter)); diff --git a/HeterogeneousCore/CUDATest/src/classes.h b/HeterogeneousCore/CUDATest/src/classes.h new file mode 100644 index 0000000000000..b028864460fe8 --- /dev/null +++ b/HeterogeneousCore/CUDATest/src/classes.h @@ -0,0 +1,3 @@ +#include "DataFormats/Common/interface/Wrapper.h" +#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" diff --git a/HeterogeneousCore/CUDATest/src/classes_def.xml b/HeterogeneousCore/CUDATest/src/classes_def.xml new file mode 100644 index 0000000000000..5ae2e0f8117fb --- /dev/null +++ b/HeterogeneousCore/CUDATest/src/classes_def.xml @@ -0,0 +1,4 @@ + + + + diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index 9cc28aec8469c..0ab921029d1ac 100644 --- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -4,6 +4,7 @@ #include "HeterogeneousCore/CUDACore/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" #include @@ -75,10 +76,11 @@ process.moduleToTest(process.toTest) SECTION("Produce") { edm::test::TestProcessor tester{config}; auto event = tester.test(); - auto prod = event.get >(); + auto prod = event.get >(); REQUIRE(prod->device() == 
defaultDevice); auto ctx = CUDAScopedContext(*prod); - const float *data = ctx.get(*prod); + const CUDAThing& thing = ctx.get(*prod); + const float *data = thing.get(); REQUIRE(data != nullptr); float firstElements[10]; From 1ce6d8a0cb8531eda1e24489eb8f57166c1882b2 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 12 Dec 2018 18:40:16 +0100 Subject: [PATCH 12/49] Move CUDA.h to CUDADataFormats/Common --- .../CUDACore => CUDADataFormats/Common}/interface/CUDA.h | 4 ++-- CUDADataFormats/Common/test/BuildFile.xml | 5 +++++ .../CUDACore => CUDADataFormats/Common}/test/test_CUDA.cc | 2 +- CUDADataFormats/Common/test/test_main.cc | 2 ++ HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h | 2 +- HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc | 2 +- HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc | 3 ++- HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc | 2 +- .../CUDATest/plugins/TestCUDAProducerGPUFirst.cc | 2 +- .../CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc | 2 +- HeterogeneousCore/CUDATest/src/classes.h | 2 +- .../CUDATest/test/test_TestCUDAProducerGPUFirst.cc | 2 +- 12 files changed, 19 insertions(+), 11 deletions(-) rename {HeterogeneousCore/CUDACore => CUDADataFormats/Common}/interface/CUDA.h (96%) create mode 100644 CUDADataFormats/Common/test/BuildFile.xml rename {HeterogeneousCore/CUDACore => CUDADataFormats/Common}/test/test_CUDA.cc (96%) create mode 100644 CUDADataFormats/Common/test/test_main.cc diff --git a/HeterogeneousCore/CUDACore/interface/CUDA.h b/CUDADataFormats/Common/interface/CUDA.h similarity index 96% rename from HeterogeneousCore/CUDACore/interface/CUDA.h rename to CUDADataFormats/Common/interface/CUDA.h index 6fc3a93aacde1..b0c6ee965ad3b 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDA.h +++ b/CUDADataFormats/Common/interface/CUDA.h @@ -1,5 +1,5 @@ -#ifndef HeterogeneousCore_CUDACore_CUDA_h -#define HeterogeneousCore_CUDACore_CUDA_h +#ifndef CUDADataFormats_Common_CUDA_h +#define 
CUDADataFormats_Common_CUDA_h #include diff --git a/CUDADataFormats/Common/test/BuildFile.xml b/CUDADataFormats/Common/test/BuildFile.xml new file mode 100644 index 0000000000000..5e804fe80a736 --- /dev/null +++ b/CUDADataFormats/Common/test/BuildFile.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/HeterogeneousCore/CUDACore/test/test_CUDA.cc b/CUDADataFormats/Common/test/test_CUDA.cc similarity index 96% rename from HeterogeneousCore/CUDACore/test/test_CUDA.cc rename to CUDADataFormats/Common/test/test_CUDA.cc index 6572ffd9d258d..a6c22833a10c4 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDA.cc +++ b/CUDADataFormats/Common/test/test_CUDA.cc @@ -1,6 +1,6 @@ #include "catch.hpp" -#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" diff --git a/CUDADataFormats/Common/test/test_main.cc b/CUDADataFormats/Common/test/test_main.cc new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/CUDADataFormats/Common/test/test_main.cc @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index 5c01e9a8f7fdc..8a59e9d0d22dc 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -4,7 +4,7 @@ #include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" #include "FWCore/Utilities/interface/Exception.h" #include "FWCore/Utilities/interface/StreamID.h" -#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" #include diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc 
b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 632e4d7cb493b..ff18c068aedb6 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -1,6 +1,6 @@ #include "catch.hpp" -#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc index 56e0b3601c8d1..6b07eece6f3ec 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc @@ -4,8 +4,9 @@ #include "FWCore/Framework/interface/global/EDProducer.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" + +#include "CUDADataFormats/Common/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/CUDA.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" #include "TestCUDAProducerGPUKernel.h" diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index 3a87d99e4c08d..e69e5966f8995 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -5,9 +5,9 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "CUDADataFormats/Common/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" -#include "HeterogeneousCore/CUDACore/interface/CUDA.h" #include 
"HeterogeneousCore/CUDATest/interface/CUDAThing.h" #include "TestCUDAProducerGPUKernel.h" diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc index 9798aac2d24b3..b38d41a02e591 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -5,8 +5,8 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "CUDADataFormats/Common/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" -#include "HeterogeneousCore/CUDACore/interface/CUDA.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" #include "TestCUDAProducerGPUKernel.h" diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index 840ff8253c353..af9c55a3b78aa 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -6,7 +6,7 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" diff --git a/HeterogeneousCore/CUDATest/src/classes.h b/HeterogeneousCore/CUDATest/src/classes.h index b028864460fe8..333529467d409 100644 --- a/HeterogeneousCore/CUDATest/src/classes.h +++ b/HeterogeneousCore/CUDATest/src/classes.h @@ -1,3 +1,3 @@ #include "DataFormats/Common/interface/Wrapper.h" -#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDA.h" 
#include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index 0ab921029d1ac..4d2eb7c68aebc 100644 --- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -2,7 +2,7 @@ #include "FWCore/TestProcessor/interface/TestProcessor.h" #include "FWCore/Utilities/interface/Exception.h" -#include "HeterogeneousCore/CUDACore/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" From 2e792c09f98f610e8fff812571c6decec4caee24 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 12 Dec 2018 18:50:13 +0100 Subject: [PATCH 13/49] Remove CUDAStreamEDProducer as obsolete --- .../CUDACore/interface/CUDAStreamEDProducer.h | 46 ------------------- 1 file changed, 46 deletions(-) delete mode 100644 HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h diff --git a/HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h b/HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h deleted file mode 100644 index f825cacb91567..0000000000000 --- a/HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef HeterogeneousCore_CUDACore_CUDAStreamEDProducer_h -#define HeterogeneousCore_CUDACore_CUDAStreamEDProducer_h - -#include "FWCore/Framework/interface/stream/EDProducer.h" -#include "FWCore/ServiceRegistry/interface/Service.h" -#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" - -#include - -/** - * This class is a bit hacky but intended only for a transition - * period. It also duplicates the EDM stream -> CUDA device assignment. 
- */ -template -class CUDAStreamEDProducer: public edm::stream::EDProducer { -private: - void beginStream(edm::StreamID id) override final { - // The following checks only from CUDAService whether it is - // enabled or not. Also CUDADeviceChooser can be configured to be - // disabled, effectively disabling that "CUDA chain". - // Unfortunately we have no (easy) means here to know whether this - // EDProducer is part of such a chain. On the other hand, - // beginStream() is intended only for block memory allocations - // (and we will likely adjust the strategy), and the - // CUDADeviceChooser.enabled is intended for debugging/testing - // purposes, so maybe this solution is good enough (i.e. for - // debugging it doesn't matter if we allocate "too much") - edm::Service cudaService; - if(cudaService->enabled(id)) { - // This logic is duplicated from CUDADeviceChooser - int device = id % cudaService->numberOfDevices(); - cuda::device::current::scoped_override_t<> setDeviceForThisScope(device); - beginStreamCUDA(id); - } - } - - // It's a bit stupid to change the name, but I don't have anything - // additional to pass down. - // - // Note: contrary to HeterogeneousEDProducer+GPUCuda, the CUDA - // stream is *not* passed to the deriving class (there is no good - // place for a CUDA stream here in this design). 
- virtual void beginStreamCUDA(edm::StreamID id) = 0; -}; - -#endif From 01fc2e85a1dfa9d5ea1d263cdc0ec8edde2c346c Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 12 Dec 2018 21:02:36 +0100 Subject: [PATCH 14/49] Add emplace() to CUDAScopedContext to allow using Event::emplace() --- CUDADataFormats/Common/interface/CUDA.h | 14 ++++++++++++-- .../CUDACore/interface/CUDAScopedContext.h | 18 ++++++++++++------ .../CUDATest/plugins/TestCUDAProducerGPU.cc | 10 +++++----- .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 10 +++++----- 4 files changed, 34 insertions(+), 18 deletions(-) diff --git a/CUDADataFormats/Common/interface/CUDA.h b/CUDADataFormats/Common/interface/CUDA.h index b0c6ee965ad3b..f7a08b0751f92 100644 --- a/CUDADataFormats/Common/interface/CUDA.h +++ b/CUDADataFormats/Common/interface/CUDA.h @@ -5,6 +5,10 @@ #include +namespace edm { + template class Wrapper; +} + /** * The purpose of this class is to wrap CUDA data to edm::Event in a * way which forces correct use of various utilities. @@ -43,17 +47,23 @@ class CUDA { private: friend class CUDAScopedContext; + friend class edm::Wrapper>; // Using template to break circular dependency template - explicit CUDA(T data, const Context& ctx): + explicit CUDA(const Context& ctx, T data): stream_(ctx.streamPtr()), event_(std::make_unique(cuda::event::create(ctx.device(), cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much? cuda::event::dont_record_timings))), // it should be a bit faster to ignore timings data_(std::move(data)), device_(ctx.device()) - {} + { + // Record CUDA event to the CUDA stream. The event will become + // "occurred" after all work queued to the stream before this + // point has been finished. 
+ event_->record(stream_->id()); + } private: // The cuda::stream_t is really shared among edm::Event products, so diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index 8a59e9d0d22dc..85439e99a08e3 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -2,8 +2,10 @@ #define HeterogeneousCore_CUDACore_CUDAScopedContext_h #include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" +#include "FWCore/Framework/interface/Event.h" #include "FWCore/Utilities/interface/Exception.h" #include "FWCore/Utilities/interface/StreamID.h" +#include "FWCore/Utilities/interface/EDPutToken.h" #include "CUDADataFormats/Common/interface/CUDA.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" @@ -93,12 +95,16 @@ class CUDAScopedContext { template std::unique_ptr > wrap(T data) { // make_unique doesn't work because of private constructor - auto ret = std::unique_ptr >(new CUDA(std::move(data), *this)); - // Record CUDA event to the CUDA stream. The event will become - // "occurred" after all work queued to the stream before this - // point has been finished. - ret->event().record(stream_->id()); - return ret; + // + // CUDA constructor records CUDA event to the CUDA stream. The + // event will become "occurred" after all work queued to the + // stream before this point has been finished. + return std::unique_ptr >(new CUDA(*this, std::move(data))); + } + + template + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... 
args) { + return iEvent.emplace(token, *this, std::forward(args)...); } private: diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc index 6b07eece6f3ec..06f51bfc1b7ff 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc @@ -22,15 +22,15 @@ class TestCUDAProducerGPU: public edm::global::EDProducer<> { private: std::string label_; edm::EDGetTokenT> srcToken_; + edm::EDPutTokenT> dstToken_; TestCUDAProducerGPUKernel gpuAlgo_; }; TestCUDAProducerGPU::TestCUDAProducerGPU(const edm::ParameterSet& iConfig): label_(iConfig.getParameter("@module_label")), - srcToken_(consumes>(iConfig.getParameter("src"))) -{ - produces>(); -} + srcToken_(consumes>(iConfig.getParameter("src"))), + dstToken_(produces>()) +{} void TestCUDAProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -47,7 +47,7 @@ void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, co auto ctx = CUDAScopedContext(*hin); const CUDAThing& input = ctx.get(*hin); - iEvent.put(ctx.wrap(CUDAThing(gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())))); + ctx.emplace(iEvent, dstToken_, CUDAThing(gpuAlgo_.runAlgo(label_, input.get(), ctx.stream()))); edm::LogPrint("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index e69e5966f8995..20b9308694da4 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -24,6 +24,7 @@ class TestCUDAProducerGPUEW: public edm::stream::EDProducer { private: std::string label_; edm::EDGetTokenT> srcToken_; + edm::EDPutTokenT> dstToken_; 
TestCUDAProducerGPUKernel gpuAlgo_; CUDAContextToken ctxTmp_; edm::cuda::device::unique_ptr devicePtr_; @@ -32,10 +33,9 @@ class TestCUDAProducerGPUEW: public edm::stream::EDProducer { TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig): label_(iConfig.getParameter("@module_label")), - srcToken_(consumes>(iConfig.getParameter("src"))) -{ - produces>(); -} + srcToken_(consumes>(iConfig.getParameter("src"))), + dstToken_(produces>()) +{} void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -67,7 +67,7 @@ void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, const edm::EventSetup& i auto ctx = CUDAScopedContext(std::move(ctxTmp_)); - iEvent.put(ctx.wrap(CUDAThing(std::move(devicePtr_)))); + ctx.emplace(iEvent, dstToken_, std::move(devicePtr_)); edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } From 50b438a956c72fcb2d63350e4aa9a930ca6f3560 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 12 Dec 2018 14:15:14 -0600 Subject: [PATCH 15/49] Update documentation --- HeterogeneousCore/CUDACore/README.md | 222 ++++++++++++++------------- 1 file changed, 116 insertions(+), 106 deletions(-) diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md index 28e24f648e7eb..e59fca3e3ed22 100644 --- a/HeterogeneousCore/CUDACore/README.md +++ b/HeterogeneousCore/CUDACore/README.md @@ -13,113 +13,105 @@ to extend to other devices. It will be extended if/when it gets deployed and `HeterogeneousEDProducer` retired. 
## Sub-packages -* [`CUDACore`](#cuda-integration) CUDA-specific core components -* [`CUDAServices`](../CUDAServices) Various edm::Services related to CUDA -* [`CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code +* [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components +* [`HeterogeneousCore/CUDAServices`](../CUDAServices) Various edm::Services related to CUDA +* [`HeterogeneousCore/CUDAUtilities`](../CUDAUtilities) Various utilities for CUDA kernel code +* [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations +* [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data # CUDA integration ## Choosing device -### Dynamically between GPU and CPU +### GPU and CPU -The device choosing (CPU vs. GPU, which GPU) logic is done by an -EDFilter and using Paths in the configuration. +Currently the device type choice (CPU vs. GPU) is done at the +configuration level with `cms.Modifier`. In the near future this will +be changed to a decision made at the beginning of the job with a +[`SwitchProducer`](https://github.com/cms-sw/cmssw/pull/25439). -First, a `CUDADeviceChooserFilter` EDFilter is run. It has the logic -to device whether the following chain of EDModules should run on a -CUDA device or not, and if yes, on which CUDA device. If it decides -"yes", it returns `true` and produces a `CUDAToken`, which contains -the device id and a CUDA stream. If it decides "no", it returns -`false` and does not produce anything. - -Then, the pieces need to be put together in the configuration. The -`CUDADeviceChooserFilter` should be put as the first module on a -`cms.Path`, followed by the CUDA EDProducers (in the future it may -become sufficient to have only the first EDProducer of a chain in the -`Path`). 
-```python -process.fooCUDADeviceFilter = cms.EDFilter("CUDADeviceChooserFilter", - src = cms.InputTag("fooCUDADevice") -) -process.fooCUDA = cms.EDProducer("FooProducerCUDA") -process.fooPathCUDA = cms.Path( - process.fooCUDADeviceFilter + process.fooCUDA -) +For multi-GPU setup the device is chosen in the first CUDA module in a +chain of modules by one of the constructors of +`CUDAScopedContext` +```cpp +auto ctx = CUDAScopedContext(iEvent.streamID()); ``` +As the choice is still the static EDM stream to device assignment, the +EDM stream ID is needed. The logic will likely evolve in the future. ### Always on GPU In case the chain of modules should always be run on a GPU, the -EDFilter and Paths are not needed. In this case, a -`CUDADeviceChooserProducer` should be used to produce the `CUDAToken`. -If the machine has no GPUs or `CUDAService` is disabled, the producer -throws an exception. +configuration should be built only with the GPU modules. ## Data model -The GPU data can be a single pointer to device data, or a class/struct -containing such pointers (among other stuff). When putting the data to -event, the data is wrapped to `CUDA` template, which holds +The GPU data should be a class/struct containing smart pointer(s) to +device data (see [Memory allocation](#memory-allocation)). When +putting the data to event, the data is wrapped to `CUDA` template, +which holds * the GPU data - * must be movable, but no other restrictions (except need to be able to generate ROOT dictionaries from it) + * must be movable, but no other restrictions * the current device where the data was produced, and the CUDA stream the data was produced with * [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams) Note that the `CUDA` wrapper can be constructed only with `CUDAScopedContext::wrap()`, and the data `T` can be obtained from it -only with `CUDAScopedContext::get()`, as described further below. 
+only with `CUDAScopedContext::get()`, as described further below. When +putting the data product directly to `edm::Event`, also +`CUDAScopedContext::emplace()` can be used. ## CUDA EDProducer ### Class declaration -For time being (may disappear in the future) a CUDA producer should -inherit from `CUDAStreamEDProducer<...>`. The template parameters are -the usual -[stream producer extensions](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#Template_Arguments). -Note that contrary to `HeterogeneousEDProducer`, the `ExternalWork` -extension is **not** implied. +The CUDA producers are normal EDProducers. Contrary to +`HeterogeneousEDProducer`, the `ExternalWork` extension is **not** +required. Its use is recommended though when transferring data from +GPU to CPU. + +### Memory allocation +The memory allocations should be done dynamically with `CUDAService` ```cpp -#include "HeterogeneousCore/CUDACore/interface/CUDAStreamEDProducer.h" -class FooProducerCUDA: public CUDAStreamEDProducer<> { - ... +edm::Service cs; +edm::cuda::device::unique_ptr device_buffer = cs->make_device_unique(50, cudaStream); +edm::cuda::host::unique_ptr host_buffer = cs->make_host_unique(50, cudaStream); ``` -### Memory allocation +in the `acquire()` and `produce()` functions. The same +`cuda::stream_t<>` object that is used for transfers and kernels +should be passed to the allocator. -The only effect of the `CUDAStreamEDProducer` base class is that -`beginStream(edm::StreamID)` is replaced with -`beginStreamCUDA(edm::StreamID)`. This is done in order to set the -current CUDA device before the user code starts. **If the algorithm -has to allocate memory buffers for the duration of the whole job, the -recommended place is here.** Note that a CUDA stream is not passed to -the user code. If a CUDA stream is really needed, the developer should -create+synchronize it by him/herself.
(although if this appears to be -common practice, we should try to provide the situation somehow) +The allocator is based on `cub::CachingDeviceAllocator`. The memory is +guaranteed to be reserved +* for the host: up to the destructor of the `unique_ptr` +* for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished ### Setting the current device +A CUDA producer should construct `CUDAScopedContext` in `acquire()` +either with `edm::StreamID`, or with a `CUDA` read as an input. + A CUDA producer should read either `CUDAToken` (from `CUDADeviceChooser`) or one or more `CUDA` products. Then, in the `acquire()`/`produce()`, it should construct `CUDAScopedContext` from one of them ```cpp -// From CUDAToken -edm::Handle htoken; -iEvent.getByToken(srcToken_, htoken); -auto ctx = CUDAScopedContext(*htoken); +// From edm::StreamID +auto ctx = CUDAScopedContext(iEvent.streamID()); /// From CUDA -edm::Handle > handle; +edm::Handle> handle; iEvent.getByToken(srctoken_, handle); auto ctx = CUDAScopedContext(*handle); ``` `CUDAScopedContext` works in the RAII way and does the following -* Sets the current device (for the scope) from `CUDAToken`/`CUDA` +* Sets the current device for the current scope + - If constructed from the `edm::StreamID`, makes the device choice and creates a new CUDA stream + - If constructed from the `CUDA`, uses the same device and CUDA stream as was used to produce the `CUDA` * Gives access to the CUDA stream the algorithm should use to queue asynchronous work * Calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary * [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams) @@ -135,7 +127,7 @@ The real product (`T`) can be obtained from `CUDA` only with the help of `CUDAScopedContext`. 
```cpp -edm::Handle > hclus; +edm::Handle> hclus; iEvent.getByToken(srctoken_, hclus); GPUClusters const& clus = ctx.get(*hclus); ``` @@ -147,7 +139,7 @@ This step is needed to ### Calling the CUDA kernels -There is nothing special, except the CUDA stream can be obtained from +There is nothing special, except the CUDA stream should be obtained from the `CUDAScopedContext` ```cpp @@ -156,15 +148,20 @@ gpuAlgo.makeClustersAsync(..., ctx.stream()); ### Putting output -The GPU data needs to be wrapped to `CUDA` template with `CUDAScopedContest.wrap()` +The GPU data needs to be wrapped to `CUDA` template with +`CUDAScopedContext::wrap()` or `CUDAScopedContext::emplace()` ```cpp GPUClusters clusters = gpuAlgo.makeClustersAsync(..., ctx.stream()); -std::unique_ptr > ret = ctx.wrap(clusters); +std::unique_ptr> ret = ctx.wrap(clusters); iEvent.put(std::move(ret)); // or with one line iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream()))); + +// or avoid one unique_ptr with emplace +edm::PutTokenT> putToken_ = produces>(); // in constructor +ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream())); ``` This step is needed to @@ -176,12 +173,12 @@ This step is needed to Everything above works both with and without `ExternalWork`. Without `ExternalWork` the `EDProducer`s act similar to TBB -flowgraph's "streaming node". I.e. they just queue more asynchronous -work in their `produce()`. +flowgraph's "streaming node". In other words, they just queue more +asynchronous work in their `produce()`. The `ExternalWork` is needed when one would otherwise call -`cudeStreamSynchronize()`, e.g. transferring something to CPU needed -for downstream DQM, or to queue more asynchronous work. With +`cudaStreamSynchronize()`. For example transferring something to CPU +needed for downstream DQM, or queueing more asynchronous work. With `ExternalWork` an `acquire()` method needs to be implemented that gets an `edm::WaitingTaskWithArenaHolder` parameter.
The `WaitingTaskWithArenaHolder` should then be passed to the constructor @@ -189,7 +186,7 @@ of `CUDAScopedContext` along ```cpp void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - edm::Handle > handle; + edm::Handle> handle; iEvent.getByToken(token_, handle); auto ctx = CUDAScopedContext(*handle, std::move(waitingTaskHolder)); // can also copy instead of move if waitingTaskHolder is needed for something else as well ... @@ -200,9 +197,28 @@ function to the CUDA stream in its destructor to call `waitingTaskHolder.doneWaiting()`. A GPU->GPU producer needs a `CUDAScopedContext` also in its -`produce()`. Currently the best way is to read the input again in -`produce()` and construct the `CUDAScopedContext` from there. This -point will be improved. +`produce()`. Currently the best way is to store the state of +`CUDAScopedContext` to `CUDAContextToken` member variable: + +```cpp +class FooProducerCUDA ... { + ... + CUDAContextToken ctxTmp_; +}; + +void acquire(...) { + ... + ctxTmp_ = ctx.toToken(); +} + +void produce(...) { + ... + auto ctx = CUDAScopedContext(std::move(ctxTmp_)); +} +``` + +Ideas for improvements are welcome. + ### Transferring GPU data to CPU @@ -215,7 +231,7 @@ the `ExternalWork` needs to be used along * In `produce()` * If needed, read additional CPU products (e.g. from `edm::Ref`s) * Reformat data back to legacy data formats - * Note: `CUDAScopedContext` is **not** needed in in `produce()` + * Note: `CUDAScopedContext` is **not** needed in `produce()` ### Synchronizing between CUDA streams @@ -238,50 +254,44 @@ to-be-getted CUDA product exists. 
## Configuration +### With `cms.Modifier` + ```python -process.fooCPU = cms.EDProducer("FooProducer") # legacy CPU +process.foo = cms.EDProducer("FooProducer") # legacy CPU -process.fooCUDADevice = cms.EDProducer("CUDADeviceChooser") -process.fooCUDADeviceFilter = cms.EDFilter("CUDADeviceFilter", - src = cms.InputTag("fooCUDADevice") -) +from Configuration.ProcessModifiers.gpu_cff import gpu process.fooCUDA = cms.EDProducer("FooProducerCUDA") -process.fooFromCUDA = cms.EDProducer("FooProducerCUDAtoCPU", src="fooCUDA") -process.foo = cms.EDProducer("FooProducerFallback", - src = cms.VInputTag("fooFromCUDA", "fooCPU") -) -process.fooPathCUDA = cms.Path( - process.fooCUDADeviceFilter + process.fooCUDA +gpu.toReplaceWith(process.foo, cms.EDProducer("FooProducerFromCUDA", src="fooCUDA")) + +process.fooTaskCUDA = cms.Task(process.fooCUDA) +process.fooTask = cms.Task( + process.foo, + process.fooTaskCUDA ) -process.fooPathCPU = cms.Path( - ~process.fooCUDADeviceFilter + process.fooCPU +``` + +For a more complete example, see [here](../CUDATests/test/testCUDA_cfg.py). + +### With `SwitchProducer` + +```python +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +process.foo = SwitchProducerCUDA( + cpu = cms.EDProducer("FooProducer"), # legacy CPU + cuda = cms.EDProducer("FooProducerFromCUDA", src="fooCUDA") ) +process.fooCUDA = cms.EDProducer("FooProducerCUDA") + +process.fooTaskCUDA = cms.Task(process.fooCUDA) process.fooTask = cms.Task( - process.fooDevice, - process.fooFromCUDA, - process.foo + process.foo, + process.fooTaskCUDA ) -... ``` -For a more complete example, see [here](test/testCUDA_cfg.py). # Extension to other devices The C++ side extends in a straightforward way. One has to add classes similar to `CUDAToken`, `CUDA`, and `CUDAScopedContext`. Of course, much depends on the exact details. The python configuration side -extends as well, one "just" has to add more modules there. 
Also the -device choosing logic is also extendable -```python -process.fooCUDADevice = ... -process.fooFPGADevice = ... -process.fooPathCUDA = cms.Path( - process.fooCUDADeviceFilter + ... -) -process.fooPathFPGA = cms.Path( - ~process.fooCUDADeviceFilter + process.fooFPGADeviceFilter + ... -) -process.fooPathCPU = cms.Path( - ~process.fooCUDADeviceFilter + ~process.fooFPGADeviceFilter + ... -) -``` +extends as well. From 62de99201dd073cae16dd0ad9c66dbef7a7708d9 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 13 Dec 2018 21:23:48 +0100 Subject: [PATCH 16/49] Fix a typo --- HeterogeneousCore/CUDAServices/src/CUDAService.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index babe062f9bab2..ad94d8837f68f 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -106,7 +106,7 @@ namespace { } void hostPreallocate(CUDAService& cs, const std::vector& bufferSizes) { - preallocate([&](size_t size, cuda::stream_t<>& stream) { + preallocate([&](size_t size, cuda::stream_t<>& stream) { return cs.make_host_unique(size, stream); }, bufferSizes); } From 683ad2b2de979201636047b1ebca7824bc27916f Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 13 Dec 2018 21:35:45 +0100 Subject: [PATCH 17/49] Make device::unique_ptr and host::unique_ptr separate types --- .../Common/interface/device_unique_ptr.h | 15 +++++++++++- .../Common/interface/host_unique_ptr.h | 15 +++++++++++- .../CUDAServices/interface/CUDAService.h | 24 +++++++++---------- 3 files changed, 40 insertions(+), 14 deletions(-) diff --git a/CUDADataFormats/Common/interface/device_unique_ptr.h b/CUDADataFormats/Common/interface/device_unique_ptr.h index 1282c52125fa6..de7474d26a67a 100644 --- a/CUDADataFormats/Common/interface/device_unique_ptr.h +++ b/CUDADataFormats/Common/interface/device_unique_ptr.h @@ -7,8 +7,21 @@ 
namespace edm { namespace cuda { namespace device { + namespace impl { + // Additional layer of types to distinguish from host::unique_ptr + class DeviceDeleter { + public: + DeviceDeleter() = default; + explicit DeviceDeleter(std::function f): f_(f) {} + + void operator()(void *ptr) { f_(ptr); } + private: + std::function f_; + }; + } + template - using unique_ptr = std::unique_ptr>; + using unique_ptr = std::unique_ptr; } } } diff --git a/CUDADataFormats/Common/interface/host_unique_ptr.h b/CUDADataFormats/Common/interface/host_unique_ptr.h index c945d9b0aa027..a9eee938b165d 100644 --- a/CUDADataFormats/Common/interface/host_unique_ptr.h +++ b/CUDADataFormats/Common/interface/host_unique_ptr.h @@ -7,8 +7,21 @@ namespace edm { namespace cuda { namespace host { + namespace impl { + // Additional layer of types to distinguish from device::unique_ptr + class HostDeleter { + public: + HostDeleter() = default; + explicit HostDeleter(std::function f): f_(f) {} + + void operator()(void *ptr) { f_(ptr); } + private: + std::function f_; + }; + } + template - using unique_ptr = std::unique_ptr>; + using unique_ptr = std::unique_ptr; } } } diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index 9555321f5153a..9101c7772d9ec 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -76,9 +76,9 @@ class CUDAService { int dev = getCurrentDevice(); void *mem = allocate_device(dev, sizeof(T), stream); return typename cudaserviceimpl::make_device_unique_selector::non_array(reinterpret_cast(mem), - [this, dev](void *ptr) { - this->free_device(dev, ptr); - }); + edm::cuda::device::impl::DeviceDeleter([this, dev](void *ptr) { + this->free_device(dev, ptr); + })); } template @@ -89,9 +89,9 @@ class CUDAService { int dev = getCurrentDevice(); void *mem = allocate_device(dev, n*sizeof(element_type), stream); return typename 
cudaserviceimpl::make_device_unique_selector::unbounded_array(reinterpret_cast(mem), - [this, dev](void *ptr) { - this->free_device(dev, ptr); - }); + edm::cuda::device::impl::DeviceDeleter([this, dev](void *ptr) { + this->free_device(dev, ptr); + })); } template @@ -105,9 +105,9 @@ class CUDAService { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); void *mem = allocate_host(sizeof(T), stream); return typename cudaserviceimpl::make_host_unique_selector::non_array(reinterpret_cast(mem), - [this](void *ptr) { - this->free_host(ptr); - }); + edm::cuda::host::impl::HostDeleter([this](void *ptr) { + this->free_host(ptr); + })); } template @@ -117,9 +117,9 @@ class CUDAService { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); void *mem = allocate_host(n*sizeof(element_type), stream); return typename cudaserviceimpl::make_host_unique_selector::unbounded_array(reinterpret_cast(mem), - [this](void *ptr) { - this->free_host(ptr); - }); + edm::cuda::host::impl::HostDeleter([this](void *ptr) { + this->free_host(ptr); + })); } template From 2d9308b0245ba138840575c9fddc18fa5fb52707 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 13 Dec 2018 22:26:24 +0100 Subject: [PATCH 18/49] Add copyAsync() and memsetAsync() helper functions --- .../CUDAUtilities/interface/copyAsync.h | 43 ++++++ .../CUDAUtilities/interface/memsetAsync.h | 27 ++++ .../CUDAUtilities/test/BuildFile.xml | 4 + .../CUDAUtilities/test/copyAsync_t.cpp | 138 ++++++++++++++++++ .../CUDAUtilities/test/memsetAsync_t.cpp | 73 +++++++++ .../CUDAUtilities/test/testCatch2Main.cpp | 2 + 6 files changed, 287 insertions(+) create mode 100644 HeterogeneousCore/CUDAUtilities/interface/copyAsync.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h create mode 100644 
HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp create mode 100644 HeterogeneousCore/CUDAUtilities/test/testCatch2Main.cpp diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h new file mode 100644 index 0000000000000..3469e1a0dfd90 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h @@ -0,0 +1,43 @@ +#ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h +#define HeterogeneousCore_CUDAUtilities_copyAsync_h + +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" + +#include + +#include + +namespace cudautils { + // Single element + template + inline + void copyAsync(edm::cuda::device::unique_ptr& dst, edm::cuda::host::unique_ptr& src, cuda::stream_t<>& stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); + cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream.id()); + } + + template + inline + void copyAsync(edm::cuda::host::unique_ptr& dst, edm::cuda::device::unique_ptr& src, cuda::stream_t<>& stream) { + static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); + cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream.id()); + } + + // Multiple elements + template + inline + void copyAsync(edm::cuda::device::unique_ptr& dst, edm::cuda::host::unique_ptr& src, size_t nelements, cuda::stream_t<>& stream) { + cuda::memory::async::copy(dst.get(), src.get(), nelements*sizeof(T), stream.id()); + } + + template + inline + void copyAsync(edm::cuda::host::unique_ptr& dst, edm::cuda::device::unique_ptr& src, size_t nelements, cuda::stream_t<>& stream) { + 
cuda::memory::async::copy(dst.get(), src.get(), nelements*sizeof(T), stream.id()); + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h new file mode 100644 index 0000000000000..689a997a90936 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -0,0 +1,27 @@ +#ifndef HeterogeneousCore_CUDAUtilities_memsetAsync_h +#define HeterogeneousCore_CUDAUtilities_memsetAsync_h + +#include "CUDADataFormats/Common/interface/device_unique_ptr.h" + +#include + +#include + +namespace cudautils { + template + inline + void memsetAsync(edm::cuda::device::unique_ptr& ptr, T value, cuda::stream_t<>& stream) { + // Shouldn't compile for array types because of sizeof(T), but + // let's add an assert with a more helpful message + static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); + cuda::memory::device::async::set(ptr.get(), value, sizeof(T), stream.id()); + } + + template + inline + void memsetAsync(edm::cuda::device::unique_ptr& ptr, T value, size_t nelements, cuda::stream_t<>& stream) { + cuda::memory::device::async::set(ptr.get(), value, nelements*sizeof(T), stream.id()); + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml index d3f8d77a35d32..f6d6a4bb7e594 100644 --- a/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml +++ b/HeterogeneousCore/CUDAUtilities/test/BuildFile.xml @@ -66,3 +66,7 @@ + + + + diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp new file mode 100644 index 0000000000000..385e2925ac923 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -0,0 +1,138 @@ +#include "catch.hpp" + +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include 
"FWCore/ServiceRegistry/interface/ActivityRegistry.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" + +namespace { + CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) { + auto desc = edm::ConfigurationDescriptions("Service", "CUDAService"); + CUDAService::fillDescriptions(desc); + desc.validate(ps, "CUDAService"); + return CUDAService(ps, ar); + } +} + +TEST_CASE("copyAsync", "[cudaMemTools]") { + int deviceCount = 0; + auto ret = cudaGetDeviceCount( &deviceCount ); + if( ret != cudaSuccess ) { + WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" + << ret << ") " << cudaGetErrorString( ret ) + << ". Ignoring tests requiring device to be present."); + return; + } + + edm::ActivityRegistry ar; + edm::ParameterSet ps; + auto cs = makeCUDAService(ps, ar); + + auto current_device = cuda::device::current::get(); + auto stream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + + SECTION("Host to device") { + SECTION("Single element") { + auto host_orig = cs.make_host_unique(stream); + *host_orig = 42; + + auto device = cs.make_device_unique(stream); + auto host = cs.make_host_unique(stream); + + cudautils::copyAsync(device, host_orig, stream); + cuda::memory::async::copy(host.get(), device.get(), sizeof(int), stream.id()); + stream.synchronize(); + + REQUIRE(*host == 42); + } + + SECTION("Multiple elements") { + constexpr int N = 100; + + auto host_orig = cs.make_host_unique(N, stream); + for(int i=0; i(N, stream); + auto host = cs.make_host_unique(N, stream); + + SECTION("Copy all") { + cudautils::copyAsync(device, host_orig, N, stream); + cuda::memory::async::copy(host.get(), device.get(), N*sizeof(int), stream.id()); + stream.synchronize(); + for(int i=0; i(stream); + *host_orig = 42; + + auto device = cs.make_device_unique(stream); + auto host = cs.make_host_unique(stream); + + 
cuda::memory::async::copy(device.get(), host_orig.get(), sizeof(int), stream.id()); + cudautils::copyAsync(host, device, stream); + stream.synchronize(); + + REQUIRE(*host == 42); + } + + SECTION("Multiple elements") { + constexpr int N = 100; + + auto host_orig = cs.make_host_unique(N, stream); + for(int i=0; i(N, stream); + auto host = cs.make_host_unique(N, stream); + + SECTION("Copy all") { + cuda::memory::async::copy(device.get(), host_orig.get(), N*sizeof(int), stream.id()); + cudautils::copyAsync(host, device, N, stream); + stream.synchronize(); + for(int i=0; i(stream); + *host_orig = 42; + + auto device = cs.make_device_unique(stream); + auto host = cs.make_host_unique(stream); + cudautils::copyAsync(device, host_orig, stream); + cudautils::memsetAsync(device, 0, stream); + cudautils::copyAsync(host, device, stream); + stream.synchronize(); + + REQUIRE(*host == 0); + } + + SECTION("Multiple elements") { + constexpr int N = 100; + + auto host_orig = cs.make_host_unique(N, stream); + for(int i=0; i(N, stream); + auto host = cs.make_host_unique(N, stream); + cudautils::copyAsync(device, host_orig, N, stream); + cudautils::memsetAsync(device, 0, N, stream); + cudautils::copyAsync(host, device, N, stream); + stream.synchronize(); + + for(int i=0; i < N; ++i) { + CHECK(host[i] == 0); + } + } + + //Fake the end-of-job signal. 
+ ar.postEndJobSignal_(); +} + diff --git a/HeterogeneousCore/CUDAUtilities/test/testCatch2Main.cpp b/HeterogeneousCore/CUDAUtilities/test/testCatch2Main.cpp new file mode 100644 index 0000000000000..0c7c351f437f5 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/test/testCatch2Main.cpp @@ -0,0 +1,2 @@ +#define CATCH_CONFIG_MAIN +#include "catch.hpp" From 653bd664e3d3582836a2b65c21489220dde8e434 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 14 Dec 2018 23:21:39 +0100 Subject: [PATCH 19/49] Add reset() to CUDAContextToken --- HeterogeneousCore/CUDACore/interface/CUDAContextToken.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h index 1a599132d13f1..c9b1afe8f3ca1 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h @@ -18,6 +18,11 @@ class CUDAContextToken { CUDAContextToken(CUDAContextToken&&) = default; CUDAContextToken& operator=(CUDAContextToken&& other) = default; + void reset() { + stream_.reset(); + device_ = -1; + } + private: friend class CUDAScopedContext; From 92a1febe50f6694cfbfc31acececd1b202461e54 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 13 Dec 2018 00:21:31 +0100 Subject: [PATCH 20/49] Reorganize Raw2Cluster --- CUDADataFormats/SiPixelCluster/BuildFile.xml | 1 + .../interface/SiPixelClustersCUDA.h | 17 +- .../SiPixelCluster/src/SiPixelClustersCUDA.cc | 15 +- CUDADataFormats/SiPixelCluster/src/classes.h | 8 + .../SiPixelCluster/src/classes_def.xml | 4 + CUDADataFormats/SiPixelDigi/BuildFile.xml | 2 + .../SiPixelDigi/interface/SiPixelDigisCUDA.h | 59 +- .../SiPixelDigi/src/SiPixelDigisCUDA.cc | 87 ++- CUDADataFormats/SiPixelDigi/src/classes.h | 8 + .../SiPixelDigi/src/classes_def.xml | 4 + .../StandardSequences/python/RawToDigi_cff.py | 8 +- .../SiPixelDigi/interface/PixelErrors.h | 21 + 
.../SiPixelDigi/interface/SiPixelDigisSoA.h | 46 ++ .../SiPixelDigi/src/SiPixelDigisSoA.cc | 23 + DataFormats/SiPixelDigi/src/classes.h | 1 + DataFormats/SiPixelDigi/src/classes_def.xml | 3 + .../SiPixelRawToDigi/plugins/BuildFile.xml | 3 + .../plugins/SiPixelDigisFromSoA.cc | 206 +++++ .../plugins/SiPixelDigisSoAFromCUDA.cc | 111 +++ .../python/SiPixelRawToDigi_cfi.py | 6 +- .../python/siPixelDigis_cff.py | 16 + .../CUDAUtilities/interface/copyAsync.h | 8 +- .../CUDAUtilities/interface/memsetAsync.h | 8 +- .../python/RecoLocalTracker_cff.py | 4 +- .../plugins/SiPixelClustersFromSoA.cc | 160 ++++ .../plugins/SiPixelRawToClusterCUDA.cc | 235 ++++++ .../plugins/SiPixelRawToClusterGPUKernel.cu | 95 +-- .../plugins/SiPixelRawToClusterGPUKernel.h | 83 +- .../SiPixelRawToClusterHeterogeneous.cc | 739 ------------------ .../siPixelRawToClusterHeterogeneousProduct.h | 47 -- .../SiPixelClusterizerPreSplitting_cfi.py | 6 +- .../python/siPixelClustersPreSplitting_cff.py | 18 + .../SiPixelRecHits/plugins/PixelRecHits.cu | 21 +- .../SiPixelRecHits/plugins/PixelRecHits.h | 6 +- .../plugins/SiPixelRecHitHeterogeneous.cc | 60 +- .../customizePixelTracksForProfiling.py | 2 - .../plugins/ClusterSLOnGPU.cu | 10 +- .../plugins/ClusterSLOnGPU.h | 5 +- .../ClusterTPAssociationHeterogeneous.cc | 34 +- 39 files changed, 1159 insertions(+), 1031 deletions(-) create mode 100644 CUDADataFormats/SiPixelCluster/src/classes.h create mode 100644 CUDADataFormats/SiPixelCluster/src/classes_def.xml create mode 100644 CUDADataFormats/SiPixelDigi/src/classes.h create mode 100644 CUDADataFormats/SiPixelDigi/src/classes_def.xml create mode 100644 DataFormats/SiPixelDigi/interface/PixelErrors.h create mode 100644 DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h create mode 100644 DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc create mode 100644 EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc create mode 100644 EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc create mode 
100644 EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py create mode 100644 RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc create mode 100644 RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc delete mode 100644 RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc delete mode 100644 RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h create mode 100644 RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py diff --git a/CUDADataFormats/SiPixelCluster/BuildFile.xml b/CUDADataFormats/SiPixelCluster/BuildFile.xml index 21c527e7b2f0d..d34658faa2573 100644 --- a/CUDADataFormats/SiPixelCluster/BuildFile.xml +++ b/CUDADataFormats/SiPixelCluster/BuildFile.xml @@ -1,6 +1,7 @@ + diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index ca8a75d178b6c..5a780f5b70f65 100644 --- a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -2,13 +2,14 @@ #define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h #include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include class SiPixelClustersCUDA { public: SiPixelClustersCUDA() = default; - explicit SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream); + explicit SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream); ~SiPixelClustersCUDA() = default; SiPixelClustersCUDA(const SiPixelClustersCUDA&) = delete; @@ -16,20 +17,23 @@ class SiPixelClustersCUDA { SiPixelClustersCUDA(SiPixelClustersCUDA&&) = default; SiPixelClustersCUDA& operator=(SiPixelClustersCUDA&&) = default; + void setNClusters(uint32_t nClusters) { + nClusters_h = nClusters; + } + + uint32_t nClusters() const { return nClusters_h; } + uint32_t 
*moduleStart() { return moduleStart_d.get(); } - int32_t *clus() { return clus_d.get(); } uint32_t *clusInModule() { return clusInModule_d.get(); } uint32_t *moduleId() { return moduleId_d.get(); } uint32_t *clusModuleStart() { return clusModuleStart_d.get(); } uint32_t const *moduleStart() const { return moduleStart_d.get(); } - int32_t const *clus() const { return clus_d.get(); } uint32_t const *clusInModule() const { return clusInModule_d.get(); } uint32_t const *moduleId() const { return moduleId_d.get(); } uint32_t const *clusModuleStart() const { return clusModuleStart_d.get(); } uint32_t const *c_moduleStart() const { return moduleStart_d.get(); } - int32_t const *c_clus() const { return clus_d.get(); } uint32_t const *c_clusInModule() const { return clusInModule_d.get(); } uint32_t const *c_moduleId() const { return moduleId_d.get(); } uint32_t const *c_clusModuleStart() const { return clusModuleStart_d.get(); } @@ -40,7 +44,6 @@ class SiPixelClustersCUDA { #ifdef __CUDACC__ __device__ __forceinline__ uint32_t moduleStart(int i) const { return __ldg(moduleStart_+i); } - __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); } __device__ __forceinline__ uint32_t clusInModule(int i) const { return __ldg(clusInModule_+i); } __device__ __forceinline__ uint32_t moduleId(int i) const { return __ldg(moduleId_+i); } __device__ __forceinline__ uint32_t clusModuleStart(int i) const { return __ldg(clusModuleStart_+i); } @@ -50,7 +53,6 @@ class SiPixelClustersCUDA { private: uint32_t const *moduleStart_; - int32_t const *clus_; uint32_t const *clusInModule_; uint32_t const *moduleId_; uint32_t const *clusModuleStart_; @@ -60,7 +62,6 @@ class SiPixelClustersCUDA { private: edm::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module - edm::cuda::device::unique_ptr clus_d; // cluster id of each pixel edm::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module edm::cuda::device::unique_ptr 
moduleId_d; // module id of each module @@ -68,6 +69,8 @@ class SiPixelClustersCUDA { edm::cuda::device::unique_ptr clusModuleStart_d; edm::cuda::device::unique_ptr view_d; // "me" pointer + + uint32_t nClusters_h; }; #endif diff --git a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc index 7363c2fd364af..d88a1b0a6370b 100644 --- a/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc +++ b/CUDADataFormats/SiPixelCluster/src/SiPixelClustersCUDA.cc @@ -2,23 +2,22 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -SiPixelClustersCUDA::SiPixelClustersCUDA(size_t feds, size_t nelements, cuda::stream_t<>& stream) { +SiPixelClustersCUDA::SiPixelClustersCUDA(size_t maxClusters, cuda::stream_t<>& stream) { edm::Service cs; - moduleStart_d = cs->make_device_unique(nelements+1, stream); - clus_d = cs->make_device_unique< int32_t[]>(feds, stream); - clusInModule_d = cs->make_device_unique(nelements, stream); - moduleId_d = cs->make_device_unique(nelements, stream); - clusModuleStart_d = cs->make_device_unique(nelements+1, stream); + moduleStart_d = cs->make_device_unique(maxClusters+1, stream); + clusInModule_d = cs->make_device_unique(maxClusters, stream); + moduleId_d = cs->make_device_unique(maxClusters, stream); + clusModuleStart_d = cs->make_device_unique(maxClusters+1, stream); auto view = cs->make_host_unique(stream); view->moduleStart_ = moduleStart_d.get(); - view->clus_ = clus_d.get(); view->clusInModule_ = clusInModule_d.get(); view->moduleId_ = moduleId_d.get(); view->clusModuleStart_ = clusModuleStart_d.get(); view_d = cs->make_device_unique(stream); - cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id()); + cudautils::copyAsync(view_d, view, stream); } diff --git a/CUDADataFormats/SiPixelCluster/src/classes.h 
b/CUDADataFormats/SiPixelCluster/src/classes.h new file mode 100644 index 0000000000000..c04bec77a3d02 --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/src/classes.h @@ -0,0 +1,8 @@ +#ifndef CUDADataFormats_SiPixelCluster_classes_h +#define CUDADataFormats_SiPixelCluster_classes_h + +#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/SiPixelCluster/src/classes_def.xml b/CUDADataFormats/SiPixelCluster/src/classes_def.xml new file mode 100644 index 0000000000000..562783f0db308 --- /dev/null +++ b/CUDADataFormats/SiPixelCluster/src/classes_def.xml @@ -0,0 +1,4 @@ + + + + diff --git a/CUDADataFormats/SiPixelDigi/BuildFile.xml b/CUDADataFormats/SiPixelDigi/BuildFile.xml index 259aa9f08d054..29ec13098819c 100644 --- a/CUDADataFormats/SiPixelDigi/BuildFile.xml +++ b/CUDADataFormats/SiPixelDigi/BuildFile.xml @@ -1,6 +1,8 @@ + + diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h index 66ca680effd19..9f7b2d8e62178 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -2,14 +2,17 @@ #define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h #include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" +#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" #include "FWCore/Utilities/interface/propagate_const.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include class SiPixelDigisCUDA { public: SiPixelDigisCUDA() = default; - explicit SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream); + explicit SiPixelDigisCUDA(size_t maxFedWords, bool includeErrors, cuda::stream_t<>& stream); ~SiPixelDigisCUDA() = default; SiPixelDigisCUDA(const 
SiPixelDigisCUDA&) = delete; @@ -17,21 +20,55 @@ class SiPixelDigisCUDA { SiPixelDigisCUDA(SiPixelDigisCUDA&&) = default; SiPixelDigisCUDA& operator=(SiPixelDigisCUDA&&) = default; + void setNModulesDigis(uint32_t nModules, uint32_t nDigis) { + nModules_h = nModules; + nDigis_h = nDigis; + } + + uint32_t nModules() const { return nModules_h; } + uint32_t nDigis() const { return nDigis_h; } + + void setFormatterErrors(const PixelFormatterErrors& err) { formatterErrors_h = err; } + bool hasErrors() const { return hasErrors_h; } + const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } + uint16_t * xx() { return xx_d.get(); } uint16_t * yy() { return yy_d.get(); } uint16_t * adc() { return adc_d.get(); } uint16_t * moduleInd() { return moduleInd_d.get(); } + int32_t * clus() { return clus_d.get(); } + uint32_t * pdigi() { return pdigi_d.get(); } + uint32_t * rawIdArr() { return rawIdArr_d.get(); } + GPU::SimpleVector *error() { return error_d.get(); } uint16_t const *xx() const { return xx_d.get(); } uint16_t const *yy() const { return yy_d.get(); } uint16_t const *adc() const { return adc_d.get(); } uint16_t const *moduleInd() const { return moduleInd_d.get(); } + int32_t const *clus() const { return clus_d.get(); } + uint32_t const *pdigi() const { return pdigi_d.get(); } + uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } + GPU::SimpleVector const *error() const { return error_d.get(); } uint16_t const *c_xx() const { return xx_d.get(); } uint16_t const *c_yy() const { return yy_d.get(); } uint16_t const *c_adc() const { return adc_d.get(); } uint16_t const *c_moduleInd() const { return moduleInd_d.get(); } + int32_t const *c_clus() const { return clus_d.get(); } + uint32_t const *c_pdigi() const { return pdigi_d.get(); } + uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } + GPU::SimpleVector const *c_error() const { return error_d.get(); } + + edm::cuda::host::unique_ptr adcToHostAsync(cuda::stream_t<>& stream) 
const; + edm::cuda::host::unique_ptr< int32_t[]> clusToHostAsync(cuda::stream_t<>& stream) const; + edm::cuda::host::unique_ptr pdigiToHostAsync(cuda::stream_t<>& stream) const; + edm::cuda::host::unique_ptr rawIdArrToHostAsync(cuda::stream_t<>& stream) const; + + using HostDataError = std::pair, GPU::SimpleVector const *>; + HostDataError dataErrorToHostAsync(cuda::stream_t<>& stream) const; + void copyErrorToHostAsync(cuda::stream_t<>& stream); + class DeviceConstView { public: DeviceConstView() = default; @@ -41,6 +78,7 @@ class SiPixelDigisCUDA { __device__ __forceinline__ uint16_t yy(int i) const { return __ldg(yy_+i); } __device__ __forceinline__ uint16_t adc(int i) const { return __ldg(adc_+i); } __device__ __forceinline__ uint16_t moduleInd(int i) const { return __ldg(moduleInd_+i); } + __device__ __forceinline__ int32_t clus(int i) const { return __ldg(clus_+i); } #endif friend class SiPixelDigisCUDA; @@ -50,16 +88,35 @@ class SiPixelDigisCUDA { uint16_t const *yy_; uint16_t const *adc_; uint16_t const *moduleInd_; + int32_t const *clus_; }; const DeviceConstView *view() const { return view_d.get(); } private: + // These are consumed by downstream device code edm::cuda::device::unique_ptr xx_d; // local coordinates of each pixel edm::cuda::device::unique_ptr yy_d; // edm::cuda::device::unique_ptr adc_d; // ADC of each pixel edm::cuda::device::unique_ptr moduleInd_d; // module id of each pixel + edm::cuda::device::unique_ptr clus_d; // cluster id of each pixel edm::cuda::device::unique_ptr view_d; // "me" pointer + + // These are for CPU output; should we (eventually) place them to a + // separate product? + edm::cuda::device::unique_ptr pdigi_d; + edm::cuda::device::unique_ptr rawIdArr_d; + + // These are for error CPU output; should we (eventually) place them + // to a separate product? 
+ edm::cuda::device::unique_ptr data_d; + edm::cuda::device::unique_ptr> error_d; + edm::cuda::host::unique_ptr> error_h; + PixelFormatterErrors formatterErrors_h; + + uint32_t nModules_h = 0; + uint32_t nDigis_h = 0; + bool hasErrors_h; }; #endif diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc index 7e3d876ac8bdc..1f8f782c0a0e2 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -2,24 +2,93 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" -#include - -SiPixelDigisCUDA::SiPixelDigisCUDA(size_t nelements, cuda::stream_t<>& stream) { +SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, bool includeErrors, cuda::stream_t<>& stream): + hasErrors_h(includeErrors) +{ edm::Service cs; - xx_d = cs->make_device_unique(nelements, stream); - yy_d = cs->make_device_unique(nelements, stream); - adc_d = cs->make_device_unique(nelements, stream); - moduleInd_d = cs->make_device_unique(nelements, stream); + xx_d = cs->make_device_unique(maxFedWords, stream); + yy_d = cs->make_device_unique(maxFedWords, stream); + adc_d = cs->make_device_unique(maxFedWords, stream); + moduleInd_d = cs->make_device_unique(maxFedWords, stream); + clus_d = cs->make_device_unique< int32_t[]>(maxFedWords, stream); + + pdigi_d = cs->make_device_unique(maxFedWords, stream); + rawIdArr_d = cs->make_device_unique(maxFedWords, stream); auto view = cs->make_host_unique(stream); view->xx_ = xx_d.get(); view->yy_ = yy_d.get(); view->adc_ = adc_d.get(); view->moduleInd_ = moduleInd_d.get(); + view->clus_ = clus_d.get(); view_d = cs->make_device_unique(stream); - 
cudaCheck(cudaMemcpyAsync(view_d.get(), view.get(), sizeof(DeviceConstView), cudaMemcpyDefault, stream.id())); + cudautils::copyAsync(view_d, view, stream); + + if(includeErrors) { + error_d = cs->make_device_unique>(stream); + data_d = cs->make_device_unique(maxFedWords, stream); + + cudautils::memsetAsync(data_d, 0x00, maxFedWords, stream); + + error_h = cs->make_host_unique>(stream); + GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); + assert(error_h->size() == 0); + assert(error_h->capacity() == static_cast(maxFedWords)); + + cudautils::copyAsync(error_d, error_h, stream); + } +} + +edm::cuda::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, adc_d, nDigis(), stream); + return ret; +} + +edm::cuda::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, clus_d, nDigis(), stream); + return ret; +} + +edm::cuda::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, pdigi_d, nDigis(), stream); + return ret; +} + +edm::cuda::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + auto ret = cs->make_host_unique(nDigis(), stream); + cudautils::copyAsync(ret, rawIdArr_d, nDigis(), stream); + return ret; +} + +void SiPixelDigisCUDA::copyErrorToHostAsync(cuda::stream_t<>& stream) { + cudautils::copyAsync(error_h, error_d, stream); +} + + +SiPixelDigisCUDA::HostDataError SiPixelDigisCUDA::dataErrorToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + // On one hand size() could be sufficient. 
On the other hand, if + // someone copies the SimpleVector<>, (s)he might expect the data + // buffer to actually have space for capacity() elements. + auto data = cs->make_host_unique(error_h->capacity(), stream); + + // but transfer only the required amount + if(error_h->size() > 0) { + cudautils::copyAsync(data, data_d, error_h->size(), stream); + } + error_h->set_data(data.get()); + return HostDataError(std::move(data), error_h.get()); } + diff --git a/CUDADataFormats/SiPixelDigi/src/classes.h b/CUDADataFormats/SiPixelDigi/src/classes.h new file mode 100644 index 0000000000000..09676c8fdc2f5 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/classes.h @@ -0,0 +1,8 @@ +#ifndef CUDADataFormats_SiPixelDigi_classes_h +#define CUDADataFormats_SiPixelDigi_classes_h + +#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "DataFormats/Common/interface/Wrapper.h" + +#endif diff --git a/CUDADataFormats/SiPixelDigi/src/classes_def.xml b/CUDADataFormats/SiPixelDigi/src/classes_def.xml new file mode 100644 index 0000000000000..92a8949b12cba --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/classes_def.xml @@ -0,0 +1,4 @@ + + + + diff --git a/Configuration/StandardSequences/python/RawToDigi_cff.py b/Configuration/StandardSequences/python/RawToDigi_cff.py index 96aaeebbfaacd..4830da7dbf25a 100644 --- a/Configuration/StandardSequences/python/RawToDigi_cff.py +++ b/Configuration/StandardSequences/python/RawToDigi_cff.py @@ -3,7 +3,7 @@ # This object is used to selectively make changes for different running # scenarios. In this case it makes changes for Run 2. 
-from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import * +from EventFilter.SiPixelRawToDigi.siPixelDigis_cff import * from EventFilter.SiStripRawToDigi.SiStripDigis_cfi import * @@ -49,7 +49,7 @@ from EventFilter.CTPPSRawToDigi.ctppsRawToDigi_cff import * RawToDigiTask = cms.Task(L1TRawToDigiTask, - siPixelDigis, + siPixelDigisTask, siStripDigis, ecalDigis, ecalPreshowerDigis, @@ -64,10 +64,10 @@ ) RawToDigi = cms.Sequence(RawToDigiTask) -RawToDigiTask_noTk = RawToDigiTask.copyAndExclude([siPixelDigis, siStripDigis]) +RawToDigiTask_noTk = RawToDigiTask.copyAndExclude([siPixelDigisTask, siStripDigis]) RawToDigi_noTk = cms.Sequence(RawToDigiTask_noTk) -RawToDigiTask_pixelOnly = cms.Task(siPixelDigis) +RawToDigiTask_pixelOnly = cms.Task(siPixelDigisTask) RawToDigi_pixelOnly = cms.Sequence(RawToDigiTask_pixelOnly) scalersRawToDigi.scalersInputTag = 'rawDataCollector' diff --git a/DataFormats/SiPixelDigi/interface/PixelErrors.h b/DataFormats/SiPixelDigi/interface/PixelErrors.h new file mode 100644 index 0000000000000..5231b7d1f372a --- /dev/null +++ b/DataFormats/SiPixelDigi/interface/PixelErrors.h @@ -0,0 +1,21 @@ +#ifndef DataFormats_SiPixelDigi_PixelErrors_h +#define DataFormats_SiPixelDigi_PixelErrors_h + +#include "DataFormats/SiPixelRawData/interface/SiPixelRawDataError.h" +#include "FWCore/Utilities/interface/typedefs.h" + +#include +#include + +// Better ideas for the placement of these? 
+ +struct PixelErrorCompact { + uint32_t rawId; + uint32_t word; + unsigned char errorType; + unsigned char fedId; +}; + +using PixelFormatterErrors = std::map>; + +#endif diff --git a/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h new file mode 100644 index 0000000000000..916dfbcf28136 --- /dev/null +++ b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h @@ -0,0 +1,46 @@ +#ifndef DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h +#define DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h + +#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" + +#include +#include + +class SiPixelDigisSoA { +public: + SiPixelDigisSoA() = default; + explicit SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus); + explicit SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus, + size_t nErrors, const PixelErrorCompact *error, const PixelFormatterErrors *err); + ~SiPixelDigisSoA() = default; + + auto size() const { return pdigi_.size(); } + auto errorSize() const { return error_.size(); } + + bool hasError() const { return hasError_; } + const PixelFormatterErrors *formatterErrors() const { return formatterErrors_; } + + uint32_t pdigi(size_t i) const { return pdigi_[i]; } + uint32_t rawIdArr(size_t i) const { return rawIdArr_[i]; } + uint16_t adc(size_t i) const { return adc_[i]; } + int32_t clus(size_t i) const { return clus_[i]; } + const PixelErrorCompact& error(size_t i) const { return error_[i]; } + + const std::vector& pdigiVector() const { return pdigi_; } + const std::vector& rawIdArrVector() const { return rawIdArr_; } + const std::vector& adcVector() const { return adc_; } + const std::vector& clusVector() const { return clus_; } + const std::vector& errorVector() const { return error_; } + +private: + std::vector pdigi_; + std::vector rawIdArr_; + std::vector adc_; + 
std::vector clus_; + + std::vector error_; + const PixelFormatterErrors *formatterErrors_ = nullptr; + bool hasError_ = false; +}; + +#endif diff --git a/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc new file mode 100644 index 0000000000000..2d2e6b7eadfe8 --- /dev/null +++ b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc @@ -0,0 +1,23 @@ +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" + +#include +#include + +SiPixelDigisSoA::SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus): + pdigi_(pdigi, pdigi+nDigis), + rawIdArr_(rawIdArr, rawIdArr+nDigis), + adc_(adc, adc+nDigis), + clus_(clus, clus+nDigis) +{ + assert(pdigi_.size() == nDigis); +} + +SiPixelDigisSoA::SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus, + size_t nErrors, const PixelErrorCompact *error, const PixelFormatterErrors *err): + SiPixelDigisSoA(nDigis, pdigi, rawIdArr, adc, clus) +{ + error_.resize(nErrors); + std::copy(error, error+nErrors, error_.begin()); + formatterErrors_ = err; + hasError_ = true; +} diff --git a/DataFormats/SiPixelDigi/src/classes.h b/DataFormats/SiPixelDigi/src/classes.h index 0c6a09d852959..bd93840de0d4d 100644 --- a/DataFormats/SiPixelDigi/src/classes.h +++ b/DataFormats/SiPixelDigi/src/classes.h @@ -5,6 +5,7 @@ #include "DataFormats/SiPixelDigi/interface/PixelDigiCollection.h" #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigi.h" #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigiError.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" #include "DataFormats/Common/interface/Wrapper.h" #include "DataFormats/Common/interface/DetSetVector.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" diff --git a/DataFormats/SiPixelDigi/src/classes_def.xml b/DataFormats/SiPixelDigi/src/classes_def.xml index de7779a5c00ea..e6bc08de161fa 
100755 --- a/DataFormats/SiPixelDigi/src/classes_def.xml +++ b/DataFormats/SiPixelDigi/src/classes_def.xml @@ -49,4 +49,7 @@ + + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml index f92aa68373927..4d2b5ebf45542 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml +++ b/EventFilter/SiPixelRawToDigi/plugins/BuildFile.xml @@ -1,4 +1,7 @@ + + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc new file mode 100644 index 0000000000000..ddf20bd578430 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc @@ -0,0 +1,206 @@ +#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" +#include "DataFormats/Common/interface/DetSetVector.h" +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/DetId/interface/DetIdCollection.h" +#include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h" +#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" +#include "FWCore/Framework/interface/ESTransientHandle.h" +#include "FWCore/Framework/interface/ESWatcher.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" + +#include + +class SiPixelDigisFromSoA: public edm::stream::EDProducer<> { +public: + explicit SiPixelDigisFromSoA(const 
edm::ParameterSet& iConfig); + ~SiPixelDigisFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT digiSoAGetToken_; + edm::EDGetTokenT> digiGetToken_; // for a copy + + edm::EDPutTokenT> digiPutToken_; + edm::EDPutTokenT> errorPutToken_; + edm::EDPutTokenT tkErrorPutToken_; + edm::EDPutTokenT userErrorPutToken_; + edm::EDPutTokenT> disabledChannelPutToken_; + + edm::ESWatcher cablingWatcher_; + std::unique_ptr cabling_; + const std::string cablingMapLabel_; + + const std::vector tkerrorlist_; + const std::vector usererrorlist_; + + const bool includeErrors_; + const bool usePhase1_; +}; + +SiPixelDigisFromSoA::SiPixelDigisFromSoA(const edm::ParameterSet& iConfig): + digiGetToken_(consumes>(iConfig.getParameter("digiSrc"))), + digiPutToken_(produces>()), + cablingMapLabel_(iConfig.getParameter("CablingMapLabel")), + tkerrorlist_(iConfig.getParameter>("ErrorList")), + usererrorlist_(iConfig.getParameter>("UserErrorList")), + includeErrors_(iConfig.getParameter("IncludeErrors")), + usePhase1_(iConfig.getParameter ("UsePhase1")) +{ + if(includeErrors_) { + digiSoAGetToken_ = consumes(iConfig.getParameter("digiSoASrc")); + errorPutToken_ = produces>(); + tkErrorPutToken_ = produces(); + userErrorPutToken_ = produces("UserErrorModules"); + disabledChannelPutToken_ = produces>(); + } +} + +void SiPixelDigisFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("digiSrc", edm::InputTag("siPixelClusters")); + desc.add("digiSoASrc", edm::InputTag("siPixelDigisSoA")); + desc.add("CablingMapLabel","")->setComment("CablingMap label"); + desc.add("IncludeErrors", true); + desc.add("UsePhase1",false)->setComment("## Use phase1"); + desc.add >("ErrorList", std::vector{29})->setComment("## ErrorList: list of error codes used by tracking to invalidate 
modules"); + desc.add >("UserErrorList", std::vector{40})->setComment("## UserErrorList: list of error codes used by Pixel experts for investigation"); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigisFromSoA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // How could we avoid the copy? + edm::Handle> hdigi; + iEvent.getByToken(digiGetToken_, hdigi); + iEvent.emplace(digiPutToken_, *hdigi); + + // pack errors into collection + if (includeErrors_) { + // initialize cabling map or update if necessary + if (cablingWatcher_.check(iSetup)) { + // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) + edm::ESTransientHandle cablingMap; + iSetup.get().get(cablingMapLabel_, cablingMap); + cabling_ = cablingMap->cablingTree(); + LogDebug("map version:")<< cabling_->version(); + } + + edm::Handle hsoa; + iEvent.getByToken(digiSoAGetToken_, hsoa); + const auto& digis = *hsoa; + + if(!digis.hasError()) { + throw cms::Exception("LogicError") << "The module was configured to include errors, but the input SoA does not include the errors. 
This is likely a problem in the configuration."; + } + + auto errorcollection = std::make_unique>(); + auto tkerror_detidcollection = std::make_unique(); + auto usererror_detidcollection = std::make_unique(); + auto disabled_channelcollection = std::make_unique< edmNew::DetSetVector>(); + + PixelDataFormatter formatter(cabling_.get(), usePhase1_); // for phase 1 & 0 + const PixelDataFormatter::Errors *formatterErrors = digis.formatterErrors(); + assert(formatterErrors != nullptr); + auto errors = *formatterErrors; // make a copy + PixelDataFormatter::DetErrors nodeterrors; + + auto size = digis.errorSize(); + for (auto i = 0U; i < size; i++) { + PixelErrorCompact err = digis.error(i); + if (err.errorType != 0) { + SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); + errors[err.rawId].push_back(error); + } + } + + constexpr uint32_t dummydetid = 0xffffffff; + typedef PixelDataFormatter::Errors::iterator IE; + for (IE is = errors.begin(); is != errors.end(); is++) { + + uint32_t errordetid = is->first; + if (errordetid == dummydetid) {// errors given dummy detId must be sorted by Fed + nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() ); + } + else { + edm::DetSet& errorDetSet = errorcollection->find_or_insert(errordetid); + errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); + // Fill detid of the detectors where there is error AND the error number is listed + // in the configurable error list in the job option cfi. 
+ // Code needs to be here, because there can be a set of errors for each + // entry in the for loop over PixelDataFormatter::Errors + + std::vector disabledChannelsDetSet; + + for (auto const& aPixelError : errorDetSet) { + // For the time being, we extend the error handling functionality with ErrorType 25 + // In the future, we should sort out how the usage of tkerrorlist can be generalized + if (aPixelError.getType() == 25) { + int fedId = aPixelError.getFedId(); + const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); + if (fed) { + cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); + const sipixelobjects::PixelFEDLink* link = fed->link(linkId); + if (link) { + // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it + // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme + PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; + for (unsigned int iRoc = 1; iRoc <= link->numberOfROCs(); iRoc++) { + const sipixelobjects::PixelROC * roc = link->roc(iRoc); + if (roc->idInDetUnit() < ch.roc_first) ch.roc_first = roc->idInDetUnit(); + if (roc->idInDetUnit() > ch.roc_last) ch.roc_last = roc->idInDetUnit(); + } + if (ch.roc_firstpush_back(errordetid); + } + } + } + + // fill list of detIds with errors to be studied + if (!usererrorlist_.empty()) { + auto it_find = std::find(usererrorlist_.begin(), usererrorlist_.end(), aPixelError.getType()); + if (it_find != usererrorlist_.end()) { + usererror_detidcollection->push_back(errordetid); + } + } + + } // loop on DetSet of errors + + if (!disabledChannelsDetSet.empty()) { + disabled_channelcollection->insert(errordetid, disabledChannelsDetSet.data(), disabledChannelsDetSet.size()); + } + + } // if error assigned to a real DetId + } // loop on errors in event for this FED + + edm::DetSet& errorDetSet = errorcollection->find_or_insert(dummydetid); + errorDetSet.data = nodeterrors; + + iEvent.put(errorPutToken_, 
std::move(errorcollection)); + iEvent.put(tkErrorPutToken_, std::move(tkerror_detidcollection)); + iEvent.put(userErrorPutToken_, std::move(usererror_detidcollection)); + iEvent.put(disabledChannelPutToken_, std::move(disabled_channelcollection)); + + } // if errors to be included in the event +} + +DEFINE_FWK_MODULE(SiPixelDigisFromSoA); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc new file mode 100644 index 0000000000000..90d1649128836 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -0,0 +1,111 @@ +#include "CUDADataFormats/Common/interface/host_unique_ptr.h" +#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" + + +class SiPixelDigisSoAFromCUDA: public edm::stream::EDProducer { +public: + explicit SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig); + ~SiPixelDigisSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT> digiGetToken_; + edm::EDPutTokenT digiPutToken_; + + edm::cuda::host::unique_ptr pdigi_; + 
edm::cuda::host::unique_ptr rawIdArr_; + edm::cuda::host::unique_ptr adc_; + edm::cuda::host::unique_ptr< int32_t[]> clus_; + + edm::cuda::host::unique_ptr data_; + const GPU::SimpleVector *error_ = nullptr; + const PixelFormatterErrors *formatterErrors_ = nullptr; + + CUDAContextToken ctxTmp_; + + int nDigis_; + bool includeErrors_; +}; + +SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig): + digiGetToken_(consumes>(iConfig.getParameter("src"))), + digiPutToken_(produces()) +{} + +void SiPixelDigisSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("siPixelClustersCUDA")); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Do the transfer in a CUDA stream parallel to the computation CUDA stream + auto ctx = CUDAScopedContext(iEvent.streamID(), std::move(waitingTaskHolder)); + + edm::Handle> hdigi; + iEvent.getByToken(digiGetToken_, hdigi); + const auto& gpuDigis = ctx.get(*hdigi); + + nDigis_ = gpuDigis.nDigis(); + pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream()); + rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream()); + adc_ = gpuDigis.adcToHostAsync(ctx.stream()); + clus_ = gpuDigis.clusToHostAsync(ctx.stream()); + + includeErrors_ = gpuDigis.hasErrors(); + if(includeErrors_) { + auto tmp = gpuDigis.dataErrorToHostAsync(ctx.stream()); + data_ = std::move(tmp.first); + error_ = tmp.second; + formatterErrors_ = &(gpuDigis.formatterErrors()); + } + + ctxTmp_ = ctx.toToken(); // CUDA stream must live until produce +} + +void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // The following line copies the data from the pinned host memory to + // regular host memory. In principle that feels unnecessary (why not + // just use the pinned host memory?). 
There are a few arguments for + // doing it though + // - Now can release the pinned host memory back to the (caching) allocator + // * if we'd like to keep the pinned memory, we'd need to also + // keep the CUDA stream around as long as that, or allow pinned + // host memory to be allocated without a CUDA stream + // - What if a CPU algorithm would produce the same SoA? We can't + // use cudaMallocHost without a GPU... + if(includeErrors_) { + iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get(), + error_->size(), error_->data(), formatterErrors_); + } + else { + iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get()); + } + + pdigi_.reset(); + rawIdArr_.reset(); + adc_.reset(); + clus_.reset(); + data_.reset(); + error_ = nullptr; + formatterErrors_ = nullptr; + + ctxTmp_.reset(); // release CUDA stream etc +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelDigisSoAFromCUDA); diff --git a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py index c2479af1f60bd..63d63145d5f8e 100644 --- a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py +++ b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py @@ -6,9 +6,7 @@ from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel phase1Pixel.toModify(siPixelDigis, UsePhase1=True) -import RecoLocalTracker.SiPixelClusterizer.siPixelDigiHeterogeneousConverter_cfi -_siPixelDigis_gpu = RecoLocalTracker.SiPixelClusterizer.siPixelDigiHeterogeneousConverter_cfi.siPixelDigiHeterogeneousConverter.clone() -_siPixelDigis_gpu.includeErrors = cms.bool(True) - +from EventFilter.SiPixelRawToDigi.siPixelDigisFromSoA_cfi import siPixelDigisFromSoA as _siPixelDigisFromSoA +_siPixelDigis_gpu = _siPixelDigisFromSoA.clone(digiSrc = "siPixelClustersPreSplitting") from Configuration.ProcessModifiers.gpu_cff import gpu gpu.toReplaceWith(siPixelDigis, 
_siPixelDigis_gpu) diff --git a/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py new file mode 100644 index 0000000000000..623842c03a549 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py @@ -0,0 +1,16 @@ +import FWCore.ParameterSet.Config as cms + +from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import siPixelDigis +from EventFilter.SiPixelRawToDigi.siPixelDigisSoAFromCUDA_cfi import siPixelDigisSoAFromCUDA as _siPixelDigisSoAFromCUDA + +siPixelDigisTask = cms.Task(siPixelDigis) + +siPixelDigisSoA = _siPixelDigisSoAFromCUDA.clone( + src = "siPixelClustersCUDAPreSplitting" +) +siPixelDigisTaskCUDA = cms.Task(siPixelDigisSoA) + +from Configuration.ProcessModifiers.gpu_cff import gpu +_siPixelDigisTask_gpu = siPixelDigisTask.copy() +_siPixelDigisTask_gpu.add(siPixelDigisTaskCUDA) +gpu.toReplaceWith(siPixelDigisTask, _siPixelDigisTask_gpu) diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h index 3469e1a0dfd90..c8016983070a2 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h @@ -12,7 +12,7 @@ namespace cudautils { // Single element template inline - void copyAsync(edm::cuda::device::unique_ptr& dst, edm::cuda::host::unique_ptr& src, cuda::stream_t<>& stream) { + void copyAsync(edm::cuda::device::unique_ptr& dst, const edm::cuda::host::unique_ptr& src, cuda::stream_t<>& stream) { // Shouldn't compile for array types because of sizeof(T), but // let's add an assert with a more helpful message static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); @@ -21,7 +21,7 @@ namespace cudautils { template inline - void copyAsync(edm::cuda::host::unique_ptr& dst, edm::cuda::device::unique_ptr& src, cuda::stream_t<>& stream) { + void copyAsync(edm::cuda::host::unique_ptr& dst, const 
edm::cuda::device::unique_ptr& src, cuda::stream_t<>& stream) { static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream.id()); } @@ -29,13 +29,13 @@ namespace cudautils { // Multiple elements template inline - void copyAsync(edm::cuda::device::unique_ptr& dst, edm::cuda::host::unique_ptr& src, size_t nelements, cuda::stream_t<>& stream) { + void copyAsync(edm::cuda::device::unique_ptr& dst, const edm::cuda::host::unique_ptr& src, size_t nelements, cuda::stream_t<>& stream) { cuda::memory::async::copy(dst.get(), src.get(), nelements*sizeof(T), stream.id()); } template inline - void copyAsync(edm::cuda::host::unique_ptr& dst, edm::cuda::device::unique_ptr& src, size_t nelements, cuda::stream_t<>& stream) { + void copyAsync(edm::cuda::host::unique_ptr& dst, const edm::cuda::device::unique_ptr& src, size_t nelements, cuda::stream_t<>& stream) { cuda::memory::async::copy(dst.get(), src.get(), nelements*sizeof(T), stream.id()); } } diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h index 689a997a90936..5d4f6e10d747f 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -17,9 +17,15 @@ namespace cudautils { cuda::memory::device::async::set(ptr.get(), value, sizeof(T), stream.id()); } + /** + * The type of `value` is `int` because of `cudaMemsetAsync()` takes + * it as an `int`. Note that `cudaMemsetAsync()` sets the value of + * each **byte** to `value`. This may lead to unexpected results if + * `sizeof(T) > 1` and `value != 0`. 
+ */ template inline - void memsetAsync(edm::cuda::device::unique_ptr& ptr, T value, size_t nelements, cuda::stream_t<>& stream) { + void memsetAsync(edm::cuda::device::unique_ptr& ptr, int value, size_t nelements, cuda::stream_t<>& stream) { cuda::memory::device::async::set(ptr.get(), value, nelements*sizeof(T), stream.id()); } } diff --git a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py index b75e75e000d48..a486a83d178f4 100644 --- a/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py +++ b/RecoLocalTracker/Configuration/python/RecoLocalTracker_cff.py @@ -9,11 +9,11 @@ from RecoLocalTracker.SiStripRecHitConverter.StripCPEfromTrackAngle_cfi import * from RecoLocalTracker.SiStripZeroSuppression.SiStripZeroSuppression_cfi import * from RecoLocalTracker.SiStripClusterizer.SiStripClusterizer_cfi import * -from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizerPreSplitting_cfi import * +from RecoLocalTracker.SiPixelClusterizer.siPixelClustersPreSplitting_cff import * from RecoLocalTracker.SiPixelRecHits.SiPixelRecHits_cfi import * from RecoLocalTracker.SubCollectionProducers.clustersummaryproducer_cfi import * -pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplitting,siPixelRecHitsPreSplitting) +pixeltrackerlocalrecoTask = cms.Task(siPixelClustersPreSplittingTask,siPixelRecHitsPreSplitting) striptrackerlocalrecoTask = cms.Task(siStripZeroSuppression,siStripClusters,siStripMatchedRecHits) trackerlocalrecoTask = cms.Task(pixeltrackerlocalrecoTask,striptrackerlocalrecoTask,clusterSummaryProducer) diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc new file mode 100644 index 0000000000000..731dd3d54ad2a --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc @@ -0,0 +1,160 @@ +#include "DataFormats/Common/interface/DetSetVector.h" 
+#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/DetId/interface/DetId.h" +#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" +#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/global/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "Geometry/Records/interface/TrackerTopologyRcd.h" + +namespace { + struct AccretionCluster { + typedef unsigned short UShort; + static constexpr UShort MAXSIZE = 256; + UShort adc[MAXSIZE]; + UShort x[MAXSIZE]; + UShort y[MAXSIZE]; + UShort xmin=16000; + UShort ymin=16000; + unsigned int isize=0; + int charge=0; + + void clear() { + isize=0; + charge=0; + xmin=16000; + ymin=16000; + } + + bool add(SiPixelCluster::PixelPos const & p, UShort const iadc) { + if (isize==MAXSIZE) return false; + xmin=std::min(xmin,(unsigned short)(p.row())); + ymin=std::min(ymin,(unsigned short)(p.col())); + adc[isize]=iadc; + x[isize]=p.row(); + y[isize++]=p.col(); + charge+=iadc; + return true; + } + }; + + constexpr uint32_t dummydetid = 0xffffffff; +} + +class SiPixelClustersFromSoA: public edm::global::EDProducer<> { +public: + explicit SiPixelClustersFromSoA(const edm::ParameterSet& iConfig); + ~SiPixelClustersFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; + + edm::EDGetTokenT digiGetToken_; + + edm::EDPutTokenT> digiPutToken_; + edm::EDPutTokenT 
clusterPutToken_; + +}; + +SiPixelClustersFromSoA::SiPixelClustersFromSoA(const edm::ParameterSet& iConfig): + digiGetToken_(consumes(iConfig.getParameter("src"))), + digiPutToken_(produces>()), + clusterPutToken_(produces()) +{} + +void SiPixelClustersFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("siPixelDigisSoA")); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { + edm::Handle hdigi; + iEvent.getByToken(digiGetToken_, hdigi); + const auto& digis = *hdigi; + + edm::ESHandle trackerTopologyHandle; + iSetup.get().get(trackerTopologyHandle); + const auto& ttopo = *trackerTopologyHandle; + + auto collection = std::make_unique>(); + auto outputClusters = std::make_unique(); + + const uint32_t nDigis = digis.size(); + edm::DetSet * detDigis=nullptr; + for (uint32_t i = 0; i < nDigis; i++) { + if (digis.pdigi(i)==0) continue; + detDigis = &collection->find_or_insert(digis.rawIdArr(i)); + if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations + break; + } + + int32_t nclus=-1; + std::vector aclusters(1024); + auto totCluseFilled=0; + + auto fillClusters = [&](uint32_t detId){ + if (nclus<0) return; // this in reality should never happen + edmNew::DetSetVector::FastFiller spc(*outputClusters, detId); + auto layer = (DetId(detId).subdetId()==1) ? ttopo.pxbLayer(detId) : 0; + auto clusterThreshold = (layer==1) ? 
2000 : 4000; + for (int32_t ic=0; ic9000) continue; // not in cluster; TODO add an assert for the size + assert(digis.rawIdArr(i) > 109999); + if ( (*detDigis).detId() != digis.rawIdArr(i)) + { + fillClusters((*detDigis).detId()); + assert(nclus==-1); + detDigis = &collection->find_or_insert(digis.rawIdArr(i)); + if ( (*detDigis).empty() ) + (*detDigis).data.reserve(32); // avoid the first relocations + else { std::cout << "Problem det present twice in input! " << (*detDigis).detId() << std::endl; } + } + (*detDigis).data.emplace_back(digis.pdigi(i)); + auto const & dig = (*detDigis).data.back(); + // fill clusters + assert(digis.clus(i)>=0); + assert(digis.clus(i)<1024); + nclus = std::max(digis.clus(i),nclus); + auto row = dig.row(); + auto col = dig.column(); + SiPixelCluster::PixelPos pix(row,col); + aclusters[digis.clus(i)].add(pix, digis.adc(i)); + } + + // fill final clusters + fillClusters((*detDigis).detId()); + //std::cout << "filled " << totCluseFilled << " clusters" << std::endl; + + iEvent.put(digiPutToken_, std::move(collection)); + iEvent.put(clusterPutToken_, std::move(outputClusters)); +} + +DEFINE_FWK_MODULE(SiPixelClustersFromSoA); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc new file mode 100644 index 0000000000000..78cc8d2483709 --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -0,0 +1,235 @@ +#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h" +#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h" +#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" 
+#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" +#include "DataFormats/FEDRawData/interface/FEDNumbering.h" +#include "DataFormats/FEDRawData/interface/FEDRawData.h" +#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" +#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" +#include "EventFilter/SiPixelRawToDigi/interface/PixelUnpackingRegions.h" +#include "FWCore/Framework/interface/ConsumesCollector.h" +#include "FWCore/Framework/interface/ESHandle.h" +#include "FWCore/Framework/interface/ESTransientHandle.h" +#include "FWCore/Framework/interface/ESWatcher.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" +#include "RecoTracker/Record/interface/CkfComponentsRecord.h" + +#include "SiPixelRawToClusterGPUKernel.h" + +#include +#include +#include + +class SiPixelRawToClusterCUDA: public edm::stream::EDProducer { +public: + explicit SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfig); + ~SiPixelRawToClusterCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT rawGetToken_; + + edm::EDPutTokenT> digiPutToken_; + edm::EDPutTokenT> clusterPutToken_; + + 
CUDAContextToken ctxTmp_; + + edm::ESWatcher recordWatcher; + + std::string cablingMapLabel_; + std::unique_ptr cabling_; + std::vector fedIds_; + const SiPixelFedCablingMap *cablingMap_ = nullptr; + std::unique_ptr regions_; + + pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; + PixelDataFormatter::Errors errors_; + + const bool includeErrors_; + const bool useQuality_; + const bool usePilotBlade_; + const bool convertADCtoElectrons_; +}; + +SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfig): + rawGetToken_(consumes(iConfig.getParameter("InputLabel"))), + digiPutToken_(produces>()), + clusterPutToken_(produces>()), + cablingMapLabel_(iConfig.getParameter("CablingMapLabel")), + includeErrors_(iConfig.getParameter("IncludeErrors")), + useQuality_(iConfig.getParameter("UseQualityInfo")), + usePilotBlade_(iConfig.getParameter ("UsePilotBlade")), // Control the usage of pilot-blade data, FED=40 + convertADCtoElectrons_(iConfig.getParameter("ConvertADCtoElectrons")) +{ + // regions + if(!iConfig.getParameter("Regions").getParameterNames().empty()) { + regions_ = std::make_unique(iConfig, consumesCollector()); + } + + if(usePilotBlade_) edm::LogInfo("SiPixelRawToCluster") << " Use pilot blade data (FED 40)"; +} + +void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("IncludeErrors",true); + desc.add("UseQualityInfo",false); + desc.add("UsePilotBlade",false)->setComment("## Use pilot blades"); + desc.add("ConvertADCtoElectrons", false)->setComment("## do the calibration ADC-> Electron and apply the threshold, requried for clustering"); + desc.add("InputLabel",edm::InputTag("rawDataCollector")); + { + edm::ParameterSetDescription psd0; + psd0.addOptional>("inputs"); + psd0.addOptional>("deltaPhi"); + psd0.addOptional>("maxZ"); + psd0.addOptional("beamSpot"); + desc.add("Regions",psd0)->setComment("## Empty Regions PSet means complete unpacking"); 
+ } + desc.add("CablingMapLabel","")->setComment("CablingMap label"); //Tav + descriptions.addWithDefaultLabel(desc); +} + + +void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + auto ctx = CUDAScopedContext(iEvent.streamID(), std::move(waitingTaskHolder)); + + edm::ESHandle hgpuMap; + iSetup.get().get(hgpuMap); + if(hgpuMap->hasQuality() != useQuality_) { + throw cms::Exception("LogicError") << "UseQuality of the module (" << useQuality_ << ") differs the one from SiPixelFedCablingMapGPUWrapper. Please fix your configuration."; + } + // get the GPU product already here so that the async transfer can begin + const auto *gpuMap = hgpuMap->getGPUProductAsync(ctx.stream()); + + edm::ESHandle hgains; + iSetup.get().get(hgains); + // get the GPU product already here so that the async transfer can begin + const auto *gpuGains = hgains->getGPUProductAsync(ctx.stream()); + + edm::cuda::device::unique_ptr modulesToUnpackRegional; + const unsigned char *gpuModulesToUnpack; + + if(regions_) { + regions_->run(iEvent, iSetup); + LogDebug("SiPixelRawToCluster") << "region2unpack #feds: "<nFEDs(); + LogDebug("SiPixelRawToCluster") << "region2unpack #modules (BPIX,EPIX,total): "<nBarrelModules()<<" "<nForwardModules()<<" "<nModules(); + modulesToUnpackRegional = hgpuMap->getModToUnpRegionalAsync(*(regions_->modulesToUnpack()), ctx.stream()); + gpuModulesToUnpack = modulesToUnpackRegional.get(); + } + else { + gpuModulesToUnpack = hgpuMap->getModToUnpAllAsync(ctx.stream()); + } + + // initialize cabling map or update if necessary + if (recordWatcher.check(iSetup)) { + // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) + edm::ESTransientHandle cablingMap; + iSetup.get().get(cablingMapLabel_, cablingMap); //Tav + cablingMap_ = cablingMap.product(); + fedIds_ = cablingMap->fedIds(); + cabling_ = cablingMap->cablingTree(); + 
LogDebug("map version:")<< cabling_->version(); + } + + edm::Handle hbuffers; + iEvent.getByToken(rawGetToken_, hbuffers); + const auto& buffers = *hbuffers; + + errors_.clear(); + + // GPU specific: Data extraction for RawToDigi GPU + unsigned int wordCounterGPU = 0; + unsigned int fedCounter = 0; + bool errorsInEvent = false; + + // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() + ErrorChecker errorcheck; + auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(ctx.stream()); + for(int fedId: fedIds_) { + if (!usePilotBlade_ && (fedId==40) ) continue; // skip pilot blade data + if (regions_ && !regions_->mayUnpackFED(fedId)) continue; + + // for GPU + // first 150 index stores the fedId and next 150 will store the + // start index of word in that fed + assert(fedId>=1200); + fedCounter++; + + // get event data for this fed + const FEDRawData& rawData = buffers.FEDData( fedId ); + + // GPU specific + int nWords = rawData.size()/sizeof(cms_uint64_t); + if (nWords == 0) { + continue; + } + + // check CRC bit + const cms_uint64_t* trailer = reinterpret_cast(rawData.data())+(nWords-1); + if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors_)) { + continue; + } + + // check headers + const cms_uint64_t* header = reinterpret_cast(rawData.data()); header--; + bool moreHeaders = true; + while (moreHeaders) { + header++; + bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors_); + moreHeaders = headerStatus; + } + + // check trailers + bool moreTrailers = true; + trailer++; + while (moreTrailers) { + trailer--; + bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors_); + moreTrailers = trailerStatus; + } + + const cms_uint32_t * bw = (const cms_uint32_t *)(header+1); + const cms_uint32_t * ew = (const cms_uint32_t *)(trailer); + + assert(0 == (ew-bw)%2); + wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); + 
wordCounterGPU+=(ew-bw); + + } // end of for loop + + gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, gpuGains, + wordFedAppender, + wordCounterGPU, fedCounter, convertADCtoElectrons_, + useQuality_, includeErrors_, + edm::MessageDrop::instance()->debugEnabled, + ctx.stream()); + + ctxTmp_ = ctx.toToken(); +} + +void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + auto ctx = CUDAScopedContext(std::move(ctxTmp_)); + + auto tmp = gpuAlgo_.getResults(); + ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); + ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelRawToClusterCUDA); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index 1388ed4852b25..d01ce5c6f2f26 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -44,13 +44,8 @@ namespace pixelgpudetails { - // data structures size - constexpr uint32_t vsize = sizeof(GPU::SimpleVector); - constexpr uint32_t esize = sizeof(pixelgpudetails::error_obj); - // number of words for all the FEDs constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - constexpr uint32_t MAX_ERROR_SIZE = MAX_FED_WORDS * esize; SiPixelRawToClusterGPUKernel::WordFedAppender::WordFedAppender(cuda::stream_t<>& cudaStream) { edm::Service cs; @@ -397,7 +392,7 @@ namespace pixelgpudetails { const uint32_t wordCounter, const uint32_t *word, const uint8_t *fedIds, uint16_t *xx, uint16_t *yy, uint16_t *adc, uint32_t *pdigi, uint32_t *rawIdArr, uint16_t *moduleId, - GPU::SimpleVector *err, + GPU::SimpleVector *err, bool useQualityInfo, bool includeErrors, bool debug) { //if (threadIdx.x==0) printf("Event: %u blockIdx.x: %u start: %u end: %u\n", eventno, blockIdx.x, 
begin, end); @@ -432,7 +427,7 @@ namespace pixelgpudetails { if (includeErrors and skipROC) { uint32_t rID = getErrRawID(fedId, ww, errorType, cablingMap, debug); - err->push_back(pixelgpudetails::error_obj{rID, ww, errorType, fedId}); + err->push_back(PixelErrorCompact{rID, ww, errorType, fedId}); continue; } @@ -476,7 +471,7 @@ namespace pixelgpudetails { if (includeErrors) { if (not rocRowColIsValid(row, col)) { uint8_t error = conversionError(fedId, 3, debug); //use the device function and fill the arrays - err->push_back(pixelgpudetails::error_obj{rawId, ww, error, fedId}); + err->push_back(PixelErrorCompact{rawId, ww, error, fedId}); if(debug) printf("BPIX1 Error status: %i\n", error); continue; } @@ -491,7 +486,7 @@ namespace pixelgpudetails { localPix.col = col; if (includeErrors and not dcolIsValid(dcol, pxid)) { uint8_t error = conversionError(fedId, 3, debug); - err->push_back(pixelgpudetails::error_obj{rawId, ww, error, fedId}); + err->push_back(PixelErrorCompact{rawId, ww, error, fedId}); if(debug) printf("Error status: %i %d %d %d %d\n", error, dcol, pxid, fedId, roc); continue; } @@ -516,17 +511,16 @@ namespace pixelgpudetails { const WordFedAppender& wordFed, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, - bool useQualityInfo, bool includeErrors, bool transferToCPU, bool debug, + bool useQualityInfo, bool includeErrors, bool debug, cuda::stream_t<>& stream) { nDigis = wordCounter; - constexpr uint32_t MAX_FED_WORDS = pixelgpudetails::MAX_FED * pixelgpudetails::MAX_WORD; - digis_d = SiPixelDigisCUDA(MAX_FED_WORDS, stream); - clusters_d = SiPixelClustersCUDA(MAX_FED_WORDS, gpuClustering::MaxNumModules, stream); + digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, includeErrors, stream); + clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream); edm::Service cs; - digis_clusters_h.nModules_Clusters = cs->make_host_unique(2, stream); + nModules_Clusters_h = cs->make_host_unique(2, stream); { 
const int threadsPerBlock = 512; @@ -537,20 +531,8 @@ namespace pixelgpudetails { auto word_d = cs->make_device_unique(wordCounter, stream); auto fedId_d = cs->make_device_unique(wordCounter, stream); - auto error_d = cs->make_device_unique>(stream); - auto data_d = cs->make_device_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemsetAsync(data_d.get(), 0x00, MAX_ERROR_SIZE, stream.id())); - auto error_h_tmp = cs->make_host_unique>(stream); - GPU::make_SimpleVector(error_h_tmp.get(), MAX_FED_WORDS, data_d.get()); - assert(error_h_tmp->size() == 0); - assert(error_h_tmp->capacity() == static_cast(MAX_FED_WORDS)); - cudaCheck(cudaMemcpyAsync(word_d.get(), wordFed.word(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); cudaCheck(cudaMemcpyAsync(fedId_d.get(), wordFed.fedId(), wordCounter*sizeof(uint8_t) / 2, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(error_d.get(), error_h_tmp.get(), vsize, cudaMemcpyDefault, stream.id())); - - auto pdigi_d = cs->make_device_unique(wordCounter, stream); - auto rawIdArr_d = cs->make_device_unique(wordCounter, stream); // Launch rawToDigi kernel RawToDigi_kernel<<>>( @@ -560,43 +542,17 @@ namespace pixelgpudetails { word_d.get(), fedId_d.get(), digis_d.xx(), digis_d.yy(), digis_d.adc(), - pdigi_d.get(), - rawIdArr_d.get(), + digis_d.pdigi(), + digis_d.rawIdArr(), digis_d.moduleInd(), - error_d.get(), + digis_d.error(), useQualityInfo, includeErrors, debug); cudaCheck(cudaGetLastError()); - // copy data to host variable - if(transferToCPU) { - digis_clusters_h.pdigi = cs->make_host_unique(MAX_FED_WORDS, stream); - digis_clusters_h.rawIdArr = cs->make_host_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.pdigi.get(), pdigi_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.rawIdArr.get(), rawIdArr_d.get(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - - if (includeErrors) { - 
digis_clusters_h.data = cs->make_host_unique(MAX_FED_WORDS, stream); - digis_clusters_h.error = cs->make_host_unique>(stream); - GPU::make_SimpleVector(digis_clusters_h.error.get(), MAX_FED_WORDS, digis_clusters_h.data.get()); - assert(digis_clusters_h.error->size() == 0); - assert(digis_clusters_h.error->capacity() == static_cast(MAX_FED_WORDS)); - - cudaCheck(cudaMemcpyAsync(digis_clusters_h.error.get(), error_d.get(), vsize, cudaMemcpyDefault, stream.id())); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), MAX_ERROR_SIZE, cudaMemcpyDefault, stream.id())); - // If we want to transfer only the minimal amount of data, we - // need a synchronization point. A single ExternalWork (of - // SiPixelRawToClusterHeterogeneous) does not help because it is - // already used to synchronize the data movement. So we'd need - // two ExternalWorks (or explicit use of TBB tasks). The - // prototype of #100 would allow this easily (as there would be - // two ExternalWorks). - // - //cudaCheck(cudaStreamSynchronize(stream.id())); - //int size = digis_clusters_h.error->size(); - //cudaCheck(cudaMemcpyAsync(digis_clusters_h.data.get(), data_d.get(), size*esize, cudaMemcpyDefault, stream.id())); - } + if(includeErrors) { + digis_d.copyErrorToHostAsync(stream); } } // End of Raw2Digi and passing data for cluserisation @@ -614,12 +570,6 @@ namespace pixelgpudetails { wordCounter); cudaCheck(cudaGetLastError()); - // calibrated adc - if(transferToCPU) { - digis_clusters_h.adc = cs->make_host_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.adc.get(), digis_d.adc(), wordCounter*sizeof(uint16_t), cudaMemcpyDefault, stream.id())); - } - #ifdef GPU_DEBUG std::cout << "CUDA countModules kernel launch with " << blocks @@ -628,11 +578,11 @@ namespace pixelgpudetails { cudaCheck(cudaMemsetAsync(clusters_d.moduleStart(), 0x00, sizeof(uint32_t), stream.id())); - countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), clusters_d.clus(), 
wordCounter); + countModules<<>>(digis_d.c_moduleInd(), clusters_d.moduleStart(), digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); // read the number of modules into a data member, used by getProduct()) - cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); + cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[0]), clusters_d.moduleStart(), sizeof(uint32_t), cudaMemcpyDefault, stream.id())); threadsPerBlock = 256; blocks = MaxNumModules; @@ -646,7 +596,7 @@ namespace pixelgpudetails { digis_d.c_xx(), digis_d.c_yy(), clusters_d.c_moduleStart(), clusters_d.clusInModule(), clusters_d.moduleId(), - clusters_d.clus(), + digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); @@ -656,12 +606,11 @@ namespace pixelgpudetails { digis_d.c_adc(), clusters_d.c_moduleStart(), clusters_d.clusInModule(), clusters_d.c_moduleId(), - clusters_d.clus(), + digis_d.clus(), wordCounter); cudaCheck(cudaGetLastError()); - // count the module start indices already here (instead of // rechits) so that the number of clusters/hits can be made // available in the rechit producer without additional points of @@ -681,15 +630,7 @@ namespace pixelgpudetails { clusters_d.c_clusInModule(), &clusters_d.clusModuleStart()[1], gpuClustering::MaxNumModules, stream.id())); // last element holds the number of all clusters - cudaCheck(cudaMemcpyAsync(&(digis_clusters_h.nModules_Clusters[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - - - // clusters - if(transferToCPU) { - digis_clusters_h.clus = cs->make_host_unique(MAX_FED_WORDS, stream); - cudaCheck(cudaMemcpyAsync(digis_clusters_h.clus.get(), clusters_d.clus(), wordCounter*sizeof(uint32_t), cudaMemcpyDefault, stream.id())); - } + cudaCheck(cudaMemcpyAsync(&(nModules_Clusters_h[1]), clusters_d.clusModuleStart()+gpuClustering::MaxNumModules, sizeof(uint32_t), cudaMemcpyDefault, 
stream.id())); } // end clusterizer scope } - } diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 44bed9abc1e68..98f425d88908c 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -6,9 +6,11 @@ #include "cuda/api_wrappers.h" #include "CUDADataFormats/Common/interface/host_unique_ptr.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" -#include "siPixelRawToClusterHeterogeneousProduct.h" +#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" struct SiPixelFedCablingMapGPU; class SiPixelGainForHLTonGPU; @@ -152,34 +154,8 @@ namespace pixelgpudetails { } - using error_obj = siPixelRawToClusterHeterogeneousProduct::error_obj; - - class SiPixelRawToClusterGPUKernel { public: - - using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; - - struct CPUData { - CPUData() = default; - ~CPUData() = default; - - CPUData(const CPUData&) = delete; - CPUData& operator=(const CPUData&) = delete; - CPUData(CPUData&&) = default; - CPUData& operator=(CPUData&&) = default; - - edm::cuda::host::unique_ptr nModules_Clusters; // These should really be part of the GPU product - - edm::cuda::host::unique_ptr data; - edm::cuda::host::unique_ptr> error; - - edm::cuda::host::unique_ptr pdigi; - edm::cuda::host::unique_ptr rawIdArr; - edm::cuda::host::unique_ptr adc; - edm::cuda::host::unique_ptr clus; - }; - class WordFedAppender { public: WordFedAppender(cuda::stream_t<>& cudaStream); @@ -208,61 +184,28 @@ namespace pixelgpudetails { const SiPixelGainForHLTonGPU *gains, const WordFedAppender& wordFed, const uint32_t 
wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, - bool useQualityInfo, bool includeErrors, bool transferToCPU_, bool debug, + bool useQualityInfo, bool includeErrors, bool debug, cuda::stream_t<>& stream); - siPixelRawToClusterHeterogeneousProduct::GPUProduct getProduct() { - return siPixelRawToClusterHeterogeneousProduct::GPUProduct( - std::move(digis_d), std::move(clusters_d), - nDigis, - digis_clusters_h.nModules_Clusters[0], - digis_clusters_h.nModules_Clusters[1] - ); - } - - CPUData&& getCPUData() { - // Set the vector data pointer to point to CPU - digis_clusters_h.error->set_data(digis_clusters_h.data.get()); - return std::move(digis_clusters_h); + std::pair getResults() { + digis_d.setNModulesDigis(nModules_Clusters_h[0], nDigis); + clusters_d.setNClusters(nModules_Clusters_h[1]); + // need to explicitly deallocate while the associated CUDA + // stream is still alive + nModules_Clusters_h.reset(); + return std::make_pair(std::move(digis_d), std::move(clusters_d)); } private: uint32_t nDigis = 0; - // CPU data - CPUData digis_clusters_h; - // Data to be put in the event + edm::cuda::host::unique_ptr nModules_Clusters_h; + edm::cuda::host::unique_ptr> error_h; SiPixelDigisCUDA digis_d; SiPixelClustersCUDA clusters_d; }; - // configuration and memory buffers alocated on the GPU - struct context { - uint32_t * word_d; - uint8_t * fedId_d; - uint32_t * pdigi_d; - uint16_t * xx_d; - uint16_t * yy_d; - uint16_t * adc_d; - uint16_t * moduleInd_d; - uint32_t * rawIdArr_d; - - GPU::SimpleVector * error_d; - error_obj * data_d; - - // these are for the clusterizer (to be moved) - uint32_t * moduleStart_d; - int32_t * clus_d; - uint32_t * clusInModule_d; - uint32_t * moduleId_d; - uint32_t * debug_d; - }; - - // void initCablingMap(); - context initDeviceMemory(); - void freeMemory(context &); - // see RecoLocalTracker/SiPixelClusterizer // all are runtime const, should be specified in python _cfg.py struct ADCThreshold { diff --git 
a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc deleted file mode 100644 index 905bc297b394d..0000000000000 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterHeterogeneous.cc +++ /dev/null @@ -1,739 +0,0 @@ -// C++ includes -#include -#include -#include -#include - -// CUDA kincludes -#include -#include - -// CMSSW includes -#include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h" -#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h" -#include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTService.h" -#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" -#include "CondFormats/DataRecord/interface/SiPixelQualityRcd.h" -#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" -#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" -#include "CondFormats/SiPixelObjects/interface/SiPixelQuality.h" -#include "DataFormats/Common/interface/DetSetVector.h" -#include "DataFormats/Common/interface/Handle.h" -#include "DataFormats/DetId/interface/DetIdCollection.h" -#include "DataFormats/FEDRawData/interface/FEDNumbering.h" -#include "DataFormats/FEDRawData/interface/FEDRawData.h" -#include "DataFormats/FEDRawData/interface/FEDRawDataCollection.h" -#include "DataFormats/SiPixelCluster/interface/SiPixelCluster.h" -#include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h" -#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" -#include "DataFormats/SiPixelRawData/interface/SiPixelRawDataError.h" -#include "DataFormats/TrackerCommon/interface/TrackerTopology.h" -#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" -#include "EventFilter/SiPixelRawToDigi/interface/PixelUnpackingRegions.h" -#include "FWCore/Framework/interface/ConsumesCollector.h" -#include "FWCore/Framework/interface/ESHandle.h" 
-#include "FWCore/Framework/interface/ESTransientHandle.h" -#include "FWCore/Framework/interface/ESWatcher.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/MessageLogger/interface/MessageLogger.h" -#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" -#include "FWCore/PluginManager/interface/ModuleDef.h" -#include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" -#include "Geometry/TrackerGeometryBuilder/interface/PixelGeomDetUnit.h" -#include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" -#include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" -#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -#include "RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h" -#include "RecoTracker/Record/interface/CkfComponentsRecord.h" - -#include "SiPixelRawToClusterGPUKernel.h" -#include "siPixelRawToClusterHeterogeneousProduct.h" -#include "PixelThresholdClusterizer.h" - -namespace { - struct AccretionCluster { - typedef unsigned short UShort; - static constexpr UShort MAXSIZE = 256; - UShort adc[MAXSIZE]; - UShort x[MAXSIZE]; - UShort y[MAXSIZE]; - UShort xmin=16000; - UShort ymin=16000; - unsigned int isize=0; - int charge=0; - - void clear() { - isize=0; - charge=0; - xmin=16000; - ymin=16000; - } - - bool add(SiPixelCluster::PixelPos const & p, UShort const iadc) { - if (isize==MAXSIZE) return false; - xmin=std::min(xmin,(unsigned short)(p.row())); - ymin=std::min(ymin,(unsigned short)(p.col())); - adc[isize]=iadc; - x[isize]=p.row(); - y[isize++]=p.col(); - charge+=iadc; - return true; - } - }; - - 
constexpr uint32_t dummydetid = 0xffffffff; -} - -class SiPixelRawToClusterHeterogeneous: public HeterogeneousEDProducer > { -public: - using CPUProduct = siPixelRawToClusterHeterogeneousProduct::CPUProduct; - using GPUProduct = siPixelRawToClusterHeterogeneousProduct::GPUProduct; - using Output = siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster; - - explicit SiPixelRawToClusterHeterogeneous(const edm::ParameterSet& iConfig); - ~SiPixelRawToClusterHeterogeneous() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - -private: - // CPU implementation - void produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) override; - - // GPU implementation - void acquireGPUCuda(const edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void produceGPUCuda(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup, cuda::stream_t<>& cudaStream) override; - void convertGPUtoCPU(edm::Event& ev, unsigned int nDigis, pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData) const; - - // Commonalities - const FEDRawDataCollection *initialize(const edm::Event& ev, const edm::EventSetup& es); - - std::unique_ptr cabling_; - const SiPixelQuality *badPixelInfo_ = nullptr; - const SiPixelFedCablingMap *cablingMap_ = nullptr; -std::unique_ptr regions_; - edm::EDGetTokenT tFEDRawDataCollection; - - bool includeErrors; - bool useQuality; - bool debug; - std::vector tkerrorlist; - std::vector usererrorlist; - std::vector fedIds; - - edm::ESWatcher recordWatcher; - edm::ESWatcher qualityWatcher; - - bool usePilotBlade; - bool usePhase1; - bool convertADCtoElectrons; - std::string cablingMapLabel; - - // clusterizer - PixelThresholdClusterizer clusterizer_; - const TrackerGeometry *geom_ = nullptr; - const TrackerTopology *ttopo_ = nullptr; - - // gain calib - SiPixelGainCalibrationForHLTService theSiPixelGainCalibration_; - - // GPU algo - 
pixelgpudetails::SiPixelRawToClusterGPUKernel gpuAlgo_; - PixelDataFormatter::Errors errors_; - - bool enableTransfer_; - bool enableConversion_; -}; - -SiPixelRawToClusterHeterogeneous::SiPixelRawToClusterHeterogeneous(const edm::ParameterSet& iConfig): - HeterogeneousEDProducer(iConfig), - clusterizer_(iConfig), - theSiPixelGainCalibration_(iConfig) { - includeErrors = iConfig.getParameter("IncludeErrors"); - useQuality = iConfig.getParameter("UseQualityInfo"); - tkerrorlist = iConfig.getParameter > ("ErrorList"); - usererrorlist = iConfig.getParameter > ("UserErrorList"); - tFEDRawDataCollection = consumes (iConfig.getParameter("InputLabel")); - - enableConversion_ = iConfig.getParameter("gpuEnableConversion"); - enableTransfer_ = enableConversion_ || iConfig.getParameter("gpuEnableTransfer"); - - clusterizer_.setSiPixelGainCalibrationService(&theSiPixelGainCalibration_); - - // Products in GPU - produces(); - // Products in CPU - if(enableConversion_) { - produces>(); - if(includeErrors) { - produces>(); - produces(); - produces("UserErrorModules"); - produces(); - produces>(); - } - } - - // regions - if(!iConfig.getParameter("Regions").getParameterNames().empty()) { - regions_ = std::make_unique(iConfig, consumesCollector()); - } - - // Control the usage of pilot-blade data, FED=40 - usePilotBlade = iConfig.getParameter ("UsePilotBlade"); - if(usePilotBlade) edm::LogInfo("SiPixelRawToCluster") << " Use pilot blade data (FED 40)"; - - // Control the usage of phase1 - usePhase1 = iConfig.getParameter ("UsePhase1"); - if(usePhase1) edm::LogInfo("SiPixelRawToCluster") << " Using phase1"; - - //CablingMap could have a label //Tav - cablingMapLabel = iConfig.getParameter ("CablingMapLabel"); - - convertADCtoElectrons = iConfig.getParameter("ConvertADCtoElectrons"); -} - -void SiPixelRawToClusterHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - desc.add("IncludeErrors",true); - 
desc.add("UseQualityInfo",false); - { - std::vector temp1; - temp1.reserve(1); - temp1.push_back(29); - desc.add >("ErrorList",temp1)->setComment("## ErrorList: list of error codes used by tracking to invalidate modules"); - } - { - std::vector temp1; - temp1.reserve(1); - temp1.push_back(40); - desc.add >("UserErrorList",temp1)->setComment("## UserErrorList: list of error codes used by Pixel experts for investigation"); - } - desc.add("InputLabel",edm::InputTag("rawDataCollector")); - { - edm::ParameterSetDescription psd0; - psd0.addOptional>("inputs"); - psd0.addOptional>("deltaPhi"); - psd0.addOptional>("maxZ"); - psd0.addOptional("beamSpot"); - desc.add("Regions",psd0)->setComment("## Empty Regions PSet means complete unpacking"); - } - desc.add("UsePilotBlade",false)->setComment("## Use pilot blades"); - desc.add("UsePhase1",false)->setComment("## Use phase1"); - desc.add("CablingMapLabel","")->setComment("CablingMap label"); //Tav - desc.addOptional("CheckPixelOrder"); // never used, kept for back-compatibility - - desc.add("ConvertADCtoElectrons", false)->setComment("## do the calibration ADC-> Electron and apply the threshold, requried for clustering"); - - // clusterizer - desc.add("ChannelThreshold", 1000); - desc.add("SeedThreshold", 1000); - desc.add("ClusterThreshold", 4000); - desc.add("ClusterThreshold_L1", 4000); - desc.add("VCaltoElectronGain", 65); - desc.add("VCaltoElectronGain_L1", 65); - desc.add("VCaltoElectronOffset", -414); - desc.add("VCaltoElectronOffset_L1", -414); - desc.add("MissCalibrate", true); - desc.add("SplitClusters", false); - desc.add("ElectronPerADCGain", 135.); - // Phase 2 clusterizer - desc.add("Phase2Calibration", false); - desc.add("Phase2ReadoutMode", -1); - desc.add("Phase2DigiBaseline", 1200.); - desc.add("Phase2KinkADC", 8); - - desc.add("gpuEnableTransfer", true); - desc.add("gpuEnableConversion", true); - - HeterogeneousEDProducer::fillPSetDescription(desc); - - 
descriptions.add("siPixelClustersHeterogeneousDefault",desc); -} - -const FEDRawDataCollection *SiPixelRawToClusterHeterogeneous::initialize(const edm::Event& ev, const edm::EventSetup& es) { - debug = edm::MessageDrop::instance()->debugEnabled; - - // setup gain calibration service - theSiPixelGainCalibration_.setESObjects( es ); - - // initialize cabling map or update if necessary - if (recordWatcher.check( es )) { - // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) - edm::ESTransientHandle cablingMap; - es.get().get( cablingMapLabel, cablingMap ); //Tav - cablingMap_ = cablingMap.product(); - fedIds = cablingMap->fedIds(); - cabling_ = cablingMap->cablingTree(); - LogDebug("map version:")<< cabling_->version(); - } - // initialize quality record or update if necessary - if (qualityWatcher.check( es )&&useQuality) { - // quality info for dead pixel modules or ROCs - edm::ESHandle qualityInfo; - es.get().get( qualityInfo ); - badPixelInfo_ = qualityInfo.product(); - if (!badPixelInfo_) { - edm::LogError("SiPixelQualityNotPresent")<<" Configured to use SiPixelQuality, but SiPixelQuality not present"; - } - } - - // tracker geometry: to make sure numbering of DetId is consistent... 
- edm::ESHandle geom; - es.get().get(geom); - geom_ = geom.product(); - - edm::ESHandle trackerTopologyHandle; - es.get().get(trackerTopologyHandle); - ttopo_ = trackerTopologyHandle.product(); - - if (regions_) { - regions_->run(ev, es); - LogDebug("SiPixelRawToCluster") << "region2unpack #feds: "<nFEDs(); - LogDebug("SiPixelRawToCluster") << "region2unpack #modules (BPIX,EPIX,total): "<nBarrelModules()<<" "<nForwardModules()<<" "<nModules(); - } - - edm::Handle buffers; - ev.getByToken(tFEDRawDataCollection, buffers); - return buffers.product(); -} - - -// ----------------------------------------------------------------------------- -void SiPixelRawToClusterHeterogeneous::produceCPU(edm::HeterogeneousEvent& ev, const edm::EventSetup& es) -{ - const auto buffers = initialize(ev.event(), es); - - // create product (digis & errors) - auto collection = std::make_unique>(); - auto errorcollection = std::make_unique>(); - auto tkerror_detidcollection = std::make_unique(); - auto usererror_detidcollection = std::make_unique(); - auto disabled_channelcollection = std::make_unique< edmNew::DetSetVector>(); - auto outputClusters = std::make_unique(); - // output->collection.reserve(8*1024); - - - PixelDataFormatter formatter(cabling_.get(), usePhase1); // for phase 1 & 0 - formatter.setErrorStatus(includeErrors); - if (useQuality) formatter.setQualityStatus(useQuality, badPixelInfo_); - - bool errorsInEvent = false; - PixelDataFormatter::DetErrors nodeterrors; - - if (regions_) { - formatter.setModulesToUnpack(regions_->modulesToUnpack()); - } - - for (auto aFed = fedIds.begin(); aFed != fedIds.end(); ++aFed) { - int fedId = *aFed; - - if(!usePilotBlade && (fedId==40) ) continue; // skip pilot blade data - - if (regions_ && !regions_->mayUnpackFED(fedId)) continue; - - if(debug) LogDebug("SiPixelRawToCluster")<< " PRODUCE DIGI FOR FED: " << fedId; - - PixelDataFormatter::Errors errors; - - //get event data for this fed - const FEDRawData& fedRawData = buffers->FEDData( 
fedId ); - - //convert data to digi and strip off errors - formatter.interpretRawData( errorsInEvent, fedId, fedRawData, *collection, errors); - - //pack errors into collection - if(includeErrors) { - typedef PixelDataFormatter::Errors::iterator IE; - for (IE is = errors.begin(); is != errors.end(); is++) { - uint32_t errordetid = is->first; - if (errordetid==dummydetid) { // errors given dummy detId must be sorted by Fed - nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() ); - } else { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(errordetid); - errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); - // Fill detid of the detectors where there is error AND the error number is listed - // in the configurable error list in the job option cfi. - // Code needs to be here, because there can be a set of errors for each - // entry in the for loop over PixelDataFormatter::Errors - - std::vector disabledChannelsDetSet; - - for (auto const& aPixelError : errorDetSet) { - // For the time being, we extend the error handling functionality with ErrorType 25 - // In the future, we should sort out how the usage of tkerrorlist can be generalized - if (aPixelError.getType()==25) { - assert(aPixelError.getFedId()==fedId); - const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); - if (fed) { - cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); - const sipixelobjects::PixelFEDLink* link = fed->link(linkId); - if (link) { - // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it - // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme - PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; - for (unsigned int iRoc=1; iRoc<=link->numberOfROCs(); iRoc++) { - const sipixelobjects::PixelROC * roc = link->roc(iRoc); - if (roc->idInDetUnit()idInDetUnit(); - if 
(roc->idInDetUnit()>ch.roc_last) ch.roc_last=roc->idInDetUnit(); - } - disabledChannelsDetSet.push_back(ch); - } - } - } else { - // fill list of detIds to be turned off by tracking - if(!tkerrorlist.empty()) { - auto it_find = std::find(tkerrorlist.begin(), tkerrorlist.end(), aPixelError.getType()); - if(it_find != tkerrorlist.end()){ - tkerror_detidcollection->push_back(errordetid); - } - } - } - - // fill list of detIds with errors to be studied - if(!usererrorlist.empty()) { - auto it_find = std::find(usererrorlist.begin(), usererrorlist.end(), aPixelError.getType()); - if(it_find != usererrorlist.end()){ - usererror_detidcollection->push_back(errordetid); - } - } - - } // loop on DetSet of errors - - if (!disabledChannelsDetSet.empty()) { - disabled_channelcollection->insert(errordetid, disabledChannelsDetSet.data(), disabledChannelsDetSet.size()); - } - - } // if error assigned to a real DetId - } // loop on errors in event for this FED - } // if errors to be included in the event - } // loop on FED data to be unpacked - - if(includeErrors) { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(dummydetid); - errorDetSet.data = nodeterrors; - } - if (errorsInEvent) LogDebug("SiPixelRawToCluster") << "Error words were stored in this event"; - - // clusterize, originally from SiPixelClusterProducer - for(const auto detset: *collection) { - const auto detId = DetId(detset.detId()); - - std::vector badChannels; // why do we need this? - - // Comment: At the moment the clusterizer depends on geometry - // to access information as the pixel topology (number of columns - // and rows in a detector module). - // In the future the geometry service will be replaced with - // a ES service. 
- const GeomDetUnit * geoUnit = geom_->idToDetUnit( detId ); - const PixelGeomDetUnit * pixDet = dynamic_cast(geoUnit); - edmNew::DetSetVector::FastFiller spc(*outputClusters, detset.detId()); - clusterizer_.clusterizeDetUnit(detset, pixDet, ttopo_, badChannels, spc); - if ( spc.empty() ) { - spc.abort(); - } - } - outputClusters->shrink_to_fit(); - - //send digis and errors back to framework - ev.put(std::move(collection)); - if(includeErrors){ - ev.put(std::move(errorcollection)); - ev.put(std::move(tkerror_detidcollection)); - ev.put(std::move(usererror_detidcollection), "UserErrorModules"); - ev.put(std::move(disabled_channelcollection)); - } - ev.put(std::move(outputClusters)); -} - -// ----------------------------------------------------------------------------- -void SiPixelRawToClusterHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { - const auto buffers = initialize(ev.event(), es); - - edm::ESHandle hgpuMap; - es.get().get(hgpuMap); - if(hgpuMap->hasQuality() != useQuality) { - throw cms::Exception("LogicError") << "UseQuality of the module (" << useQuality<< ") differs the one from SiPixelFedCablingMapGPUWrapper. 
Please fix your configuration."; - } - // get the GPU product already here so that the async transfer can begin - const auto *gpuMap = hgpuMap->getGPUProductAsync(cudaStream); - - edm::cuda::device::unique_ptr modulesToUnpackRegional; - const unsigned char *gpuModulesToUnpack; - if (regions_) { - modulesToUnpackRegional = hgpuMap->getModToUnpRegionalAsync(*(regions_->modulesToUnpack()), cudaStream); - gpuModulesToUnpack = modulesToUnpackRegional.get(); - } - else { - gpuModulesToUnpack = hgpuMap->getModToUnpAllAsync(cudaStream); - } - - - edm::ESHandle hgains; - es.get().get(hgains); - - errors_.clear(); - - // GPU specific: Data extraction for RawToDigi GPU - unsigned int wordCounterGPU = 0; - unsigned int fedCounter = 0; - bool errorsInEvent = false; - - // In CPU algorithm this loop is part of PixelDataFormatter::interpretRawData() - ErrorChecker errorcheck; - auto wordFedAppender = pixelgpudetails::SiPixelRawToClusterGPUKernel::WordFedAppender(cudaStream); - for (auto aFed = fedIds.begin(); aFed != fedIds.end(); ++aFed) { - int fedId = *aFed; - - if (!usePilotBlade && (fedId==40) ) continue; // skip pilot blade data - if (regions_ && !regions_->mayUnpackFED(fedId)) continue; - - // for GPU - // first 150 index stores the fedId and next 150 will store the - // start index of word in that fed - assert(fedId>=1200); - fedCounter++; - - // get event data for this fed - const FEDRawData& rawData = buffers->FEDData( fedId ); - - // GPU specific - int nWords = rawData.size()/sizeof(cms_uint64_t); - if (nWords == 0) { - continue; - } - - // check CRC bit - const cms_uint64_t* trailer = reinterpret_cast(rawData.data())+(nWords-1); - if (not errorcheck.checkCRC(errorsInEvent, fedId, trailer, errors_)) { - continue; - } - - // check headers - const cms_uint64_t* header = reinterpret_cast(rawData.data()); header--; - bool moreHeaders = true; - while (moreHeaders) { - header++; - bool headerStatus = errorcheck.checkHeader(errorsInEvent, fedId, header, errors_); - 
moreHeaders = headerStatus; - } - - // check trailers - bool moreTrailers = true; - trailer++; - while (moreTrailers) { - trailer--; - bool trailerStatus = errorcheck.checkTrailer(errorsInEvent, fedId, nWords, trailer, errors_); - moreTrailers = trailerStatus; - } - - const cms_uint32_t * bw = (const cms_uint32_t *)(header+1); - const cms_uint32_t * ew = (const cms_uint32_t *)(trailer); - - assert(0 == (ew-bw)%2); - wordFedAppender.initializeWordFed(fedId, wordCounterGPU, bw, (ew-bw)); - wordCounterGPU+=(ew-bw); - - } // end of for loop - - gpuAlgo_.makeClustersAsync(gpuMap, gpuModulesToUnpack, hgains->getGPUProductAsync(cudaStream), - wordFedAppender, - wordCounterGPU, fedCounter, convertADCtoElectrons, - useQuality, includeErrors, enableTransfer_, debug, cudaStream); -} - -void SiPixelRawToClusterHeterogeneous::produceGPUCuda(edm::HeterogeneousEvent& ev, const edm::EventSetup& es, cuda::stream_t<>& cudaStream) { - auto output = std::make_unique(gpuAlgo_.getProduct()); - - if(enableConversion_) { - convertGPUtoCPU(ev.event(), output->nDigis, gpuAlgo_.getCPUData()); - } - - ev.put(std::move(output), heterogeneous::DisableTransfer{}); -} - -void SiPixelRawToClusterHeterogeneous::convertGPUtoCPU(edm::Event& ev, - unsigned int nDigis, - pixelgpudetails::SiPixelRawToClusterGPUKernel::CPUData digis_clusters_h) const { - // TODO: add the transfers here as well? 
- - auto collection = std::make_unique>(); - auto errorcollection = std::make_unique>(); - auto tkerror_detidcollection = std::make_unique(); - auto usererror_detidcollection = std::make_unique(); - auto disabled_channelcollection = std::make_unique< edmNew::DetSetVector>(); - auto outputClusters = std::make_unique(); - - edm::DetSet * detDigis=nullptr; - for (uint32_t i = 0; i < nDigis; i++) { - if (digis_clusters_h.pdigi[i]==0) continue; - detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); - if ( (*detDigis).empty() ) (*detDigis).data.reserve(32); // avoid the first relocations - break; - } - - int32_t nclus=-1; - std::vector aclusters(1024); - auto totCluseFilled=0; - - auto fillClusters = [&](uint32_t detId){ - if (nclus<0) return; // this in reality should never happen - edmNew::DetSetVector::FastFiller spc(*outputClusters, detId); - auto layer = (DetId(detId).subdetId()==1) ? ttopo_->pxbLayer(detId) : 0; - auto clusterThreshold = (layer==1) ? 2000 : 4000; - for (int32_t ic=0; ic9000) continue; // not in cluster - assert(digis_clusters_h.rawIdArr[i] > 109999); - if ( (*detDigis).detId() != digis_clusters_h.rawIdArr[i]) - { - fillClusters((*detDigis).detId()); - assert(nclus==-1); - detDigis = &collection->find_or_insert(digis_clusters_h.rawIdArr[i]); - if ( (*detDigis).empty() ) - (*detDigis).data.reserve(32); // avoid the first relocations - else { std::cout << "Problem det present twice in input! 
" << (*detDigis).detId() << std::endl; } - } - (*detDigis).data.emplace_back(digis_clusters_h.pdigi[i]); - auto const & dig = (*detDigis).data.back(); - // fill clusters - assert(digis_clusters_h.clus[i]>=0); - assert(digis_clusters_h.clus[i]<1024); - nclus = std::max(digis_clusters_h.clus[i],nclus); - auto row = dig.row(); - auto col = dig.column(); - SiPixelCluster::PixelPos pix(row,col); - aclusters[digis_clusters_h.clus[i]].add(pix, digis_clusters_h.adc[i]); - } - - // fill final clusters - fillClusters((*detDigis).detId()); - //std::cout << "filled " << totCluseFilled << " clusters" << std::endl; - - PixelDataFormatter formatter(cabling_.get(), usePhase1); // for phase 1 & 0 - auto errors = errors_; // make a copy - PixelDataFormatter::DetErrors nodeterrors; - - auto size = digis_clusters_h.error->size(); - for (auto i = 0; i < size; i++) { - pixelgpudetails::error_obj err = (*digis_clusters_h.error)[i]; - if (err.errorType != 0) { - SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); - errors[err.rawId].push_back(error); - } - } - - // pack errors into collection - if (includeErrors) { - - typedef PixelDataFormatter::Errors::iterator IE; - for (IE is = errors.begin(); is != errors.end(); is++) { - - uint32_t errordetid = is->first; - if (errordetid == dummydetid) {// errors given dummy detId must be sorted by Fed - nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() ); - } - else { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(errordetid); - errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); - // Fill detid of the detectors where there is error AND the error number is listed - // in the configurable error list in the job option cfi. 
- // Code needs to be here, because there can be a set of errors for each - // entry in the for loop over PixelDataFormatter::Errors - - std::vector disabledChannelsDetSet; - - for (auto const& aPixelError : errorDetSet) { - // For the time being, we extend the error handling functionality with ErrorType 25 - // In the future, we should sort out how the usage of tkerrorlist can be generalized - if (aPixelError.getType() == 25) { - int fedId = aPixelError.getFedId(); - const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); - if (fed) { - cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); - const sipixelobjects::PixelFEDLink* link = fed->link(linkId); - if (link) { - // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it - // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme - PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; - for (unsigned int iRoc = 1; iRoc <= link->numberOfROCs(); iRoc++) { - const sipixelobjects::PixelROC * roc = link->roc(iRoc); - if (roc->idInDetUnit() < ch.roc_first) ch.roc_first = roc->idInDetUnit(); - if (roc->idInDetUnit() > ch.roc_last) ch.roc_last = roc->idInDetUnit(); - } - if (ch.roc_firstpush_back(errordetid); - } - } - } - - // fill list of detIds with errors to be studied - if (!usererrorlist.empty()) { - auto it_find = std::find(usererrorlist.begin(), usererrorlist.end(), aPixelError.getType()); - if (it_find != usererrorlist.end()) { - usererror_detidcollection->push_back(errordetid); - } - } - - } // loop on DetSet of errors - - if (!disabledChannelsDetSet.empty()) { - disabled_channelcollection->insert(errordetid, disabledChannelsDetSet.data(), disabledChannelsDetSet.size()); - } - - } // if error assigned to a real DetId - } // loop on errors in event for this FED - } // if errors to be included in the event - - if (includeErrors) { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(dummydetid); 
- errorDetSet.data = nodeterrors; - } - - ev.put(std::move(collection)); - if(includeErrors){ - ev.put(std::move(errorcollection)); - ev.put(std::move(tkerror_detidcollection)); - ev.put(std::move(usererror_detidcollection), "UserErrorModules"); - ev.put(std::move(disabled_channelcollection)); - } - ev.put(std::move(outputClusters)); -} - -// define as framework plugin -DEFINE_FWK_MODULE(SiPixelRawToClusterHeterogeneous); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h b/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h deleted file mode 100644 index 3b81e4a16f017..0000000000000 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h +++ /dev/null @@ -1,47 +0,0 @@ -#ifndef EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h -#define EventFilter_SiPixelRawToDigi_siPixelRawToClusterHeterogeneousProduct_h - -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" -#include "FWCore/Utilities/interface/typedefs.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" -#include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" - -namespace siPixelRawToClusterHeterogeneousProduct { - using CPUProduct = int; // dummy... 
- - struct error_obj { - uint32_t rawId; - uint32_t word; - unsigned char errorType; - unsigned char fedId; - }; - - // FIXME split in two - struct GPUProduct { - GPUProduct() = default; - GPUProduct(const GPUProduct&) = delete; - GPUProduct& operator=(const GPUProduct&) = delete; - GPUProduct(GPUProduct&&) = default; - GPUProduct& operator=(GPUProduct&&) = default; - - GPUProduct(SiPixelDigisCUDA&& digis, - SiPixelClustersCUDA&& clusters, - uint32_t ndig, uint32_t nmod, uint32_t nclus): - digis_d(std::move(digis)), clusters_d(std::move(clusters)), - nDigis(ndig), nModules(nmod), nClusters(nclus) - {} - - SiPixelDigisCUDA digis_d; - SiPixelClustersCUDA clusters_d; - - uint32_t nDigis; - uint32_t nModules; - uint32_t nClusters; - }; - - using HeterogeneousDigiCluster = HeterogeneousProductImpl, - heterogeneous::GPUCudaProduct >; -} - -#endif diff --git a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py index bb0bb85697a99..b95d97f227aa4 100644 --- a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py +++ b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py @@ -5,7 +5,5 @@ siPixelClustersPreSplitting = _siPixelClusters.clone() from Configuration.ProcessModifiers.gpu_cff import gpu -from RecoLocalTracker.SiPixelClusterizer.siPixelClustersHeterogeneous_cfi import siPixelClustersHeterogeneous as _siPixelClustersHeterogeneous -from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import * -from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import * -gpu.toReplaceWith(siPixelClustersPreSplitting, _siPixelClustersHeterogeneous.clone()) +from RecoLocalTracker.SiPixelClusterizer.siPixelClustersFromSoA_cfi import siPixelClustersFromSoA as _siPixelClustersFromSoA +gpu.toReplaceWith(siPixelClustersPreSplitting, _siPixelClustersFromSoA.clone()) diff --git 
a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py new file mode 100644 index 0000000000000..547fb061f96bf --- /dev/null +++ b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py @@ -0,0 +1,18 @@ +import FWCore.ParameterSet.Config as cms + +from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizerPreSplitting_cfi import siPixelClustersPreSplitting +from RecoLocalTracker.SiPixelClusterizer.siPixelRawToClusterCUDA_cfi import siPixelRawToClusterCUDA as _siPixelRawToClusterCUDA +from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import * +from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import * + +siPixelClustersPreSplittingTask = cms.Task(siPixelClustersPreSplitting) + +siPixelClustersCUDAPreSplitting = _siPixelRawToClusterCUDA.clone() +siPixelClustersPreSplittingTaskCUDA = cms.Task( + siPixelClustersCUDAPreSplitting, +) + +from Configuration.ProcessModifiers.gpu_cff import gpu +_siPixelClustersPreSplittingTask_gpu = siPixelClustersPreSplittingTask.copy() +_siPixelClustersPreSplittingTask_gpu.add(siPixelClustersPreSplittingTaskCUDA) +gpu.toReplaceWith(siPixelClustersPreSplittingTask, _siPixelClustersPreSplittingTask_gpu) diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu index 947cd20d97919..80be13dedd26b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.cu @@ -129,18 +129,19 @@ namespace pixelgpudetails { #endif } - void PixelRecHitGPUKernel::makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input, + void PixelRecHitGPUKernel::makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, float const * bs, pixelCPEforGPU::ParamsOnGPU const * cpeParams, bool transferToCPU, 
cuda::stream_t<>& stream) { cudaCheck(cudaMemcpyAsync(gpu_.bs_d, bs, 3 * sizeof(float), cudaMemcpyDefault, stream.id())); - gpu_.hitsModuleStart_d = input.clusters_d.clusModuleStart(); + gpu_.hitsModuleStart_d = clusters_d.clusModuleStart(); gpu_.cpeParams = cpeParams; // copy it for use in clients cudaCheck(cudaMemcpyAsync(gpu_d, &gpu_, sizeof(HitsOnGPU), cudaMemcpyDefault, stream.id())); int threadsPerBlock = 256; - int blocks = input.nModules; // active modules (with digis) + int blocks = digis_d.nModules(); // active modules (with digis) #ifdef GPU_DEBUG std::cout << "launching getHits kernel for " << blocks << " blocks" << std::endl; @@ -148,12 +149,12 @@ namespace pixelgpudetails { gpuPixelRecHits::getHits<<>>( cpeParams, gpu_.bs_d, - input.digis_d.moduleInd(), - input.digis_d.xx(), input.digis_d.yy(), input.digis_d.adc(), - input.clusters_d.moduleStart(), - input.clusters_d.clusInModule(), input.clusters_d.moduleId(), - input.clusters_d.clus(), - input.nDigis, + digis_d.moduleInd(), + digis_d.xx(), digis_d.yy(), digis_d.adc(), + clusters_d.moduleStart(), + clusters_d.clusInModule(), clusters_d.moduleId(), + digis_d.clus(), + digis_d.nDigis(), gpu_.hitsModuleStart_d, gpu_.charge_d, gpu_.detInd_d, @@ -170,7 +171,7 @@ namespace pixelgpudetails { cudaCheck(cudaGetLastError()); // needed only if hits on CPU are required... 
- nhits_ = input.nClusters; + nhits_ = clusters_d.nClusters(); if(transferToCPU) { cudaCheck(cudaMemcpyAsync(h_hitsModuleStart_, gpu_.hitsModuleStart_d, (gpuClustering::MaxNumModules+1) * sizeof(uint32_t), cudaMemcpyDefault, stream.id())); #ifdef GPU_DEBUG diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h index dcc80308c4463..49164d24ab335 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h +++ b/RecoLocalTracker/SiPixelRecHits/plugins/PixelRecHits.h @@ -1,7 +1,8 @@ #ifndef RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h #define RecoLocalTracker_SiPixelRecHits_plugins_PixelRecHits_h -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/gpuClusteringConstants.h" #include @@ -31,7 +32,8 @@ namespace pixelgpudetails { PixelRecHitGPUKernel& operator=(const PixelRecHitGPUKernel&) = delete; PixelRecHitGPUKernel& operator=(PixelRecHitGPUKernel&&) = delete; - void makeHitsAsync(const siPixelRawToClusterHeterogeneousProduct::GPUProduct& input, + void makeHitsAsync(SiPixelDigisCUDA const& digis_d, + SiPixelClustersCUDA const& clusters_d, float const * bs, pixelCPEforGPU::ParamsOnGPU const * cpeParams, bool transferToCPU, diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index 68f53a47157d4..40b80ff6ca4f6 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -1,3 +1,6 @@ +#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" +#include 
"CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" #include "DataFormats/BeamSpot/interface/BeamSpot.h" @@ -13,25 +16,25 @@ #include "FWCore/Utilities/interface/InputTag.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/Product/interface/HeterogeneousProduct.h" #include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEBase.h" #include "RecoLocalTracker/SiPixelRecHits/interface/PixelCPEFast.h" #include "RecoLocalTracker/Records/interface/TkPixelCPERecord.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" // TODO: we need a proper place for this header... 
- #include "PixelRecHits.h" // TODO : spit product from kernel +#include + class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer > { public: - using Input = siPixelRawToClusterHeterogeneousProduct::HeterogeneousDigiCluster; - using CPUProduct = siPixelRecHitsHeterogeneousProduct::CPUProduct; using GPUProduct = siPixelRecHitsHeterogeneousProduct::GPUProduct; using Output = siPixelRecHitsHeterogeneousProduct::HeterogeneousPixelRecHit; @@ -62,7 +65,9 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer tBeamSpot; - edm::EDGetTokenT token_; + // The mess with inputs will be cleaned up when migrating to the new framework + edm::EDGetTokenT> token_; + edm::EDGetTokenT> tokenDigi_; edm::EDGetTokenT clusterToken_; std::string cpeName_; @@ -78,8 +83,8 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer(iConfig.getParameter("beamSpot"))), - token_(consumesHeterogeneous(iConfig.getParameter("heterogeneousSrc"))), - clusterToken_(consumes(iConfig.getParameter("src"))), + token_(consumes>(iConfig.getParameter("heterogeneousSrc"))), + tokenDigi_(consumes>(iConfig.getParameter("heterogeneousSrc"))), cpeName_(iConfig.getParameter("CPE")) { enableConversion_ = iConfig.getParameter("gpuEnableConversion"); @@ -87,6 +92,7 @@ SiPixelRecHitHeterogeneous::SiPixelRecHitHeterogeneous(const edm::ParameterSet& produces(); if(enableConversion_) { + clusterToken_ = consumes(iConfig.getParameter("src")); produces(); } } @@ -95,7 +101,7 @@ void SiPixelRecHitHeterogeneous::fillDescriptions(edm::ConfigurationDescriptions edm::ParameterSetDescription desc; desc.add("beamSpot", edm::InputTag("offlineBeamSpot")); - desc.add("heterogeneousSrc", edm::InputTag("siPixelClustersPreSplitting")); + desc.add("heterogeneousSrc", edm::InputTag("siPixelClustersCUDAPreSplitting")); desc.add("src", edm::InputTag("siPixelClustersPreSplitting")); desc.add("CPE", "PixelCPEFast"); @@ -118,16 +124,7 @@ void SiPixelRecHitHeterogeneous::initialize(const edm::EventSetup& es) { } 
void SiPixelRecHitHeterogeneous::produceCPU(edm::HeterogeneousEvent& iEvent, const edm::EventSetup& iSetup) { - initialize(iSetup); - - edm::Handle hclusters; - iEvent.getByToken(clusterToken_, hclusters); - - auto output = std::make_unique(); - run(hclusters, *output); - - output->shrink_to_fit(); - iEvent.put(std::move(output)); + throw cms::Exception("NotImplemented") << "CPU version is no longer implemented"; } void SiPixelRecHitHeterogeneous::run(const edm::Handle& inputhandle, SiPixelRecHitCollectionNew &output) const { @@ -174,8 +171,28 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i throw cms::Exception("Configuration") << "too bad, not a fast cpe gpu processing not possible...."; } - edm::Handle hinput; - iEvent.getByToken(token_, hinput); + edm::Handle> hclusters; + iEvent.getByToken(token_, hclusters); + // temporary check (until the migration) + edm::Service cs; + assert(hclusters->device() == cs->getCurrentDevice()); + CUDAScopedContext ctx{*hclusters}; + auto const& clusters = ctx.get(*hclusters); + + edm::Handle> hdigis; + iEvent.getByToken(tokenDigi_, hdigis); + auto const& digis = ctx.get(*hdigis); + + // We're processing in a stream given by base class, so need to + // synchronize explicitly (implementation is from + // CUDAScopedContext). In practice these should not be needed + // (because of synchronizations upstream), but let's play generic. 
+ if(not hclusters->event().has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hclusters->event().id(), 0)); + } + if(not hdigis->event().has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), hdigis->event().id(), 0)); + } edm::Handle bsHandle; iEvent.getByToken( tBeamSpot, bsHandle); @@ -185,8 +202,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i bs[0]=bsh.x0(); bs[1]=bsh.y0(); bs[2]=bsh.z0(); } - - gpuAlgo_->makeHitsAsync(*hinput, bs, fcpe->getGPUProductAsync(cudaStream), enableTransfer_, cudaStream); + gpuAlgo_->makeHitsAsync(digis, clusters, bs, fcpe->getGPUProductAsync(cudaStream), enableTransfer_, cudaStream); } diff --git a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py index 15224adb78cc3..58935e9a6991c 100644 --- a/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py +++ b/RecoPixelVertexing/Configuration/python/customizePixelTracksForProfiling.py @@ -21,7 +21,6 @@ def customizePixelTracksForProfilingDisableConversion(process): process = customizePixelTracksForProfiling(process) # Disable conversions to legacy - process.siPixelClustersPreSplitting.gpuEnableConversion = False process.siPixelRecHitsPreSplitting.gpuEnableConversion = False process.pixelTracksHitQuadruplets.gpuEnableConversion = False process.pixelTracks.gpuEnableConversion = False @@ -33,7 +32,6 @@ def customizePixelTracksForProfilingDisableTransfer(process): process = customizePixelTracksForProfilingDisableConversion(process) # Disable "unnecessary" transfers to CPU - process.siPixelClustersPreSplitting.gpuEnableTransfer = False process.siPixelRecHitsPreSplitting.gpuEnableTransfer = False process.pixelTracksHitQuadruplets.gpuEnableTransfer = False process.pixelVertices.gpuEnableTransfer = False diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu
b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu index b402daef07a05..dfa08c1fa2043 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.cu @@ -3,8 +3,6 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h" -#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" -#include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudastdAlgorithm.h" #include "RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h" #include "RecoLocalTracker/SiPixelRecHits/interface/pixelCPEforGPU.h" @@ -14,7 +12,7 @@ using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; __global__ -void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, const SiPixelClustersCUDA::DeviceConstView *cc, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) +void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, clusterSLOnGPU::HitsOnGPU const * hhp, ClusterSLGPU const * slp, uint32_t n) { assert(slp == slp->me_d); @@ -35,7 +33,7 @@ void simLink(const SiPixelDigisCUDA::DeviceConstView *dd, uint32_t ndigis, const auto ch = pixelgpudetails::pixelToChannel(dd->xx(i), dd->yy(i)); auto first = hh.hitsModuleStart_d[id]; - auto cl = first + cc->clus(i); + auto cl = first + dd->clus(i); assert(cl < 2000 * blockDim.x); const std::array me{{id, ch, 0, 0}}; @@ -162,7 +160,7 @@ namespace clusterSLOnGPU { cudaCheck(cudaMemsetAsync(slgpu.n2_d, 0, (ClusterSLGPU::MaxNumModules*256)*sizeof(uint32_t), stream)); } - void Kernel::algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) { + void Kernel::algo(SiPixelDigisCUDA const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream) { zero(stream.id()); ClusterSLGPU const & 
sl = slgpu; @@ -177,7 +175,7 @@ namespace clusterSLOnGPU { blocks = (ndigis + threadsPerBlock - 1) / threadsPerBlock; assert(sl.me_d); - simLink<<>>(dd.digis_d.view(), ndigis, dd.clusters_d.view(), hh.gpu_d, sl.me_d, n); + simLink<<>>(dd.view(), ndigis, hh.gpu_d, sl.me_d, n); cudaCheck(cudaGetLastError()); if (doDump) { diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h index 00b0e34b301c8..23976cb418e16 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterSLOnGPU.h @@ -4,8 +4,8 @@ #include #include +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "trackerHitAssociationHeterogeneousProduct.h" @@ -15,7 +15,6 @@ namespace clusterSLOnGPU { using ClusterSLGPU = trackerHitAssociationHeterogeneousProduct::ClusterSLGPU; using GPUProduct = trackerHitAssociationHeterogeneousProduct::GPUProduct; - using DigisOnGPU = siPixelRawToClusterHeterogeneousProduct::GPUProduct; using HitsOnGPU = siPixelRecHitsHeterogeneousProduct::HitsOnGPU; using HitsOnCPU = siPixelRecHitsHeterogeneousProduct::HitsOnCPU; @@ -23,7 +22,7 @@ namespace clusterSLOnGPU { public: Kernel(cuda::stream_t<>& stream, bool dump); ~Kernel() {deAlloc();} - void algo(DigisOnGPU const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream); + void algo(SiPixelDigisCUDA const & dd, uint32_t ndigis, HitsOnCPU const & hh, uint32_t nhits, uint32_t n, cuda::stream_t<>& stream); GPUProduct getProduct() { return GPUProduct{slgpu.me_d};} private: diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc 
b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc index a5e0f403adcad..188573b6d6d26 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc @@ -4,6 +4,8 @@ #include +#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/Common/interface/DetSetVector.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" #include "DataFormats/Common/interface/Handle.h" @@ -22,15 +24,16 @@ #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ServiceRegistry/interface/Service.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/InputTag.h" #include "Geometry/Records/interface/TrackerDigiGeometryRecord.h" #include "Geometry/TrackerGeometryBuilder/interface/TrackerGeometry.h" #include "HeterogeneousCore/CUDACore/interface/GPUCuda.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/Producer/interface/HeterogeneousEDProducer.h" -#include "RecoLocalTracker/SiPixelClusterizer/plugins/siPixelRawToClusterHeterogeneousProduct.h" #include "RecoLocalTracker/SiPixelRecHits/plugins/siPixelRecHitsHeterogeneousProduct.h" #include "SimDataFormats/Track/interface/SimTrackContainer.h" #include "SimDataFormats/TrackerDigiSimLink/interface/PixelDigiSimLink.h" @@ -52,7 +55,6 @@ class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer> phase2OTClustersToken_; edm::EDGetTokenT trackingParticleToken_; - edm::EDGetTokenT tGpuDigis; + edm::EDGetTokenT> tGpuDigis; edm::EDGetTokenT 
tGpuHits; std::unique_ptr gpuAlgo; @@ -111,7 +113,7 @@ ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm:: stripClustersToken_(consumes>(cfg.getParameter("stripClusterSrc"))), phase2OTClustersToken_(consumes>(cfg.getParameter("phase2OTClusterSrc"))), trackingParticleToken_(consumes(cfg.getParameter("trackingParticleSrc"))), - tGpuDigis(consumesHeterogeneous(cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), + tGpuDigis(consumes>(cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), tGpuHits(consumesHeterogeneous(cfg.getParameter("heterogeneousPixelRecHitSrc"))), doDump(cfg.getParameter("dumpCSV")) { @@ -128,7 +130,7 @@ void ClusterTPAssociationHeterogeneous::fillDescriptions(edm::ConfigurationDescr desc.add("stripClusterSrc", edm::InputTag("siStripClusters")); desc.add("phase2OTClusterSrc", edm::InputTag("siPhase2Clusters")); desc.add("trackingParticleSrc", edm::InputTag("mix", "MergedTrackTruth")); - desc.add("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersPreSplitting")); + desc.add("heterogeneousPixelDigiClusterSrc", edm::InputTag("siPixelClustersCUDAPreSplitting")); desc.add("heterogeneousPixelRecHitSrc", edm::InputTag("siPixelRecHitsPreSplitting")); desc.add("dumpCSV", false); @@ -184,13 +186,27 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE // gpu stuff ------------------------ - edm::Handle gd; + edm::Handle> gd; + iEvent.getByToken(tGpuDigis, gd); + // temporary check (until the migration) + edm::Service cs; + assert(gd->device() == cs->getCurrentDevice()); + + CUDAScopedContext ctx{*gd}; + auto const &gDigis = ctx.get(*gd); + + // We're processing in a stream given by base class, so need to + // synchronize explicitly (implementation is from + // CUDAScopedContext). In practice these should not be needed + // (because of synchronizations upstream), but let's play generic. 
+ if(not gd->event().has_occurred()) { + cudaCheck(cudaStreamWaitEvent(cudaStream.id(), gd->event().id(), 0)); + } + edm::Handle gh; - iEvent.getByToken(tGpuDigis, gd); iEvent.getByToken(tGpuHits, gh); - auto const & gDigis = *gd; auto const & gHits = *gh; - auto ndigis = gDigis.nDigis; + auto ndigis = gDigis.nDigis(); auto nhits = gHits.nHits; digi2tp.clear(); From 946f294e988d0e5ba2bf4bd48ed0bfc1bab05de5 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 27 Dec 2018 17:16:01 +0100 Subject: [PATCH 21/49] Move {device,host}::unique_ptr to HeterogeneousCore/CUDAUtilities --- .../Common/interface/device_unique_ptr.h | 29 --------------- .../Common/interface/host_unique_ptr.h | 29 --------------- .../interface/SiPixelClustersCUDA.h | 14 ++++---- .../SiPixelDigi/interface/SiPixelDigisCUDA.h | 36 +++++++++---------- .../SiPixelDigi/src/SiPixelDigisCUDA.cc | 8 ++--- .../plugins/SiPixelDigisSoAFromCUDA.cc | 12 +++---- .../CUDAServices/interface/CUDAService.h | 20 +++++------ .../CUDAServices/src/CUDAService.cc | 4 +-- .../CUDATest/interface/CUDAThing.h | 6 ++-- .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 2 +- .../plugins/TestCUDAProducerGPUFirst.cc | 2 +- .../plugins/TestCUDAProducerGPUKernel.cu | 2 +- .../plugins/TestCUDAProducerGPUKernel.h | 6 ++-- .../plugins/TestCUDAProducerGPUtoCPU.cc | 2 +- .../CUDAUtilities/interface/copyAsync.h | 12 +++---- .../interface/device_unique_ptr.h | 27 ++++++++++++++ .../CUDAUtilities/interface/host_unique_ptr.h | 27 ++++++++++++++ .../CUDAUtilities/interface/memsetAsync.h | 6 ++-- .../SiPixelFedCablingMapGPUWrapper.h | 5 ++- .../plugins/SiPixelRawToClusterCUDA.cc | 2 +- .../plugins/SiPixelRawToClusterGPUKernel.h | 10 +++--- .../src/SiPixelFedCablingMapGPUWrapper.cc | 2 +- 22 files changed, 129 insertions(+), 134 deletions(-) delete mode 100644 CUDADataFormats/Common/interface/device_unique_ptr.h delete mode 100644 CUDADataFormats/Common/interface/host_unique_ptr.h create mode 100644 
HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h create mode 100644 HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h diff --git a/CUDADataFormats/Common/interface/device_unique_ptr.h b/CUDADataFormats/Common/interface/device_unique_ptr.h deleted file mode 100644 index de7474d26a67a..0000000000000 --- a/CUDADataFormats/Common/interface/device_unique_ptr.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef CUDADataFormats_Common_interface_device_unique_ptr_h -#define CUDADataFormats_Common_interface_device_unique_ptr_h - -#include -#include - -namespace edm { - namespace cuda { - namespace device { - namespace impl { - // Additional layer of types to distinguish from host::unique_ptr - class DeviceDeleter { - public: - DeviceDeleter() = default; - explicit DeviceDeleter(std::function f): f_(f) {} - - void operator()(void *ptr) { f_(ptr); } - private: - std::function f_; - }; - } - - template - using unique_ptr = std::unique_ptr; - } - } -} - -#endif diff --git a/CUDADataFormats/Common/interface/host_unique_ptr.h b/CUDADataFormats/Common/interface/host_unique_ptr.h deleted file mode 100644 index a9eee938b165d..0000000000000 --- a/CUDADataFormats/Common/interface/host_unique_ptr.h +++ /dev/null @@ -1,29 +0,0 @@ -#ifndef CUDADataFormats_Common_interface_host_unique_ptr_h -#define CUDADataFormats_Common_interface_host_unique_ptr_h - -#include -#include - -namespace edm { - namespace cuda { - namespace host { - namespace impl { - // Additional layer of types to distinguish from host::unique_ptr - class HostDeleter { - public: - HostDeleter() = default; - explicit HostDeleter(std::function f): f_(f) {} - - void operator()(void *ptr) { f_(ptr); } - private: - std::function f_; - }; - } - - template - using unique_ptr = std::unique_ptr; - } - } -} - -#endif diff --git a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h index 5a780f5b70f65..f25a8a25f0808 100644 --- 
a/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h +++ b/CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h @@ -1,8 +1,8 @@ #ifndef CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h #define CUDADataFormats_SiPixelCluster_interface_SiPixelClustersCUDA_h -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include @@ -61,14 +61,14 @@ class SiPixelClustersCUDA { DeviceConstView *view() const { return view_d.get(); } private: - edm::cuda::device::unique_ptr moduleStart_d; // index of the first pixel of each module - edm::cuda::device::unique_ptr clusInModule_d; // number of clusters found in each module - edm::cuda::device::unique_ptr moduleId_d; // module id of each module + cudautils::device::unique_ptr moduleStart_d; // index of the first pixel of each module + cudautils::device::unique_ptr clusInModule_d; // number of clusters found in each module + cudautils::device::unique_ptr moduleId_d; // module id of each module // originally from rechits - edm::cuda::device::unique_ptr clusModuleStart_d; + cudautils::device::unique_ptr clusModuleStart_d; - edm::cuda::device::unique_ptr view_d; // "me" pointer + cudautils::device::unique_ptr view_d; // "me" pointer uint32_t nClusters_h; }; diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h index 9f7b2d8e62178..44785618249d9 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -1,8 +1,8 @@ #ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h #define CUDADataFormats_SiPixelDigi_interface_SiPixelDigisCUDA_h -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" -#include 
"CUDADataFormats/Common/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "DataFormats/SiPixelDigi/interface/PixelErrors.h" #include "FWCore/Utilities/interface/propagate_const.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" @@ -59,12 +59,12 @@ class SiPixelDigisCUDA { uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } GPU::SimpleVector const *c_error() const { return error_d.get(); } - edm::cuda::host::unique_ptr adcToHostAsync(cuda::stream_t<>& stream) const; - edm::cuda::host::unique_ptr< int32_t[]> clusToHostAsync(cuda::stream_t<>& stream) const; - edm::cuda::host::unique_ptr pdigiToHostAsync(cuda::stream_t<>& stream) const; - edm::cuda::host::unique_ptr rawIdArrToHostAsync(cuda::stream_t<>& stream) const; + cudautils::host::unique_ptr adcToHostAsync(cuda::stream_t<>& stream) const; + cudautils::host::unique_ptr< int32_t[]> clusToHostAsync(cuda::stream_t<>& stream) const; + cudautils::host::unique_ptr pdigiToHostAsync(cuda::stream_t<>& stream) const; + cudautils::host::unique_ptr rawIdArrToHostAsync(cuda::stream_t<>& stream) const; - using HostDataError = std::pair, GPU::SimpleVector const *>; + using HostDataError = std::pair, GPU::SimpleVector const *>; HostDataError dataErrorToHostAsync(cuda::stream_t<>& stream) const; void copyErrorToHostAsync(cuda::stream_t<>& stream); @@ -95,23 +95,23 @@ class SiPixelDigisCUDA { private: // These are consumed by downstream device code - edm::cuda::device::unique_ptr xx_d; // local coordinates of each pixel - edm::cuda::device::unique_ptr yy_d; // - edm::cuda::device::unique_ptr adc_d; // ADC of each pixel - edm::cuda::device::unique_ptr moduleInd_d; // module id of each pixel - edm::cuda::device::unique_ptr clus_d; // cluster id of each pixel - edm::cuda::device::unique_ptr view_d; // "me" pointer + cudautils::device::unique_ptr xx_d; // local 
coordinates of each pixel + cudautils::device::unique_ptr yy_d; // + cudautils::device::unique_ptr adc_d; // ADC of each pixel + cudautils::device::unique_ptr moduleInd_d; // module id of each pixel + cudautils::device::unique_ptr clus_d; // cluster id of each pixel + cudautils::device::unique_ptr view_d; // "me" pointer // These are for CPU output; should we (eventually) place them to a // separate product? - edm::cuda::device::unique_ptr pdigi_d; - edm::cuda::device::unique_ptr rawIdArr_d; + cudautils::device::unique_ptr pdigi_d; + cudautils::device::unique_ptr rawIdArr_d; // These are for error CPU output; should we (eventually) place them // to a separate product? - edm::cuda::device::unique_ptr data_d; - edm::cuda::device::unique_ptr> error_d; - edm::cuda::host::unique_ptr> error_h; + cudautils::device::unique_ptr data_d; + cudautils::device::unique_ptr> error_d; + cudautils::host::unique_ptr> error_h; PixelFormatterErrors formatterErrors_h; uint32_t nModules_h = 0; diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc index 1f8f782c0a0e2..30095d7985765 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -44,28 +44,28 @@ SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, bool includeErrors, cuda: } } -edm::cuda::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cuda::stream_t<>& stream) const { +cudautils::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cuda::stream_t<>& stream) const { edm::Service cs; auto ret = cs->make_host_unique(nDigis(), stream); cudautils::copyAsync(ret, adc_d, nDigis(), stream); return ret; } -edm::cuda::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cuda::stream_t<>& stream) const { +cudautils::host::unique_ptr SiPixelDigisCUDA::clusToHostAsync(cuda::stream_t<>& stream) const { edm::Service cs; auto ret = cs->make_host_unique(nDigis(), stream); cudautils::copyAsync(ret, clus_d, nDigis(), 
stream); return ret; } -edm::cuda::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cuda::stream_t<>& stream) const { +cudautils::host::unique_ptr SiPixelDigisCUDA::pdigiToHostAsync(cuda::stream_t<>& stream) const { edm::Service cs; auto ret = cs->make_host_unique(nDigis(), stream); cudautils::copyAsync(ret, pdigi_d, nDigis(), stream); return ret; } -edm::cuda::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cuda::stream_t<>& stream) const { +cudautils::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cuda::stream_t<>& stream) const { edm::Service cs; auto ret = cs->make_host_unique(nDigis(), stream); cudautils::copyAsync(ret, rawIdArr_d, nDigis(), stream); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index 90d1649128836..4f92f7fe7856f 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -1,4 +1,3 @@ -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "CUDADataFormats/Common/interface/CUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" @@ -10,6 +9,7 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" class SiPixelDigisSoAFromCUDA: public edm::stream::EDProducer { @@ -26,12 +26,12 @@ class SiPixelDigisSoAFromCUDA: public edm::stream::EDProducer edm::EDGetTokenT> digiGetToken_; edm::EDPutTokenT digiPutToken_; - edm::cuda::host::unique_ptr pdigi_; - edm::cuda::host::unique_ptr rawIdArr_; - edm::cuda::host::unique_ptr adc_; - edm::cuda::host::unique_ptr< int32_t[]> clus_; + cudautils::host::unique_ptr pdigi_; + cudautils::host::unique_ptr 
rawIdArr_; + cudautils::host::unique_ptr adc_; + cudautils::host::unique_ptr< int32_t[]> clus_; - edm::cuda::host::unique_ptr data_; + cudautils::host::unique_ptr data_; const GPU::SimpleVector *error_ = nullptr; const PixelFormatterErrors *formatterErrors_ = nullptr; diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index 9101c7772d9ec..8a3e9f3ee8322 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -8,8 +8,8 @@ #include "FWCore/Utilities/interface/StreamID.h" -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" namespace edm { class ParameterSet; @@ -19,16 +19,16 @@ namespace edm { namespace cudaserviceimpl { template - struct make_device_unique_selector { using non_array = edm::cuda::device::unique_ptr; }; + struct make_device_unique_selector { using non_array = cudautils::device::unique_ptr; }; template - struct make_device_unique_selector { using unbounded_array = edm::cuda::device::unique_ptr; }; + struct make_device_unique_selector { using unbounded_array = cudautils::device::unique_ptr; }; template struct make_device_unique_selector { struct bounded_array {}; }; template - struct make_host_unique_selector { using non_array = edm::cuda::host::unique_ptr; }; + struct make_host_unique_selector { using non_array = cudautils::host::unique_ptr; }; template - struct make_host_unique_selector { using unbounded_array = edm::cuda::host::unique_ptr; }; + struct make_host_unique_selector { using unbounded_array = cudautils::host::unique_ptr; }; template struct make_host_unique_selector { struct bounded_array {}; }; } @@ -76,7 +76,7 @@ class CUDAService { int dev = getCurrentDevice(); void *mem = 
allocate_device(dev, sizeof(T), stream); return typename cudaserviceimpl::make_device_unique_selector::non_array(reinterpret_cast(mem), - edm::cuda::device::impl::DeviceDeleter([this, dev](void *ptr) { + cudautils::device::impl::DeviceDeleter([this, dev](void *ptr) { this->free_device(dev, ptr); })); } @@ -89,7 +89,7 @@ class CUDAService { int dev = getCurrentDevice(); void *mem = allocate_device(dev, n*sizeof(element_type), stream); return typename cudaserviceimpl::make_device_unique_selector::unbounded_array(reinterpret_cast(mem), - edm::cuda::device::impl::DeviceDeleter([this, dev](void *ptr) { + cudautils::device::impl::DeviceDeleter([this, dev](void *ptr) { this->free_device(dev, ptr); })); } @@ -105,7 +105,7 @@ class CUDAService { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); void *mem = allocate_host(sizeof(T), stream); return typename cudaserviceimpl::make_host_unique_selector::non_array(reinterpret_cast(mem), - edm::cuda::host::impl::HostDeleter([this](void *ptr) { + cudautils::host::impl::HostDeleter([this](void *ptr) { this->free_host(ptr); })); } @@ -117,7 +117,7 @@ class CUDAService { static_assert(std::is_trivially_constructible::value, "Allocating with non-trivial constructor on the pinned host memory is not supported"); void *mem = allocate_host(n*sizeof(element_type), stream); return typename cudaserviceimpl::make_host_unique_selector::unbounded_array(reinterpret_cast(mem), - edm::cuda::host::impl::HostDeleter([this](void *ptr) { + cudautils::host::impl::HostDeleter([this](void *ptr) { this->free_host(ptr); })); } diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index ad94d8837f68f..54b5dd4ab8b14 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -98,7 +98,7 @@ namespace { cudaCheck(cudaGetDevice(&device)); for(int i=0; 
i([&](size_t size, cuda::stream_t<>& stream) { + preallocate([&](size_t size, cuda::stream_t<>& stream) { return cs.make_device_unique(size, stream); }, bufferSizes); } @@ -106,7 +106,7 @@ namespace { } void hostPreallocate(CUDAService& cs, const std::vector& bufferSizes) { - preallocate([&](size_t size, cuda::stream_t<>& stream) { + preallocate([&](size_t size, cuda::stream_t<>& stream) { return cs.make_host_unique(size, stream); }, bufferSizes); } diff --git a/HeterogeneousCore/CUDATest/interface/CUDAThing.h b/HeterogeneousCore/CUDATest/interface/CUDAThing.h index 294374bba1ce5..ecda1f2aafdf6 100644 --- a/HeterogeneousCore/CUDATest/interface/CUDAThing.h +++ b/HeterogeneousCore/CUDATest/interface/CUDAThing.h @@ -1,19 +1,19 @@ #ifndef HeterogeneousCore_CUDATest_CUDAThing_H #define HeterogeneousCore_CUDATest_CUDAThing_H -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" class CUDAThing { public: CUDAThing() = default; - CUDAThing(edm::cuda::device::unique_ptr ptr): + CUDAThing(cudautils::device::unique_ptr ptr): ptr_(std::move(ptr)) {} const float *get() const { return ptr_.get(); } private: - edm::cuda::device::unique_ptr ptr_;; + cudautils::device::unique_ptr ptr_;; }; #endif diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index 20b9308694da4..ee78f91d0a233 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -27,7 +27,7 @@ class TestCUDAProducerGPUEW: public edm::stream::EDProducer { edm::EDPutTokenT> dstToken_; TestCUDAProducerGPUKernel gpuAlgo_; CUDAContextToken ctxTmp_; - edm::cuda::device::unique_ptr devicePtr_; + cudautils::device::unique_ptr devicePtr_; float hostData_ = 0.f; }; diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc 
b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc index b38d41a02e591..e99924f5cb755 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -41,7 +41,7 @@ void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID, edm::Event& iEven auto ctx = CUDAScopedContext(streamID); - edm::cuda::device::unique_ptr output = gpuAlgo_.runAlgo(label_, ctx.stream()); + cudautils::device::unique_ptr output = gpuAlgo_.runAlgo(label_, ctx.stream()); iEvent.put(ctx.wrap(CUDAThing(std::move(output)))); edm::LogPrint("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu index 933e5deea1266..b24dfb6642a41 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu @@ -54,7 +54,7 @@ namespace { } } -edm::cuda::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) const { +cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) const { // First make the sanity check if(d_input != nullptr) { auto h_check = std::make_unique(NUM_VALUES); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h index c8fe63fe2b2f0..d03c4c1e0dc70 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.h @@ -1,7 +1,7 @@ #ifndef HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h #define HeterogeneousCore_CUDACore_TestCUDAProducerGPUKernel_h -#include 
"CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include @@ -22,10 +22,10 @@ class TestCUDAProducerGPUKernel { ~TestCUDAProducerGPUKernel() = default; // returns (owning) pointer to device memory - edm::cuda::device::unique_ptr runAlgo(const std::string& label, cuda::stream_t<>& stream) const { + cudautils::device::unique_ptr runAlgo(const std::string& label, cuda::stream_t<>& stream) const { return runAlgo(label, nullptr, stream); } - edm::cuda::device::unique_ptr runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) const; + cudautils::device::unique_ptr runAlgo(const std::string& label, const float *d_input, cuda::stream_t<>& stream) const; }; #endif diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index af9c55a3b78aa..813d75ad69d09 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -26,7 +26,7 @@ class TestCUDAProducerGPUtoCPU: public edm::stream::EDProducer> srcToken_; - edm::cuda::host::unique_ptr buffer_; + cudautils::host::unique_ptr buffer_; }; TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig): diff --git a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h index c8016983070a2..fa0db60aea592 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/copyAsync.h @@ -1,8 +1,8 @@ #ifndef HeterogeneousCore_CUDAUtilities_copyAsync_h #define HeterogeneousCore_CUDAUtilities_copyAsync_h -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include 
"HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include @@ -12,7 +12,7 @@ namespace cudautils { // Single element template inline - void copyAsync(edm::cuda::device::unique_ptr& dst, const edm::cuda::host::unique_ptr& src, cuda::stream_t<>& stream) { + void copyAsync(cudautils::device::unique_ptr& dst, const cudautils::host::unique_ptr& src, cuda::stream_t<>& stream) { // Shouldn't compile for array types because of sizeof(T), but // let's add an assert with a more helpful message static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); @@ -21,7 +21,7 @@ namespace cudautils { template inline - void copyAsync(edm::cuda::host::unique_ptr& dst, const edm::cuda::device::unique_ptr& src, cuda::stream_t<>& stream) { + void copyAsync(cudautils::host::unique_ptr& dst, const cudautils::device::unique_ptr& src, cuda::stream_t<>& stream) { static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); cuda::memory::async::copy(dst.get(), src.get(), sizeof(T), stream.id()); } @@ -29,13 +29,13 @@ namespace cudautils { // Multiple elements template inline - void copyAsync(edm::cuda::device::unique_ptr& dst, const edm::cuda::host::unique_ptr& src, size_t nelements, cuda::stream_t<>& stream) { + void copyAsync(cudautils::device::unique_ptr& dst, const cudautils::host::unique_ptr& src, size_t nelements, cuda::stream_t<>& stream) { cuda::memory::async::copy(dst.get(), src.get(), nelements*sizeof(T), stream.id()); } template inline - void copyAsync(edm::cuda::host::unique_ptr& dst, const edm::cuda::device::unique_ptr& src, size_t nelements, cuda::stream_t<>& stream) { + void copyAsync(cudautils::host::unique_ptr& dst, const cudautils::device::unique_ptr& src, size_t nelements, cuda::stream_t<>& stream) { cuda::memory::async::copy(dst.get(), src.get(), nelements*sizeof(T), stream.id()); } } diff --git a/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h 
b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h new file mode 100644 index 0000000000000..06a0424450983 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h @@ -0,0 +1,27 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_device_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_device_unique_ptr_h + +#include +#include + +namespace cudautils { + namespace device { + namespace impl { + // Additional layer of types to distinguish from host::unique_ptr + class DeviceDeleter { + public: + DeviceDeleter() = default; + explicit DeviceDeleter(std::function f): f_(f) {} + + void operator()(void *ptr) { f_(ptr); } + private: + std::function f_; + }; + } + + template + using unique_ptr = std::unique_ptr; + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h new file mode 100644 index 0000000000000..2a39c475cbb91 --- /dev/null +++ b/HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h @@ -0,0 +1,27 @@ +#ifndef HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h +#define HeterogeneousCore_CUDAUtilities_interface_host_unique_ptr_h + +#include +#include + +namespace cudautils { + namespace host { + namespace impl { + // Additional layer of types to distinguish from device::unique_ptr + class HostDeleter { + public: + HostDeleter() = default; + explicit HostDeleter(std::function f): f_(f) {} + + void operator()(void *ptr) { f_(ptr); } + private: + std::function f_; + }; + } + + template + using unique_ptr = std::unique_ptr; + } +} + +#endif diff --git a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h index 5d4f6e10d747f..d87c50b666b61 100644 --- a/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h +++ b/HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h @@ -1,7 +1,7 @@ #ifndef HeterogeneousCore_CUDAUtilities_memsetAsync_h #define
HeterogeneousCore_CUDAUtilities_memsetAsync_h -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include @@ -10,7 +10,7 @@ namespace cudautils { template inline - void memsetAsync(edm::cuda::device::unique_ptr& ptr, T value, cuda::stream_t<>& stream) { + void memsetAsync(cudautils::device::unique_ptr& ptr, T value, cuda::stream_t<>& stream) { // Shouldn't compile for array types because of sizeof(T), but // let's add an assert with a more helpful message static_assert(std::is_array::value == false, "For array types, use the other overload with the size parameter"); @@ -25,7 +25,7 @@ namespace cudautils { */ template inline - void memsetAsync(edm::cuda::device::unique_ptr& ptr, int value, size_t nelements, cuda::stream_t<>& stream) { + void memsetAsync(cudautils::device::unique_ptr& ptr, int value, size_t nelements, cuda::stream_t<>& stream) { cuda::memory::device::async::set(ptr.get(), value, nelements*sizeof(T), stream.id()); } } diff --git a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h index e5e5f41053e3d..f0d996bd7310b 100644 --- a/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h +++ b/RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPUWrapper.h @@ -1,10 +1,9 @@ #ifndef RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h #define RecoLocalTracker_SiPixelClusterizer_SiPixelFedCablingMapGPUWrapper_h -#include "CUDADataFormats/Common/interface/device_unique_ptr.h" -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "HeterogeneousCore/CUDACore/interface/CUDAESProduct.h" #include "HeterogeneousCore/CUDAUtilities/interface/CUDAHostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include 
"RecoLocalTracker/SiPixelClusterizer/interface/SiPixelFedCablingMapGPU.h" #include @@ -30,7 +29,7 @@ class SiPixelFedCablingMapGPUWrapper { // returns pointer to GPU memory const unsigned char *getModToUnpAllAsync(cuda::stream_t<>& cudaStream) const; - edm::cuda::device::unique_ptr getModToUnpRegionalAsync(std::set const& modules, cuda::stream_t<>& cudaStream) const; + cudautils::device::unique_ptr getModToUnpRegionalAsync(std::set const& modules, cuda::stream_t<>& cudaStream) const; private: const SiPixelFedCablingMap *cablingMap_; diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc index 78cc8d2483709..a27fd420cddbe 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -122,7 +122,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event // get the GPU product already here so that the async transfer can begin const auto *gpuGains = hgains->getGPUProductAsync(ctx.stream()); - edm::cuda::device::unique_ptr modulesToUnpackRegional; + cudautils::device::unique_ptr modulesToUnpackRegional; const unsigned char *gpuModulesToUnpack; if(regions_) { diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 98f425d88908c..d35d0636a76d3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -5,11 +5,11 @@ #include #include "cuda/api_wrappers.h" -#include "CUDADataFormats/Common/interface/host_unique_ptr.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "FWCore/Utilities/interface/typedefs.h" #include 
"HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "DataFormats/SiPixelDigi/interface/PixelErrors.h" struct SiPixelFedCablingMapGPU; @@ -167,8 +167,8 @@ namespace pixelgpudetails { const unsigned char *fedId() const { return fedId_.get(); } private: - edm::cuda::host::unique_ptr word_; - edm::cuda::host::unique_ptr fedId_; + cudautils::host::unique_ptr word_; + cudautils::host::unique_ptr fedId_; }; SiPixelRawToClusterGPUKernel() = default; @@ -200,8 +200,8 @@ namespace pixelgpudetails { uint32_t nDigis = 0; // Data to be put in the event - edm::cuda::host::unique_ptr nModules_Clusters_h; - edm::cuda::host::unique_ptr> error_h; + cudautils::host::unique_ptr nModules_Clusters_h; + cudautils::host::unique_ptr> error_h; SiPixelDigisCUDA digis_d; SiPixelClustersCUDA clusters_d; }; diff --git a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc index b652100f69e9f..e8726100abe0e 100644 --- a/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc +++ b/RecoLocalTracker/SiPixelClusterizer/src/SiPixelFedCablingMapGPUWrapper.cc @@ -135,7 +135,7 @@ const unsigned char *SiPixelFedCablingMapGPUWrapper::getModToUnpAllAsync(cuda::s return data.modToUnpDefault; } -edm::cuda::device::unique_ptr SiPixelFedCablingMapGPUWrapper::getModToUnpRegionalAsync(std::set const& modules, cuda::stream_t<>& cudaStream) const { +cudautils::device::unique_ptr SiPixelFedCablingMapGPUWrapper::getModToUnpRegionalAsync(std::set const& modules, cuda::stream_t<>& cudaStream) const { edm::Service cs; auto modToUnpDevice = cs->make_device_unique(pixelgpudetails::MAX_SIZE, cudaStream); auto modToUnpHost = cs->make_host_unique(pixelgpudetails::MAX_SIZE, cudaStream); From 57461b51f5e09e4cb654359ce3113f11bbe42e4e Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 27 Dec 2018 18:07:34 +0100 
Subject: [PATCH 22/49] Cache and reuse CUDA streams within CUDAService --- CUDADataFormats/Common/test/test_CUDA.cc | 13 ++++++++++- .../CUDACore/interface/CUDAScopedContext.h | 15 ++++++++----- .../CUDACore/src/CUDAScopedContext.cc | 19 +++++++++++----- .../CUDACore/test/test_CUDAScopedContext.cc | 15 +++++++++++-- .../CUDAServices/interface/CUDAService.h | 8 +++++++ .../CUDAServices/src/CUDAService.cc | 22 +++++++++++++++++++ .../CUDAServices/test/testCUDAService.cpp | 8 +++---- 7 files changed, 81 insertions(+), 19 deletions(-) diff --git a/CUDADataFormats/Common/test/test_CUDA.cc b/CUDADataFormats/Common/test/test_CUDA.cc index a6c22833a10c4..7754cf2509cf2 100644 --- a/CUDADataFormats/Common/test/test_CUDA.cc +++ b/CUDADataFormats/Common/test/test_CUDA.cc @@ -6,6 +6,17 @@ #include +namespace cudatest { + class TestCUDAScopedContext { + public: + static + CUDAScopedContext make(int dev) { + auto device = cuda::device::get(dev); + return CUDAScopedContext(dev, std::make_unique>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream))); + } + }; +} + TEST_CASE("Use of CUDA template", "[CUDACore]") { SECTION("Default constructed") { auto foo = CUDA(); @@ -25,7 +36,7 @@ TEST_CASE("Use of CUDA template", "[CUDACore]") { constexpr int defaultDevice = 0; { - auto ctx = CUDAScopedContext(defaultDevice); + auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice); std::unique_ptr> dataPtr = ctx.wrap(10); auto& data = *dataPtr; diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index 85439e99a08e3..cb716e4007099 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -13,6 +13,10 @@ #include +namespace cudatest { + class TestCUDAScopedContext; +} + /** * The aim of this class is to do necessary per-event "initialization": * - setting the current device @@ -24,12 +28,6 @@ class 
CUDAScopedContext { public: explicit CUDAScopedContext(edm::StreamID streamID); - // This constructor takes the device as a parameter. It is mainly - // inteded for testing, but can be used for special cases if you - // really know what you're doing. Please use the StreamID overload - // if at all possible. - explicit CUDAScopedContext(int device); - explicit CUDAScopedContext(CUDAContextToken&& token): currentDevice_(token.device()), setDeviceForThisScope_(currentDevice_), @@ -108,6 +106,11 @@ class CUDAScopedContext { } private: + friend class cudatest::TestCUDAScopedContext; + + // This constructor is only meant for testing + explicit CUDAScopedContext(int device, std::unique_ptr> stream); + int currentDevice_; std::optional waitingTaskHolder_; cuda::device::current::scoped_override_t<> setDeviceForThisScope_; diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc index 702c01448db69..e57033a1d0968 100644 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -1,19 +1,26 @@ #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "FWCore/MessageLogger/interface/MessageLogger.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "chooseCUDADevice.h" -CUDAScopedContext::CUDAScopedContext(edm::StreamID streamID): CUDAScopedContext(cudacore::chooseCUDADevice(streamID)) {} -CUDAScopedContext::CUDAScopedContext(int device): - currentDevice_(device), - setDeviceForThisScope_(device) +CUDAScopedContext::CUDAScopedContext(edm::StreamID streamID): + currentDevice_(cudacore::chooseCUDADevice(streamID)), + setDeviceForThisScope_(currentDevice_) { - auto current_device = cuda::device::current::get(); - stream_ = std::make_shared>(current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)); + edm::Service cs; + stream_ =
cs->getCUDAStream(); } +CUDAScopedContext::CUDAScopedContext(int device, std::unique_ptr> stream): + currentDevice_(device), + setDeviceForThisScope_(device), + stream_(std::move(stream)) +{} + CUDAScopedContext::~CUDAScopedContext() { if(waitingTaskHolder_.has_value()) { stream_->enqueue.callback([device=currentDevice_, diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index ff18c068aedb6..e35c4b284802d 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -6,9 +6,20 @@ #include "test_CUDAScopedContextKernels.h" +namespace cudatest { + class TestCUDAScopedContext { + public: + static + CUDAScopedContext make(int dev) { + auto device = cuda::device::get(dev); + return CUDAScopedContext(dev, std::make_unique>(device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream))); + } + }; +} + namespace { std::unique_ptr > produce(int device, int *d, int *h) { - auto ctx = CUDAScopedContext(device); + auto ctx = cudatest::TestCUDAScopedContext::make(device); cuda::memory::async::copy(d, h, sizeof(int), ctx.stream().id()); testCUDAScopedContextKernels_single(d, ctx.stream()); @@ -28,7 +39,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { constexpr int defaultDevice = 0; { - auto ctx = CUDAScopedContext(defaultDevice); + auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice); SECTION("Construct from device ID") { REQUIRE(cuda::device::current::get().id() == defaultDevice); diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index 8a3e9f3ee8322..22e03a61f37d2 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -132,6 +132,10 @@ class CUDAService { // Free pinned host memory (to be called from unique_ptr) void free_host(void 
*ptr); + // Gets a (cached) CUDA stream for the current device. The stream + // will be returned to the cache by the shared_ptr destructor. + std::shared_ptr> getCUDAStream(); + private: // PIMPL to hide details of allocator struct Allocator; @@ -139,6 +143,10 @@ class CUDAService { void *allocate_device(int dev, size_t nbytes, cuda::stream_t<>& stream); void *allocate_host(size_t nbytes, cuda::stream_t<>& stream); + // PIMPL to hide details of the CUDA stream cache + struct CUDAStreamCache; + std::unique_ptr cudaStreamCache_; + int numberOfDevices_ = 0; unsigned int numberOfStreamsTotal_ = 0; std::vector> computeCapabilities_; diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 54b5dd4ab8b14..1902f3ff843a2 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -9,6 +9,7 @@ #include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/Utilities/interface/ReusableObjectHolder.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -336,6 +337,10 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& log << "cub::CachingDeviceAllocator disabled\n"; } + cudaStreamCache_ = std::make_unique(numberOfDevices_); + + log << "\n"; + log << "CUDAService fully initialized"; enabled_ = true; @@ -350,6 +355,7 @@ CUDAService::~CUDAService() { if(allocator_) { allocator_.reset(); } + cudaStreamCache_.reset(); for (int i = 0; i < numberOfDevices_; ++i) { cudaCheck(cudaSetDevice(i)); @@ -490,3 +496,19 @@ void CUDAService::free_host(void *ptr) { cuda::throw_if_error(cudaFreeHost(ptr)); } } + + +// CUDA stream cache +struct CUDAService::CUDAStreamCache { + explicit CUDAStreamCache(int ndev): 
cache(ndev) {} + + // Separate caches for each device for fast lookup + std::vector>> cache; +}; + +std::shared_ptr> CUDAService::getCUDAStream() { + return cudaStreamCache_->cache[getCurrentDevice()].makeOrGet([](){ + auto current_device = cuda::device::current::get(); + return std::make_unique>(current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)); + }); +} diff --git a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp index d0a1afcc8203f..4764d74c427c2 100644 --- a/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp +++ b/HeterogeneousCore/CUDAServices/test/testCUDAService.cpp @@ -179,8 +179,8 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { ps.addUntrackedParameter("allocator", alloc); auto cs = makeCUDAService(ps, ar); cs.setCurrentDevice(0); - auto current_device = cuda::device::current::get(); - auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + auto cudaStreamPtr = cs.getCUDAStream(); + auto& cudaStream = *cudaStreamPtr; SECTION("Destructor") { auto ptr = cs.make_device_unique(cudaStream); @@ -214,8 +214,8 @@ TEST_CASE("Tests of CUDAService", "[CUDAService]") { ps.addUntrackedParameter("allocator", alloc); auto cs = makeCUDAService(ps, ar); cs.setCurrentDevice(0); - auto current_device = cuda::device::current::get(); - auto cudaStream = current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream); + auto cudaStreamPtr = cs.getCUDAStream(); + auto& cudaStream = *cudaStreamPtr; SECTION("Destructor") { auto ptr = cs.make_host_unique(cudaStream); From bb7df45d5198ad62a953f7728b41e3a8e96be6d4 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 27 Dec 2018 18:10:10 +0100 Subject: [PATCH 23/49] Now I don't have to manually keep the CUDA stream alive --- .../SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc | 6 ------ .../plugins/SiPixelRawToClusterGPUKernel.h | 4 
++++ 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index 4f92f7fe7856f..06c4fab25763b 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -35,8 +35,6 @@ class SiPixelDigisSoAFromCUDA: public edm::stream::EDProducer const GPU::SimpleVector *error_ = nullptr; const PixelFormatterErrors *formatterErrors_ = nullptr; - CUDAContextToken ctxTmp_; - int nDigis_; bool includeErrors_; }; @@ -73,8 +71,6 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::Event error_ = tmp.second; formatterErrors_ = &(gpuDigis.formatterErrors()); } - - ctxTmp_ = ctx.toToken(); // CUDA stream must live until produce } void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { @@ -103,8 +99,6 @@ void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& data_.reset(); error_ = nullptr; formatterErrors_ = nullptr; - - ctxTmp_.reset(); // release CUDA stream etc } // define as framework plugin diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index d35d0636a76d3..19b47c9cceb87 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -192,6 +192,10 @@ namespace pixelgpudetails { clusters_d.setNClusters(nModules_Clusters_h[1]); // need to explicitly deallocate while the associated CUDA // stream is still alive + // + // technically the statement above is not true anymore now that + // the CUDA streams are cached within the CUDAService, but it is + // still better to release as early as possible nModules_Clusters_h.reset(); return 
std::make_pair(std::move(digis_d), std::move(clusters_d)); } From 15c7481d5804873f17bafdc7fbc3b139df02b7e0 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 28 Dec 2018 16:30:04 +0100 Subject: [PATCH 24/49] Revert "Add reset() to CUDAContextToken" This reverts commit 575d2d32cfc8296e8473c570809a42fc209c392a. --- HeterogeneousCore/CUDACore/interface/CUDAContextToken.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h index c9b1afe8f3ca1..1a599132d13f1 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAContextToken.h @@ -18,11 +18,6 @@ class CUDAContextToken { CUDAContextToken(CUDAContextToken&&) = default; CUDAContextToken& operator=(CUDAContextToken&& other) = default; - void reset() { - stream_.reset(); - device_ = -1; - } - private: friend class CUDAScopedContext; From 34c123436c18bf190777735634a23ff97291aff0 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 28 Dec 2018 17:35:25 +0100 Subject: [PATCH 25/49] Move digi errors to their own data format classes --- .../interface/SiPixelDigiErrorsCUDA.h | 40 ++++++++++ .../SiPixelDigi/interface/SiPixelDigisCUDA.h | 25 +----- .../SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc | 44 +++++++++++ .../SiPixelDigi/src/SiPixelDigisCUDA.cc | 40 +--------- CUDADataFormats/SiPixelDigi/src/classes.h | 1 + .../SiPixelDigi/src/classes_def.xml | 3 + .../interface/SiPixelDigiErrorsSoA.h | 28 +++++++ .../SiPixelDigi/interface/SiPixelDigisSoA.h | 14 ---- .../SiPixelDigi/src/SiPixelDigiErrorsSoA.cc | 10 +++ .../SiPixelDigi/src/SiPixelDigisSoA.cc | 11 --- DataFormats/SiPixelDigi/src/classes.h | 1 + DataFormats/SiPixelDigi/src/classes_def.xml | 3 + .../plugins/SiPixelDigiErrorsSoAFromCUDA.cc | 77 +++++++++++++++++++ .../plugins/SiPixelDigisFromSoA.cc | 24 +++--- .../plugins/SiPixelDigisSoAFromCUDA.cc | 24 +----- .../python/siPixelDigis_cff.py | 6 
+- .../plugins/SiPixelRawToClusterCUDA.cc | 10 +++ .../plugins/SiPixelRawToClusterGPUKernel.cu | 10 ++- .../plugins/SiPixelRawToClusterGPUKernel.h | 8 +- 19 files changed, 249 insertions(+), 130 deletions(-) create mode 100644 CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h create mode 100644 CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc create mode 100644 DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h create mode 100644 DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc create mode 100644 EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h new file mode 100644 index 0000000000000..e9c8c0f644722 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h @@ -0,0 +1,40 @@ +#ifndef CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h +#define CUDADataFormats_SiPixelDigi_interface_SiPixelDigiErrorsCUDA_h + +#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" + +#include + +class SiPixelDigiErrorsCUDA { +public: + SiPixelDigiErrorsCUDA() = default; + explicit SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream); + ~SiPixelDigiErrorsCUDA() = default; + + SiPixelDigiErrorsCUDA(const SiPixelDigiErrorsCUDA&) = delete; + SiPixelDigiErrorsCUDA& operator=(const SiPixelDigiErrorsCUDA&) = delete; + SiPixelDigiErrorsCUDA(SiPixelDigiErrorsCUDA&&) = default; + SiPixelDigiErrorsCUDA& operator=(SiPixelDigiErrorsCUDA&&) = default; + + const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } + + GPU::SimpleVector *error() { return error_d.get(); } + GPU::SimpleVector const 
*error() const { return error_d.get(); } + GPU::SimpleVector const *c_error() const { return error_d.get(); } + + using HostDataError = std::pair, cudautils::host::unique_ptr>; + HostDataError dataErrorToHostAsync(cuda::stream_t<>& stream) const; + + void copyErrorToHostAsync(cuda::stream_t<>& stream); + +private: + cudautils::device::unique_ptr data_d; + cudautils::device::unique_ptr> error_d; + cudautils::host::unique_ptr> error_h; + PixelFormatterErrors formatterErrors_h; +}; + +#endif diff --git a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h index 44785618249d9..6a52545483eb8 100644 --- a/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h +++ b/CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h @@ -3,16 +3,13 @@ #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" -#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" -#include "FWCore/Utilities/interface/propagate_const.h" -#include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" #include class SiPixelDigisCUDA { public: SiPixelDigisCUDA() = default; - explicit SiPixelDigisCUDA(size_t maxFedWords, bool includeErrors, cuda::stream_t<>& stream); + explicit SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream); ~SiPixelDigisCUDA() = default; SiPixelDigisCUDA(const SiPixelDigisCUDA&) = delete; @@ -28,10 +25,6 @@ class SiPixelDigisCUDA { uint32_t nModules() const { return nModules_h; } uint32_t nDigis() const { return nDigis_h; } - void setFormatterErrors(const PixelFormatterErrors& err) { formatterErrors_h = err; } - bool hasErrors() const { return hasErrors_h; } - const PixelFormatterErrors& formatterErrors() const { return formatterErrors_h; } - uint16_t * xx() { return xx_d.get(); } uint16_t * yy() { return yy_d.get(); } uint16_t * adc() { return adc_d.get(); } @@ -39,7 +32,6 @@ class SiPixelDigisCUDA { int32_t * 
clus() { return clus_d.get(); } uint32_t * pdigi() { return pdigi_d.get(); } uint32_t * rawIdArr() { return rawIdArr_d.get(); } - GPU::SimpleVector *error() { return error_d.get(); } uint16_t const *xx() const { return xx_d.get(); } uint16_t const *yy() const { return yy_d.get(); } @@ -48,7 +40,6 @@ class SiPixelDigisCUDA { int32_t const *clus() const { return clus_d.get(); } uint32_t const *pdigi() const { return pdigi_d.get(); } uint32_t const *rawIdArr() const { return rawIdArr_d.get(); } - GPU::SimpleVector const *error() const { return error_d.get(); } uint16_t const *c_xx() const { return xx_d.get(); } uint16_t const *c_yy() const { return yy_d.get(); } @@ -57,18 +48,12 @@ class SiPixelDigisCUDA { int32_t const *c_clus() const { return clus_d.get(); } uint32_t const *c_pdigi() const { return pdigi_d.get(); } uint32_t const *c_rawIdArr() const { return rawIdArr_d.get(); } - GPU::SimpleVector const *c_error() const { return error_d.get(); } cudautils::host::unique_ptr adcToHostAsync(cuda::stream_t<>& stream) const; cudautils::host::unique_ptr< int32_t[]> clusToHostAsync(cuda::stream_t<>& stream) const; cudautils::host::unique_ptr pdigiToHostAsync(cuda::stream_t<>& stream) const; cudautils::host::unique_ptr rawIdArrToHostAsync(cuda::stream_t<>& stream) const; - using HostDataError = std::pair, GPU::SimpleVector const *>; - HostDataError dataErrorToHostAsync(cuda::stream_t<>& stream) const; - - void copyErrorToHostAsync(cuda::stream_t<>& stream); - class DeviceConstView { public: DeviceConstView() = default; @@ -107,16 +92,8 @@ class SiPixelDigisCUDA { cudautils::device::unique_ptr pdigi_d; cudautils::device::unique_ptr rawIdArr_d; - // These are for error CPU output; should we (eventually) place them - // to a separate product? 
- cudautils::device::unique_ptr data_d; - cudautils::device::unique_ptr> error_d; - cudautils::host::unique_ptr> error_h; - PixelFormatterErrors formatterErrors_h; - uint32_t nModules_h = 0; uint32_t nDigis_h = 0; - bool hasErrors_h; }; #endif diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc new file mode 100644 index 0000000000000..92aab1ec9d578 --- /dev/null +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigiErrorsCUDA.cc @@ -0,0 +1,44 @@ +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" + +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" +#include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" + +SiPixelDigiErrorsCUDA::SiPixelDigiErrorsCUDA(size_t maxFedWords, PixelFormatterErrors errors, cuda::stream_t<>& stream): + formatterErrors_h(std::move(errors)) +{ + edm::Service cs; + + error_d = cs->make_device_unique>(stream); + data_d = cs->make_device_unique(maxFedWords, stream); + + cudautils::memsetAsync(data_d, 0x00, maxFedWords, stream); + + error_h = cs->make_host_unique>(stream); + GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); + assert(error_h->size() == 0); + assert(error_h->capacity() == static_cast(maxFedWords)); + + cudautils::copyAsync(error_d, error_h, stream); +} + +void SiPixelDigiErrorsCUDA::copyErrorToHostAsync(cuda::stream_t<>& stream) { + cudautils::copyAsync(error_h, error_d, stream); +} + +SiPixelDigiErrorsCUDA::HostDataError SiPixelDigiErrorsCUDA::dataErrorToHostAsync(cuda::stream_t<>& stream) const { + edm::Service cs; + // On one hand size() could be sufficient. On the other hand, if + // someone copies the SimpleVector<>, (s)he might expect the data + // buffer to actually have space for capacity() elements. 
+ auto data = cs->make_host_unique(error_h->capacity(), stream); + + // but transfer only the required amount + if(error_h->size() > 0) { + cudautils::copyAsync(data, data_d, error_h->size(), stream); + } + auto err = *error_h; + err.set_data(data.get()); + return HostDataError(std::move(err), std::move(data)); +} diff --git a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc index 30095d7985765..ef13ed9612dbf 100644 --- a/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc +++ b/CUDADataFormats/SiPixelDigi/src/SiPixelDigisCUDA.cc @@ -3,11 +3,8 @@ #include "FWCore/ServiceRegistry/interface/Service.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" -#include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" -SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, bool includeErrors, cuda::stream_t<>& stream): - hasErrors_h(includeErrors) -{ +SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, cuda::stream_t<>& stream) { edm::Service cs; xx_d = cs->make_device_unique(maxFedWords, stream); @@ -28,20 +25,6 @@ SiPixelDigisCUDA::SiPixelDigisCUDA(size_t maxFedWords, bool includeErrors, cuda: view_d = cs->make_device_unique(stream); cudautils::copyAsync(view_d, view, stream); - - if(includeErrors) { - error_d = cs->make_device_unique>(stream); - data_d = cs->make_device_unique(maxFedWords, stream); - - cudautils::memsetAsync(data_d, 0x00, maxFedWords, stream); - - error_h = cs->make_host_unique>(stream); - GPU::make_SimpleVector(error_h.get(), maxFedWords, data_d.get()); - assert(error_h->size() == 0); - assert(error_h->capacity() == static_cast(maxFedWords)); - - cudautils::copyAsync(error_d, error_h, stream); - } } cudautils::host::unique_ptr SiPixelDigisCUDA::adcToHostAsync(cuda::stream_t<>& stream) const { @@ -71,24 +54,3 @@ cudautils::host::unique_ptr SiPixelDigisCUDA::rawIdArrToHostAsync(cu cudautils::copyAsync(ret, 
rawIdArr_d, nDigis(), stream); return ret; } - -void SiPixelDigisCUDA::copyErrorToHostAsync(cuda::stream_t<>& stream) { - cudautils::copyAsync(error_h, error_d, stream); -} - - -SiPixelDigisCUDA::HostDataError SiPixelDigisCUDA::dataErrorToHostAsync(cuda::stream_t<>& stream) const { - edm::Service cs; - // On one hand size() could be sufficient. On the other hand, if - // someone copies the SimpleVector<>, (s)he might expect the data - // buffer to actually have space for capacity() elements. - auto data = cs->make_host_unique(error_h->capacity(), stream); - - // but transfer only the required amount - if(error_h->size() > 0) { - cudautils::copyAsync(data, data_d, error_h->size(), stream); - } - error_h->set_data(data.get()); - return HostDataError(std::move(data), error_h.get()); -} - diff --git a/CUDADataFormats/SiPixelDigi/src/classes.h b/CUDADataFormats/SiPixelDigi/src/classes.h index 09676c8fdc2f5..188d51277abe5 100644 --- a/CUDADataFormats/SiPixelDigi/src/classes.h +++ b/CUDADataFormats/SiPixelDigi/src/classes.h @@ -3,6 +3,7 @@ #include "CUDADataFormats/Common/interface/CUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" #include "DataFormats/Common/interface/Wrapper.h" #endif diff --git a/CUDADataFormats/SiPixelDigi/src/classes_def.xml b/CUDADataFormats/SiPixelDigi/src/classes_def.xml index 92a8949b12cba..b17f57470cc42 100644 --- a/CUDADataFormats/SiPixelDigi/src/classes_def.xml +++ b/CUDADataFormats/SiPixelDigi/src/classes_def.xml @@ -1,4 +1,7 @@ + + + diff --git a/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h b/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h new file mode 100644 index 0000000000000..1a7710b4fb3c6 --- /dev/null +++ b/DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h @@ -0,0 +1,28 @@ +#ifndef DataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h +#define DataFormats_SiPixelDigi_interface_SiPixelDigiErrorsSoA_h + 
+#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" + +#include +#include + +class SiPixelDigiErrorsSoA { +public: + SiPixelDigiErrorsSoA() = default; + explicit SiPixelDigiErrorsSoA(size_t nErrors, const PixelErrorCompact *error, const PixelFormatterErrors *err); + ~SiPixelDigiErrorsSoA() = default; + + auto size() const { return error_.size(); } + + const PixelFormatterErrors *formatterErrors() const { return formatterErrors_; } + + const PixelErrorCompact& error(size_t i) const { return error_[i]; } + + const std::vector& errorVector() const { return error_; } + +private: + std::vector error_; + const PixelFormatterErrors *formatterErrors_ = nullptr; +}; + +#endif diff --git a/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h index 916dfbcf28136..df249a3790cd2 100644 --- a/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h +++ b/DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h @@ -1,8 +1,6 @@ #ifndef DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h #define DataFormats_SiPixelDigi_interface_SiPixelDigisSoA_h -#include "DataFormats/SiPixelDigi/interface/PixelErrors.h" - #include #include @@ -10,37 +8,25 @@ class SiPixelDigisSoA { public: SiPixelDigisSoA() = default; explicit SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus); - explicit SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus, - size_t nErrors, const PixelErrorCompact *error, const PixelFormatterErrors *err); ~SiPixelDigisSoA() = default; auto size() const { return pdigi_.size(); } - auto errorSize() const { return error_.size(); } - - bool hasError() const { return hasError_; } - const PixelFormatterErrors *formatterErrors() const { return formatterErrors_; } uint32_t pdigi(size_t i) const { return pdigi_[i]; } uint32_t rawIdArr(size_t i) const { return rawIdArr_[i]; } uint16_t adc(size_t i) 
const { return adc_[i]; } int32_t clus(size_t i) const { return clus_[i]; } - const PixelErrorCompact& error(size_t i) const { return error_[i]; } const std::vector& pdigiVector() const { return pdigi_; } const std::vector& rawIdArrVector() const { return rawIdArr_; } const std::vector& adcVector() const { return adc_; } const std::vector& clusVector() const { return clus_; } - const std::vector& errorVector() const { return error_; } private: std::vector pdigi_; std::vector rawIdArr_; std::vector adc_; std::vector clus_; - - std::vector error_; - const PixelFormatterErrors *formatterErrors_ = nullptr; - bool hasError_ = false; }; #endif diff --git a/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc b/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc new file mode 100644 index 0000000000000..ef2b4581fc46e --- /dev/null +++ b/DataFormats/SiPixelDigi/src/SiPixelDigiErrorsSoA.cc @@ -0,0 +1,10 @@ +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" + +#include + +SiPixelDigiErrorsSoA::SiPixelDigiErrorsSoA(size_t nErrors, const PixelErrorCompact *error, const PixelFormatterErrors *err): + error_(error, error+nErrors), + formatterErrors_(err) +{ + assert(error_.size() == nErrors); +} diff --git a/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc index 2d2e6b7eadfe8..ebc8ba2055f78 100644 --- a/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc +++ b/DataFormats/SiPixelDigi/src/SiPixelDigisSoA.cc @@ -1,6 +1,5 @@ #include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" -#include #include SiPixelDigisSoA::SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus): @@ -11,13 +10,3 @@ SiPixelDigisSoA::SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uin { assert(pdigi_.size() == nDigis); } - -SiPixelDigisSoA::SiPixelDigisSoA(size_t nDigis, const uint32_t *pdigi, const uint32_t *rawIdArr, const uint16_t *adc, const int32_t *clus, - size_t 
nErrors, const PixelErrorCompact *error, const PixelFormatterErrors *err): - SiPixelDigisSoA(nDigis, pdigi, rawIdArr, adc, clus) -{ - error_.resize(nErrors); - std::copy(error, error+nErrors, error_.begin()); - formatterErrors_ = err; - hasError_ = true; -} diff --git a/DataFormats/SiPixelDigi/src/classes.h b/DataFormats/SiPixelDigi/src/classes.h index bd93840de0d4d..256ca41ad1867 100644 --- a/DataFormats/SiPixelDigi/src/classes.h +++ b/DataFormats/SiPixelDigi/src/classes.h @@ -6,6 +6,7 @@ #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigi.h" #include "DataFormats/SiPixelDigi/interface/SiPixelCalibDigiError.h" #include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" #include "DataFormats/Common/interface/Wrapper.h" #include "DataFormats/Common/interface/DetSetVector.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" diff --git a/DataFormats/SiPixelDigi/src/classes_def.xml b/DataFormats/SiPixelDigi/src/classes_def.xml index e6bc08de161fa..8cabbd3f3f06e 100755 --- a/DataFormats/SiPixelDigi/src/classes_def.xml +++ b/DataFormats/SiPixelDigi/src/classes_def.xml @@ -52,4 +52,7 @@ + + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc new file mode 100644 index 0000000000000..bcad448b0a157 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc @@ -0,0 +1,77 @@ +#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include 
"FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" +#include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" + +class SiPixelDigiErrorsSoAFromCUDA: public edm::stream::EDProducer { +public: + explicit SiPixelDigiErrorsSoAFromCUDA(const edm::ParameterSet& iConfig); + ~SiPixelDigiErrorsSoAFromCUDA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT> digiErrorGetToken_; + edm::EDPutTokenT digiErrorPutToken_; + + cudautils::host::unique_ptr data_; + GPU::SimpleVector error_; + const PixelFormatterErrors *formatterErrors_ = nullptr; +}; + +SiPixelDigiErrorsSoAFromCUDA::SiPixelDigiErrorsSoAFromCUDA(const edm::ParameterSet& iConfig): + digiErrorGetToken_(consumes>(iConfig.getParameter("src"))), + digiErrorPutToken_(produces()) +{} + +void SiPixelDigiErrorsSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("src", edm::InputTag("siPixelClustersCUDA")); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigiErrorsSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Do the transfer in a CUDA stream parallel to the computation CUDA stream + auto ctx = CUDAScopedContext(iEvent.streamID(), std::move(waitingTaskHolder)); + + edm::Handle> herror; + iEvent.getByToken(digiErrorGetToken_, herror); + const auto& gpuDigiErrors = ctx.get(*herror); + + auto tmp = gpuDigiErrors.dataErrorToHostAsync(ctx.stream()); + error_ = std::move(tmp.first); + data_ = std::move(tmp.second); + 
formatterErrors_ = &(gpuDigiErrors.formatterErrors()); +} + +void SiPixelDigiErrorsSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // The following line copies the data from the pinned host memory to + // regular host memory. In principle that feels unnecessary (why not + // just use the pinned host memory?). There are a few arguments for + // doing it though + // - Now can release the pinned host memory back to the (caching) allocator + // * if we'd like to keep the pinned memory, we'd need to also + // keep the CUDA stream around as long as that, or allow pinned + // host memory to be allocated without a CUDA stream + // - What if a CPU algorithm would produce the same SoA? We can't + // use cudaMallocHost without a GPU... + iEvent.emplace(digiErrorPutToken_, error_.size(), error_.data(), formatterErrors_); + + error_ = GPU::make_SimpleVector(0, nullptr); + data_.reset(); + formatterErrors_ = nullptr; +} + +// define as framework plugin +DEFINE_FWK_MODULE(SiPixelDigiErrorsSoAFromCUDA); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc index ddf20bd578430..87b55cc1b64f1 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc @@ -6,7 +6,7 @@ #include "DataFormats/DetId/interface/DetIdCollection.h" #include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h" #include "DataFormats/SiPixelDigi/interface/PixelDigi.h" -#include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" #include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" #include "FWCore/Framework/interface/ESTransientHandle.h" #include "FWCore/Framework/interface/ESWatcher.h" @@ -30,7 +30,7 @@ class SiPixelDigisFromSoA: public edm::stream::EDProducer<> { private: void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) 
override; - edm::EDGetTokenT digiSoAGetToken_; + edm::EDGetTokenT digiErrorSoAGetToken_; edm::EDGetTokenT> digiGetToken_; // for a copy edm::EDPutTokenT> digiPutToken_; @@ -60,7 +60,7 @@ SiPixelDigisFromSoA::SiPixelDigisFromSoA(const edm::ParameterSet& iConfig): usePhase1_(iConfig.getParameter ("UsePhase1")) { if(includeErrors_) { - digiSoAGetToken_ = consumes(iConfig.getParameter("digiSoASrc")); + digiErrorSoAGetToken_ = consumes(iConfig.getParameter("digiErrorSoASrc")); errorPutToken_ = produces>(); tkErrorPutToken_ = produces(); userErrorPutToken_ = produces("UserErrorModules"); @@ -71,7 +71,7 @@ SiPixelDigisFromSoA::SiPixelDigisFromSoA(const edm::ParameterSet& iConfig): void SiPixelDigisFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("digiSrc", edm::InputTag("siPixelClusters")); - desc.add("digiSoASrc", edm::InputTag("siPixelDigisSoA")); + desc.add("digiErrorSoASrc", edm::InputTag("siPixelDigiErrorsSoA")); desc.add("CablingMapLabel","")->setComment("CablingMap label"); desc.add("IncludeErrors", true); desc.add("UsePhase1",false)->setComment("## Use phase1"); @@ -97,28 +97,24 @@ void SiPixelDigisFromSoA::produce(edm::Event& iEvent, const edm::EventSetup& iSe LogDebug("map version:")<< cabling_->version(); } - edm::Handle hsoa; - iEvent.getByToken(digiSoAGetToken_, hsoa); - const auto& digis = *hsoa; + edm::Handle hsoa; + iEvent.getByToken(digiErrorSoAGetToken_, hsoa); + const auto& digiErrors = *hsoa; - if(!digis.hasError()) { - throw cms::Exception("LogicError") << "The module was configured to include errors, but the input SoA does not include the errors. 
This is likely a problem in the configuration."; - } - auto errorcollection = std::make_unique>(); auto tkerror_detidcollection = std::make_unique(); auto usererror_detidcollection = std::make_unique(); auto disabled_channelcollection = std::make_unique< edmNew::DetSetVector>(); PixelDataFormatter formatter(cabling_.get(), usePhase1_); // for phase 1 & 0 - const PixelDataFormatter::Errors *formatterErrors = digis.formatterErrors(); + const PixelDataFormatter::Errors *formatterErrors = digiErrors.formatterErrors(); assert(formatterErrors != nullptr); auto errors = *formatterErrors; // make a copy PixelDataFormatter::DetErrors nodeterrors; - auto size = digis.errorSize(); + auto size = digiErrors.size(); for (auto i = 0U; i < size; i++) { - PixelErrorCompact err = digis.error(i); + PixelErrorCompact err = digiErrors.error(i); if (err.errorType != 0) { SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); errors[err.rawId].push_back(error); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index 06c4fab25763b..5fa3ed26517eb 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -31,12 +31,7 @@ class SiPixelDigisSoAFromCUDA: public edm::stream::EDProducer cudautils::host::unique_ptr adc_; cudautils::host::unique_ptr< int32_t[]> clus_; - cudautils::host::unique_ptr data_; - const GPU::SimpleVector *error_ = nullptr; - const PixelFormatterErrors *formatterErrors_ = nullptr; - int nDigis_; - bool includeErrors_; }; SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig): @@ -63,14 +58,6 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::Event rawIdArr_ = gpuDigis.rawIdArrToHostAsync(ctx.stream()); adc_ = gpuDigis.adcToHostAsync(ctx.stream()); clus_ = gpuDigis.clusToHostAsync(ctx.stream()); - - includeErrors_ = 
gpuDigis.hasErrors(); - if(includeErrors_) { - auto tmp = gpuDigis.dataErrorToHostAsync(ctx.stream()); - data_ = std::move(tmp.first); - error_ = tmp.second; - formatterErrors_ = &(gpuDigis.formatterErrors()); - } } void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { @@ -84,21 +71,12 @@ void SiPixelDigisSoAFromCUDA::produce(edm::Event& iEvent, const edm::EventSetup& // host memory to be allocated without a CUDA stream // - What if a CPU algorithm would produce the same SoA? We can't // use cudaMallocHost without a GPU... - if(includeErrors_) { - iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get(), - error_->size(), error_->data(), formatterErrors_); - } - else { - iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get()); - } + iEvent.emplace(digiPutToken_, nDigis_, pdigi_.get(), rawIdArr_.get(), adc_.get(), clus_.get()); pdigi_.reset(); rawIdArr_.reset(); adc_.reset(); clus_.reset(); - data_.reset(); - error_ = nullptr; - formatterErrors_ = nullptr; } // define as framework plugin diff --git a/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py index 623842c03a549..be7e7193926fc 100644 --- a/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py +++ b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py @@ -2,13 +2,17 @@ from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import siPixelDigis from EventFilter.SiPixelRawToDigi.siPixelDigisSoAFromCUDA_cfi import siPixelDigisSoAFromCUDA as _siPixelDigisSoAFromCUDA +from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsSoAFromCUDA_cfi import siPixelDigiErrorsSoAFromCUDA as _siPixelDigiErrorsSoAFromCUDA siPixelDigisTask = cms.Task(siPixelDigis) siPixelDigisSoA = _siPixelDigisSoAFromCUDA.clone( src = "siPixelClustersCUDAPreSplitting" ) -siPixelDigisTaskCUDA = cms.Task(siPixelDigisSoA) +siPixelDigiErrorsSoA = 
_siPixelDigiErrorsSoAFromCUDA.clone( + src = "siPixelClustersCUDAPreSplitting" +) +siPixelDigisTaskCUDA = cms.Task(siPixelDigisSoA, siPixelDigiErrorsSoA) from Configuration.ProcessModifiers.gpu_cff import gpu _siPixelDigisTask_gpu = siPixelDigisTask.copy() diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc index a27fd420cddbe..e3ad212540b07 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -1,6 +1,7 @@ #include "CUDADataFormats/Common/interface/CUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" #include "CalibTracker/Records/interface/SiPixelGainCalibrationForHLTGPURcd.h" #include "CalibTracker/SiPixelESProducers/interface/SiPixelGainCalibrationForHLTGPU.h" #include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" @@ -47,6 +48,7 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer edm::EDGetTokenT rawGetToken_; edm::EDPutTokenT> digiPutToken_; + edm::EDPutTokenT> digiErrorPutToken_; edm::EDPutTokenT> clusterPutToken_; CUDAContextToken ctxTmp_; @@ -78,6 +80,10 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi usePilotBlade_(iConfig.getParameter ("UsePilotBlade")), // Control the usage of pilot-blade data, FED=40 convertADCtoElectrons_(iConfig.getParameter("ConvertADCtoElectrons")) { + if(includeErrors_) { + digiErrorPutToken_ = produces>(); + } + // regions if(!iConfig.getParameter("Regions").getParameterNames().empty()) { regions_ = std::make_unique(iConfig, consumesCollector()); @@ -215,6 +221,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event gpuAlgo_.makeClustersAsync(gpuMap, 
gpuModulesToUnpack, gpuGains, wordFedAppender, + std::move(errors_), wordCounterGPU, fedCounter, convertADCtoElectrons_, useQuality_, includeErrors_, edm::MessageDrop::instance()->debugEnabled, @@ -229,6 +236,9 @@ void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& auto tmp = gpuAlgo_.getResults(); ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); ctx.emplace(iEvent, clusterPutToken_, std::move(tmp.second)); + if(includeErrors_) { + ctx.emplace(iEvent, digiErrorPutToken_, gpuAlgo_.getErrors()); + } } // define as framework plugin diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu index d01ce5c6f2f26..fead8e59a0db3 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.cu @@ -509,6 +509,7 @@ namespace pixelgpudetails { const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, const WordFedAppender& wordFed, + PixelFormatterErrors&& errors, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, bool useQualityInfo, bool includeErrors, bool debug, @@ -516,7 +517,10 @@ namespace pixelgpudetails { { nDigis = wordCounter; - digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, includeErrors, stream); + digis_d = SiPixelDigisCUDA(pixelgpudetails::MAX_FED_WORDS, stream); + if(includeErrors) { + digiErrors_d = SiPixelDigiErrorsCUDA(pixelgpudetails::MAX_FED_WORDS, std::move(errors), stream); + } clusters_d = SiPixelClustersCUDA(gpuClustering::MaxNumModules, stream); edm::Service cs; @@ -545,14 +549,14 @@ namespace pixelgpudetails { digis_d.pdigi(), digis_d.rawIdArr(), digis_d.moduleInd(), - digis_d.error(), + digiErrors_d.error(), // returns nullptr if default-constructed useQualityInfo, includeErrors, debug); cudaCheck(cudaGetLastError()); if(includeErrors) { - 
digis_d.copyErrorToHostAsync(stream); + digiErrors_d.copyErrorToHostAsync(stream); } } // End of Raw2Digi and passing data for cluserisation diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h index 19b47c9cceb87..1ab8bc3fa5998 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterGPUKernel.h @@ -6,6 +6,7 @@ #include "cuda/api_wrappers.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" +#include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "FWCore/Utilities/interface/typedefs.h" #include "HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h" @@ -183,6 +184,7 @@ namespace pixelgpudetails { void makeClustersAsync(const SiPixelFedCablingMapGPU *cablingMap, const unsigned char *modToUnp, const SiPixelGainForHLTonGPU *gains, const WordFedAppender& wordFed, + PixelFormatterErrors&& errors, const uint32_t wordCounter, const uint32_t fedCounter, bool convertADCtoElectrons, bool useQualityInfo, bool includeErrors, bool debug, cuda::stream_t<>& stream); @@ -200,14 +202,18 @@ namespace pixelgpudetails { return std::make_pair(std::move(digis_d), std::move(clusters_d)); } + SiPixelDigiErrorsCUDA&& getErrors() { + return std::move(digiErrors_d); + } + private: uint32_t nDigis = 0; // Data to be put in the event cudautils::host::unique_ptr nModules_Clusters_h; - cudautils::host::unique_ptr> error_h; SiPixelDigisCUDA digis_d; SiPixelClustersCUDA clusters_d; + SiPixelDigiErrorsCUDA digiErrors_d; }; // see RecoLocalTracker/SiPixelClusterizer From a7cb6b4aa6bea789e00a97d8843f1126f5e01bfa Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 28 Dec 2018 22:00:39 +0100 Subject: [PATCH 26/49] Throw the exception in the callback from 
CUDAScopedContext --- HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc index e57033a1d0968..bbb890063fdae 100644 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -31,9 +31,14 @@ CUDAScopedContext::~CUDAScopedContext() { waitingTaskHolder.doneWaiting(nullptr); } else { - auto error = cudaGetErrorName(status); - auto message = cudaGetErrorString(status); - waitingTaskHolder.doneWaiting(std::make_exception_ptr(cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device << " error " << error << ": " << message)); + // wrap the exception in a try-catch block to let GDB "catch throw" break on it + try { + auto error = cudaGetErrorName(status); + auto message = cudaGetErrorString(status); + throw cms::Exception("CUDAError") << "Callback of CUDA stream " << streamId << " in device " << device << " error " << error << ": " << message; + } catch(cms::Exception&) { + waitingTaskHolder.doneWaiting(std::current_exception()); + } } }); } From 48691a3c00f922480dda3a0ecdb9d9d85545e07d Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 28 Jan 2019 22:22:32 +0100 Subject: [PATCH 27/49] Modernize test modules --- .../CUDACore/test/test_CUDAScopedContext.cc | 8 ++++---- .../CUDATest/plugins/TestCUDAProducerCPU.cc | 12 +++++------- .../CUDATest/plugins/TestCUDAProducerGPU.cc | 9 ++++----- .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 15 +++++++-------- .../plugins/TestCUDAProducerGPUFirst.cc | 2 +- .../plugins/TestCUDAProducerGPUtoCPU.cc | 19 +++++++++---------- 6 files changed, 30 insertions(+), 35 deletions(-) diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index e35c4b284802d..824784e0a77c1 100644 --- 
a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -56,7 +56,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { std::unique_ptr> dataPtr = ctx.wrap(10); const auto& data = *dataPtr; - auto ctx2 = CUDAScopedContext(data); + CUDAScopedContext ctx2{data}; REQUIRE(cuda::device::current::get().id() == data.device()); REQUIRE(ctx2.stream().id() == data.stream().id()); } @@ -66,12 +66,12 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { { // acquire std::unique_ptr> dataPtr = ctx.wrap(10); const auto& data = *dataPtr; - auto ctx2 = CUDAScopedContext(data); + CUDAScopedContext ctx2{data}; ctxtok = ctx2.toToken(); } { // produce - auto ctx2 = CUDAScopedContext(std::move(ctxtok)); + CUDAScopedContext ctx2{std::move(ctxtok)}; REQUIRE(cuda::device::current::get().id() == ctx.device()); REQUIRE(ctx2.stream().id() == ctx.stream().id()); } @@ -94,7 +94,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { REQUIRE(wprod1->stream().id() != wprod2->stream().id()); // Mimick a third producer "joining" the two streams - auto ctx2 = CUDAScopedContext(*wprod1); + CUDAScopedContext ctx2{*wprod1}; auto prod1 = ctx.get(*wprod1); auto prod2 = ctx.get(*wprod2); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc index 0fe14abdfbd27..79eb6eeead4a4 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc @@ -20,17 +20,17 @@ class TestCUDAProducerCPU: public edm::global::EDProducer<> { private: std::string label_; edm::EDGetTokenT srcToken_; + edm::EDPutTokenT dstToken_; }; TestCUDAProducerCPU::TestCUDAProducerCPU(const edm::ParameterSet& iConfig): - label_(iConfig.getParameter("@module_label")) + label_{iConfig.getParameter("@module_label")}, + dstToken_{produces()} { auto srcTag = iConfig.getParameter("src"); if(!srcTag.label().empty()) { 
srcToken_ = consumes(srcTag); } - - produces(); } void TestCUDAProducerCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -45,9 +45,7 @@ void TestCUDAProducerCPU::produce(edm::StreamID id, edm::Event& iEvent, const ed int input = 0; if(!srcToken_.isUninitialized()) { - edm::Handle hin; - iEvent.getByToken(srcToken_, hin); - input = *hin; + input = iEvent.get(srcToken_); } std::random_device r; @@ -59,7 +57,7 @@ void TestCUDAProducerCPU::produce(edm::StreamID id, edm::Event& iEvent, const ed const unsigned int output = input + id*100 + iEvent.id().event(); - iEvent.put(std::make_unique(output)); + iEvent.emplace(dstToken_, output); edm::LogPrint("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce end event " << iEvent.id().event() << " stream " << id << " result " << output; } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc index 06f51bfc1b7ff..0c578b4288664 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc @@ -42,12 +42,11 @@ void TestCUDAProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descr void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { edm::LogPrint("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - edm::Handle> hin; - iEvent.getByToken(srcToken_, hin); - auto ctx = CUDAScopedContext(*hin); - const CUDAThing& input = ctx.get(*hin); + const auto& in = iEvent.get(srcToken_); + CUDAScopedContext ctx{in}; + const CUDAThing& input = ctx.get(in); - ctx.emplace(iEvent, dstToken_, CUDAThing(gpuAlgo_.runAlgo(label_, input.get(), ctx.stream()))); + ctx.emplace(iEvent, dstToken_, CUDAThing{gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())}); edm::LogPrint("TestCUDAProducerGPU") << label_ << " 
TestCUDAProducerGPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index ee78f91d0a233..3d63a1cb960c5 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -32,9 +32,9 @@ class TestCUDAProducerGPUEW: public edm::stream::EDProducer { }; TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig): - label_(iConfig.getParameter("@module_label")), - srcToken_(consumes>(iConfig.getParameter("src"))), - dstToken_(produces>()) + label_{iConfig.getParameter("@module_label")}, + srcToken_{consumes>(iConfig.getParameter("src"))}, + dstToken_{produces>()} {} void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { @@ -46,10 +46,9 @@ void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& des void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - edm::Handle> hin; - iEvent.getByToken(srcToken_, hin); - auto ctx = CUDAScopedContext(*hin, std::move(waitingTaskHolder)); - const CUDAThing& input = ctx.get(*hin); + const auto& in = iEvent.get(srcToken_); + CUDAScopedContext ctx{in, std::move(waitingTaskHolder)}; + const CUDAThing& input = ctx.get(in); devicePtr_ = gpuAlgo_.runAlgo(label_, input.get(), ctx.stream()); // Mimick the need to transfer some of the GPU data back to CPU to @@ -65,7 +64,7 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSe void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { edm::LogPrint("TestCUDAProducerGPUEW") << 
label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << hostData_; - auto ctx = CUDAScopedContext(std::move(ctxTmp_)); + CUDAScopedContext ctx{std::move(ctxTmp_)}; ctx.emplace(iEvent, dstToken_, std::move(devicePtr_)); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc index e99924f5cb755..187c89ea885f7 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -39,7 +39,7 @@ void TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { edm::LogPrint("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - auto ctx = CUDAScopedContext(streamID); + CUDAScopedContext ctx{streamID}; cudautils::device::unique_ptr output = gpuAlgo_.runAlgo(label_, ctx.stream()); iEvent.put(ctx.wrap(CUDAThing(std::move(output)))); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index 813d75ad69d09..df71dd2c81001 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -26,15 +26,15 @@ class TestCUDAProducerGPUtoCPU: public edm::stream::EDProducer> srcToken_; + edm::EDPutTokenT dstToken_; cudautils::host::unique_ptr buffer_; }; TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig): - label_(iConfig.getParameter("@module_label")), - srcToken_(consumes>(iConfig.getParameter("src"))) -{ - produces(); -} + label_{iConfig.getParameter("@module_label")}, + 
srcToken_{consumes>(iConfig.getParameter("src"))}, + dstToken_{produces()} +{} void TestCUDAProducerGPUtoCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; @@ -46,10 +46,9 @@ void TestCUDAProducerGPUtoCPU::fillDescriptions(edm::ConfigurationDescriptions& void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { edm::LogPrint("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); - edm::Handle> hin; - iEvent.getByToken(srcToken_, hin); - auto ctx = CUDAScopedContext(*hin, std::move(waitingTaskHolder)); - const CUDAThing& device = ctx.get(*hin); + const auto& in = iEvent.get(srcToken_); + CUDAScopedContext ctx{in, std::move(waitingTaskHolder)}; + const CUDAThing& device = ctx.get(in); edm::Service cs; buffer_ = cs->make_host_unique(TestCUDAProducerGPUKernel::NUM_VALUES, ctx.stream()); @@ -68,7 +67,7 @@ void TestCUDAProducerGPUtoCPU::produce(edm::Event& iEvent, const edm::EventSetup } buffer_.reset(); // not so nice, but no way around? 
- iEvent.put(std::make_unique(counter)); + iEvent.emplace(dstToken_, counter); edm::LogPrint("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID() << " result " << counter; } From 75dcc8e926930e63761907f5c4bb08e2993b08bf Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 28 Jan 2019 23:31:44 +0100 Subject: [PATCH 28/49] Modernize event access in raw2cluster --- .../plugins/SiPixelDigiErrorsSoAFromCUDA.cc | 6 ++---- .../SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc | 8 ++------ .../SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc | 6 ++---- .../SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc | 4 +--- .../SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc | 8 +++----- 5 files changed, 10 insertions(+), 22 deletions(-) diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc index bcad448b0a157..950cf59b53820 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc @@ -43,11 +43,9 @@ void SiPixelDigiErrorsSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptio void SiPixelDigiErrorsSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { // Do the transfer in a CUDA stream parallel to the computation CUDA stream - auto ctx = CUDAScopedContext(iEvent.streamID(), std::move(waitingTaskHolder)); + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - edm::Handle> herror; - iEvent.getByToken(digiErrorGetToken_, herror); - const auto& gpuDigiErrors = ctx.get(*herror); + const auto& gpuDigiErrors = ctx.get(iEvent.get(digiErrorGetToken_)); auto tmp = gpuDigiErrors.dataErrorToHostAsync(ctx.stream()); error_ = std::move(tmp.first); diff --git 
a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc index 87b55cc1b64f1..c3522445ac7e5 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc @@ -82,9 +82,7 @@ void SiPixelDigisFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descr void SiPixelDigisFromSoA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { // How could we avoid the copy? - edm::Handle> hdigi; - iEvent.getByToken(digiGetToken_, hdigi); - iEvent.emplace(digiPutToken_, *hdigi); + iEvent.emplace(digiPutToken_, iEvent.get(digiGetToken_)); // pack errors into collection if (includeErrors_) { @@ -97,9 +95,7 @@ void SiPixelDigisFromSoA::produce(edm::Event& iEvent, const edm::EventSetup& iSe LogDebug("map version:")<< cabling_->version(); } - edm::Handle hsoa; - iEvent.getByToken(digiErrorSoAGetToken_, hsoa); - const auto& digiErrors = *hsoa; + const auto& digiErrors = iEvent.get(digiErrorSoAGetToken_); auto errorcollection = std::make_unique>(); auto tkerror_detidcollection = std::make_unique(); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index 5fa3ed26517eb..7f5481b31cd47 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -47,11 +47,9 @@ void SiPixelDigisSoAFromCUDA::fillDescriptions(edm::ConfigurationDescriptions& d void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { // Do the transfer in a CUDA stream parallel to the computation CUDA stream - auto ctx = CUDAScopedContext(iEvent.streamID(), std::move(waitingTaskHolder)); + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - edm::Handle> hdigi; - 
iEvent.getByToken(digiGetToken_, hdigi); - const auto& gpuDigis = ctx.get(*hdigi); + const auto& gpuDigis = ctx.get(iEvent.get(digiGetToken_)); nDigis_ = gpuDigis.nDigis(); pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream()); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc index 731dd3d54ad2a..c07d2496a2f9c 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc @@ -78,9 +78,7 @@ void SiPixelClustersFromSoA::fillDescriptions(edm::ConfigurationDescriptions& de } void SiPixelClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::Handle hdigi; - iEvent.getByToken(digiGetToken_, hdigi); - const auto& digis = *hdigi; + const auto& digis = iEvent.get(digiGetToken_); edm::ESHandle trackerTopologyHandle; iSetup.get().get(trackerTopologyHandle); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc index e3ad212540b07..659da17f46889 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -113,7 +113,7 @@ void SiPixelRawToClusterCUDA::fillDescriptions(edm::ConfigurationDescriptions& d void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - auto ctx = CUDAScopedContext(iEvent.streamID(), std::move(waitingTaskHolder)); + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; edm::ESHandle hgpuMap; iSetup.get().get(hgpuMap); @@ -153,9 +153,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event LogDebug("map version:")<< cabling_->version(); } - edm::Handle hbuffers; - 
iEvent.getByToken(rawGetToken_, hbuffers); - const auto& buffers = *hbuffers; + const auto& buffers = iEvent.get(rawGetToken_); errors_.clear(); @@ -231,7 +229,7 @@ void SiPixelRawToClusterCUDA::acquire(const edm::Event& iEvent, const edm::Event } void SiPixelRawToClusterCUDA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - auto ctx = CUDAScopedContext(std::move(ctxTmp_)); + CUDAScopedContext ctx{std::move(ctxTmp_)}; auto tmp = gpuAlgo_.getResults(); ctx.emplace(iEvent, digiPutToken_, std::move(tmp.first)); From f6db0f2d0fae480082255ac24394c07d6f2ceac5 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 29 Jan 2019 00:11:32 +0100 Subject: [PATCH 29/49] Add a program to check whether we can run on CUDA devices or not --- .../CUDAServices/bin/BuildFile.xml | 4 +++ .../CUDAServices/bin/cudaIsEnabled.cpp | 31 +++++++++++++++++++ 2 files changed, 35 insertions(+) create mode 100644 HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp diff --git a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml index a116ca8d78d33..041ed25ba134a 100644 --- a/HeterogeneousCore/CUDAServices/bin/BuildFile.xml +++ b/HeterogeneousCore/CUDAServices/bin/BuildFile.xml @@ -1,3 +1,7 @@ + + + + diff --git a/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp new file mode 100644 index 0000000000000..b24f05adb2213 --- /dev/null +++ b/HeterogeneousCore/CUDAServices/bin/cudaIsEnabled.cpp @@ -0,0 +1,31 @@ +#include +#include +#include +#include + +#include + +int main() { + int devices = 0; + auto status = cudaGetDeviceCount(& devices); + if (status != cudaSuccess) { + return EXIT_FAILURE; + } + + int minimumMajor = 6; // min minor is implicitly 0 + + // This approach (requiring all devices are supported) is rather + // conservative. In principle we could consider just dropping the + // unsupported devices. 
Currently that would be easiest to achieve + // in CUDAService though. + for (int i = 0; i < devices; ++i) { + cudaDeviceProp properties; + cudaGetDeviceProperties(&properties, i); + + if(properties.major < minimumMajor) { + return EXIT_FAILURE; + } + } + + return EXIT_SUCCESS; +} From 6ef9e6755a744d497ea2ef5371aa2c689ea1064f Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 29 Jan 2019 17:59:19 +0100 Subject: [PATCH 30/49] Add SwitchProducerCUDA --- .../CUDACore/python/SwitchProducerCUDA.py | 34 +++++++++++++++++++ 1 file changed, 34 insertions(+) create mode 100644 HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py diff --git a/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py new file mode 100644 index 0000000000000..ded114e2fddfe --- /dev/null +++ b/HeterogeneousCore/CUDACore/python/SwitchProducerCUDA.py @@ -0,0 +1,34 @@ +import FWCore.ParameterSet.Config as cms + +_cuda_enabled_cached = None + +def _switch_cuda(): + global _cuda_enabled_cached + if _cuda_enabled_cached is None: + import os + _cuda_enabled_cached = (os.system("cudaIsEnabled") == 0) + return (_cuda_enabled_cached, 2) + +class SwitchProducerCUDA(cms.SwitchProducer): + def __init__(self, **kargs): + super(SwitchProducerCUDA,self).__init__( + dict(cpu = cms.SwitchProducer.getCpu(), + cuda = _switch_cuda), + **kargs + ) +cms.specialImportRegistry.registerSpecialImportForType(SwitchProducerCUDA, "from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA") + +if __name__ == "__main__": + import unittest + + class TestSwitchProducerCUDA(unittest.TestCase): + def testPickle(self): + import pickle + sp = SwitchProducerCUDA(cpu = cms.EDProducer("Foo"), cuda = cms.EDProducer("Bar")) + pkl = pickle.dumps(sp) + unpkl = pickle.loads(pkl) + self.assertEqual(unpkl.cpu.type_(), "Foo") + self.assertEqual(unpkl.cuda.type_(), "Bar") + + unittest.main() + From aadc59f119dbcb3263caffce7e82f8aa5301b9b6 Mon Sep 17 00:00:00 
2001 From: Matti Kortelainen Date: Wed, 30 Jan 2019 17:53:59 +0100 Subject: [PATCH 31/49] Add test configurations for SwitchProducer --- .../CUDATest/python/prod1Switch_cff.py | 17 ++++ .../CUDATest/python/prod5Switch_cff.py | 17 ++++ .../CUDATest/python/prod6Switch_cff.py | 18 ++++ .../CUDATest/test/testCUDASwitch_cfg.py | 87 +++++++++++++++++++ 4 files changed, 139 insertions(+) create mode 100644 HeterogeneousCore/CUDATest/python/prod1Switch_cff.py create mode 100644 HeterogeneousCore/CUDATest/python/prod5Switch_cff.py create mode 100644 HeterogeneousCore/CUDATest/python/prod6Switch_cff.py create mode 100644 HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py diff --git a/HeterogeneousCore/CUDATest/python/prod1Switch_cff.py b/HeterogeneousCore/CUDATest/python/prod1Switch_cff.py new file mode 100644 index 0000000000000..72221ade422fa --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod1Switch_cff.py @@ -0,0 +1,17 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.prod1CPU_cfi import prod1CPU as _prod1CPU +from HeterogeneousCore.CUDATest.prod1CUDA_cfi import prod1CUDA +from HeterogeneousCore.CUDATest.prod1FromCUDA_cfi import prod1FromCUDA as _prod1FromCUDA + +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA + +prod1 = SwitchProducerCUDA( + cpu = _prod1CPU.clone(), + cuda = _prod1FromCUDA.clone() +) + +prod1Task = cms.Task( + prod1CUDA, + prod1 +) diff --git a/HeterogeneousCore/CUDATest/python/prod5Switch_cff.py b/HeterogeneousCore/CUDATest/python/prod5Switch_cff.py new file mode 100644 index 0000000000000..3b5e90d497641 --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod5Switch_cff.py @@ -0,0 +1,17 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.prod5CPU_cfi import prod5CPU as _prod5CPU +from HeterogeneousCore.CUDATest.prod5CUDA_cfi import prod5CUDA +from HeterogeneousCore.CUDATest.prod5FromCUDA_cfi import prod5FromCUDA as _prod5FromCUDA + +from 
HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA + +prod5 = SwitchProducerCUDA( + cpu = _prod5CPU.clone(), + cuda = _prod5FromCUDA.clone() +) + +prod5Task = cms.Task( + prod5CUDA, + prod5 +) diff --git a/HeterogeneousCore/CUDATest/python/prod6Switch_cff.py b/HeterogeneousCore/CUDATest/python/prod6Switch_cff.py new file mode 100644 index 0000000000000..fe05a11d4ce5a --- /dev/null +++ b/HeterogeneousCore/CUDATest/python/prod6Switch_cff.py @@ -0,0 +1,18 @@ +import FWCore.ParameterSet.Config as cms + +from HeterogeneousCore.CUDATest.prod6CPU_cfi import prod6CPU as _prod6CPU +from HeterogeneousCore.CUDATest.prod6CUDA_cfi import prod6CUDA +from HeterogeneousCore.CUDATest.prod6FromCUDA_cfi import prod6FromCUDA as _prod6FromCUDA + +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA + +prod6 = SwitchProducerCUDA( + cpu = _prod6CPU.clone(), + cuda = _prod6FromCUDA.clone() +) + +prod6Task = cms.Task( + prod6CUDA, + prod6 +) + diff --git a/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py new file mode 100644 index 0000000000000..3ae16b4c45b49 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py @@ -0,0 +1,87 @@ +import FWCore.ParameterSet.Config as cms + +from Configuration.ProcessModifiers.gpu_cff import gpu +process = cms.Process("Test") +process.load("FWCore.MessageService.MessageLogger_cfi") +process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi") + +process.source = cms.Source("EmptySource") + +process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(10) ) + +process.options = cms.untracked.PSet( +# numberOfThreads = cms.untracked.uint32(4), + numberOfStreams = cms.untracked.uint32(0) +) +#process.Tracer = cms.Service("Tracer") + +# Flow diagram of the modules +# +# 1 5 +# / \ | +# 2 4 6 +# | +# 3 + +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA + 
+process.load("HeterogeneousCore.CUDATest.prod1Switch_cff") +process.load("HeterogeneousCore.CUDATest.prod5Switch_cff") +process.load("HeterogeneousCore.CUDATest.prod6Switch_cff") + +# GPU producers +from HeterogeneousCore.CUDATest.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst +from HeterogeneousCore.CUDATest.testCUDAProducerGPU_cfi import testCUDAProducerGPU +from HeterogeneousCore.CUDATest.testCUDAProducerGPUEW_cfi import testCUDAProducerGPUEW +from HeterogeneousCore.CUDATest.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU + +process.prod2CUDA = testCUDAProducerGPU.clone(src = "prod1CUDA") +process.prod3CUDA = testCUDAProducerGPU.clone(src = "prod2CUDA") +process.prod4CUDA = testCUDAProducerGPUEW.clone(src = "prod1CUDA") + +# CPU producers, ssiwtched with modules to copy data from GPU to CPU +# (as "on demand" as any other EDProducer, i.e. according to +# consumes() and prefetching). If a separate conversion step is needed +# to get the same data formats as the CPU modules, those are then ones +# that should be replaced-with here. 
+from HeterogeneousCore.CUDATest.testCUDAProducerCPU_cfi import testCUDAProducerCPU +process.prod2 = SwitchProducerCUDA( + cpu = testCUDAProducerCPU.clone(src = "prod1"), + cuda = testCUDAProducerGPUtoCPU.clone(src = "prod2CUDA") +) +process.prod3 = SwitchProducerCUDA( + cpu = testCUDAProducerCPU.clone(src = "prod2"), + cuda = testCUDAProducerGPUtoCPU.clone(src = "prod3CUDA") +) +process.prod4 = SwitchProducerCUDA( + cpu = testCUDAProducerCPU.clone(src = "prod1"), + cuda = testCUDAProducerGPUtoCPU.clone(src = "prod4CUDA") +) + +process.out = cms.OutputModule("AsciiOutputModule", + outputCommands = cms.untracked.vstring( + "keep *_prod3_*_*", + "keep *_prod4_*_*", + "keep *_prod5_*_*", + ), + verbosity = cms.untracked.uint32(0), +) + +process.prod2Task = cms.Task(process.prod2, process.prod2CUDA) +process.prod3Task = cms.Task(process.prod3, process.prod3CUDA) +process.prod4Task = cms.Task(process.prod4, process.prod4CUDA) + +process.t = cms.Task( + process.prod1Task, + process.prod2Task, + process.prod3Task, + process.prod4Task, + process.prod5Task, + process.prod6Task +) +process.p = cms.Path() +process.p.associate(process.t) +process.ep = cms.EndPath(process.out) + +# Example of limiting the number of EDM streams per device +#process.CUDAService.numberOfStreamsPerDevice = 1 From f53da8beb0df858ebd2977489eab4872055044bd Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 30 Jan 2019 20:48:42 +0100 Subject: [PATCH 32/49] Use SwitchProducer for Raw2Cluster --- .../StandardSequences/python/RawToDigi_cff.py | 3 +-- .../python/SiPixelRawToDigi_cfi.py | 14 +++++++++----- .../python/SiPixelClusterizerPreSplitting_cfi.py | 9 +++++++-- 3 files changed, 17 insertions(+), 9 deletions(-) diff --git a/Configuration/StandardSequences/python/RawToDigi_cff.py b/Configuration/StandardSequences/python/RawToDigi_cff.py index 4830da7dbf25a..ed10c78a40c9b 100644 --- a/Configuration/StandardSequences/python/RawToDigi_cff.py +++ 
b/Configuration/StandardSequences/python/RawToDigi_cff.py @@ -71,8 +71,7 @@ RawToDigi_pixelOnly = cms.Sequence(RawToDigiTask_pixelOnly) scalersRawToDigi.scalersInputTag = 'rawDataCollector' -from Configuration.ProcessModifiers.gpu_cff import gpu -(~gpu).toModify(siPixelDigis, InputLabel = 'rawDataCollector') +siPixelDigis.cpu.InputLabel = 'rawDataCollector' #false by default anyways ecalDigis.DoRegional = False ecalDigis.InputLabel = 'rawDataCollector' ecalPreshowerDigis.sourceTag = 'rawDataCollector' diff --git a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py index 63d63145d5f8e..5be42aa22ad2a 100644 --- a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py +++ b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py @@ -1,12 +1,16 @@ import FWCore.ParameterSet.Config as cms -import EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi +from EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi import siPixelRawToDigi as _siPixelRawToDigi -siPixelDigis = EventFilter.SiPixelRawToDigi.siPixelRawToDigi_cfi.siPixelRawToDigi.clone() +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +siPixelDigis = SwitchProducerCUDA( + cpu = _siPixelRawToDigi.clone() +) from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel -phase1Pixel.toModify(siPixelDigis, UsePhase1=True) +phase1Pixel.toModify(siPixelDigis.cpu, UsePhase1=True) from EventFilter.SiPixelRawToDigi.siPixelDigisFromSoA_cfi import siPixelDigisFromSoA as _siPixelDigisFromSoA -_siPixelDigis_gpu = _siPixelDigisFromSoA.clone(digiSrc = "siPixelClustersPreSplitting") from Configuration.ProcessModifiers.gpu_cff import gpu -gpu.toReplaceWith(siPixelDigis, _siPixelDigis_gpu) +gpu.toModify(siPixelDigis, + cuda = _siPixelDigisFromSoA.clone(digiSrc = "siPixelClustersPreSplitting") +) diff --git a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py 
b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py index b95d97f227aa4..661d82f41c54a 100644 --- a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py +++ b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py @@ -2,8 +2,13 @@ from CondTools.SiPixel.SiPixelGainCalibrationService_cfi import * from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizer_cfi import siPixelClusters as _siPixelClusters -siPixelClustersPreSplitting = _siPixelClusters.clone() +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +siPixelClustersPreSplitting = SwitchProducerCUDA( + cpu = _siPixelClusters.clone() +) from Configuration.ProcessModifiers.gpu_cff import gpu from RecoLocalTracker.SiPixelClusterizer.siPixelClustersFromSoA_cfi import siPixelClustersFromSoA as _siPixelClustersFromSoA -gpu.toReplaceWith(siPixelClustersPreSplitting, _siPixelClustersFromSoA.clone()) +gpu.toModify(siPixelClustersPreSplitting, + cuda = _siPixelClustersFromSoA.clone() +) From 66eb7cc3e4fd74c6fa5fc596998b4b877a408f2d Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 7 Feb 2019 23:03:31 +0100 Subject: [PATCH 33/49] Use EDAlias in the SwitchProducer in raw2cluster --- .../plugins/SiPixelDigiErrorsFromSoA.cc | 183 ++++++++++++++++ .../plugins/SiPixelDigisFromSoA.cc | 198 ------------------ .../python/SiPixelRawToDigi_cfi.py | 12 +- .../python/siPixelDigis_cff.py | 8 +- ...mSoA.cc => SiPixelDigisClustersFromSoA.cc} | 14 +- .../SiPixelClusterizerPreSplitting_cfi.py | 7 +- .../python/siPixelClustersPreSplitting_cff.py | 3 + 7 files changed, 215 insertions(+), 210 deletions(-) create mode 100644 EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc delete mode 100644 EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc rename RecoLocalTracker/SiPixelClusterizer/plugins/{SiPixelClustersFromSoA.cc => SiPixelDigisClustersFromSoA.cc} (91%) diff --git 
a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc new file mode 100644 index 0000000000000..9e998b92fc403 --- /dev/null +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsFromSoA.cc @@ -0,0 +1,183 @@ +#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" +#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" +#include "DataFormats/Common/interface/DetSetVector.h" +#include "DataFormats/Common/interface/Handle.h" +#include "DataFormats/DetId/interface/DetIdCollection.h" +#include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h" +#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" +#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" +#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" +#include "FWCore/Framework/interface/ESTransientHandle.h" +#include "FWCore/Framework/interface/ESWatcher.h" +#include "FWCore/Framework/interface/EventSetup.h" +#include "FWCore/Framework/interface/Event.h" +#include "FWCore/Framework/interface/MakerMacros.h" +#include "FWCore/Framework/interface/stream/EDProducer.h" +#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" +#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" +#include "FWCore/ParameterSet/interface/ParameterSet.h" + +#include + +class SiPixelDigiErrorsFromSoA: public edm::stream::EDProducer<> { +public: + explicit SiPixelDigiErrorsFromSoA(const edm::ParameterSet& iConfig); + ~SiPixelDigiErrorsFromSoA() override = default; + + static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); + +private: + void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; + + edm::EDGetTokenT digiErrorSoAGetToken_; + + edm::EDPutTokenT> errorPutToken_; + edm::EDPutTokenT tkErrorPutToken_; + edm::EDPutTokenT userErrorPutToken_; + 
edm::EDPutTokenT> disabledChannelPutToken_; + + edm::ESWatcher cablingWatcher_; + std::unique_ptr cabling_; + const std::string cablingMapLabel_; + + const std::vector tkerrorlist_; + const std::vector usererrorlist_; + + const bool usePhase1_; +}; + +SiPixelDigiErrorsFromSoA::SiPixelDigiErrorsFromSoA(const edm::ParameterSet& iConfig): + digiErrorSoAGetToken_{consumes(iConfig.getParameter("digiErrorSoASrc"))}, + errorPutToken_{produces>()}, + tkErrorPutToken_{produces()}, + userErrorPutToken_{produces("UserErrorModules")}, + disabledChannelPutToken_{produces>()}, + cablingMapLabel_(iConfig.getParameter("CablingMapLabel")), + tkerrorlist_(iConfig.getParameter>("ErrorList")), + usererrorlist_(iConfig.getParameter>("UserErrorList")), + usePhase1_(iConfig.getParameter ("UsePhase1")) +{} + +void SiPixelDigiErrorsFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { + edm::ParameterSetDescription desc; + desc.add("digiErrorSoASrc", edm::InputTag("siPixelDigiErrorsSoA")); + desc.add("CablingMapLabel","")->setComment("CablingMap label"); + desc.add("UsePhase1",false)->setComment("## Use phase1"); + desc.add >("ErrorList", std::vector{29})->setComment("## ErrorList: list of error codes used by tracking to invalidate modules"); + desc.add >("UserErrorList", std::vector{40})->setComment("## UserErrorList: list of error codes used by Pixel experts for investigation"); + descriptions.addWithDefaultLabel(desc); +} + +void SiPixelDigiErrorsFromSoA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { + // pack errors into collection + + // initialize cabling map or update if necessary + if (cablingWatcher_.check(iSetup)) { + // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) + edm::ESTransientHandle cablingMap; + iSetup.get().get(cablingMapLabel_, cablingMap); + cabling_ = cablingMap->cablingTree(); + LogDebug("map version:")<< cabling_->version(); + } + + const auto& digiErrors = 
iEvent.get(digiErrorSoAGetToken_); + + + edm::DetSetVector errorcollection{}; + DetIdCollection tkerror_detidcollection{}; + DetIdCollection usererror_detidcollection{}; + edmNew::DetSetVector disabled_channelcollection{}; + + PixelDataFormatter formatter(cabling_.get(), usePhase1_); // for phase 1 & 0 + const PixelDataFormatter::Errors *formatterErrors = digiErrors.formatterErrors(); + assert(formatterErrors != nullptr); + auto errors = *formatterErrors; // make a copy + PixelDataFormatter::DetErrors nodeterrors; + + auto size = digiErrors.size(); + for (auto i = 0U; i < size; i++) { + PixelErrorCompact err = digiErrors.error(i); + if (err.errorType != 0) { + SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); + errors[err.rawId].push_back(error); + } + } + + constexpr uint32_t dummydetid = 0xffffffff; + typedef PixelDataFormatter::Errors::iterator IE; + for (IE is = errors.begin(); is != errors.end(); is++) { + + uint32_t errordetid = is->first; + if (errordetid == dummydetid) {// errors given dummy detId must be sorted by Fed + nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() ); + } + else { + edm::DetSet& errorDetSet = errorcollection.find_or_insert(errordetid); + errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); + // Fill detid of the detectors where there is error AND the error number is listed + // in the configurable error list in the job option cfi. 
+ // Code needs to be here, because there can be a set of errors for each + // entry in the for loop over PixelDataFormatter::Errors + + std::vector disabledChannelsDetSet; + + for (auto const& aPixelError : errorDetSet) { + // For the time being, we extend the error handling functionality with ErrorType 25 + // In the future, we should sort out how the usage of tkerrorlist can be generalized + if (aPixelError.getType() == 25) { + int fedId = aPixelError.getFedId(); + const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); + if (fed) { + cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); + const sipixelobjects::PixelFEDLink* link = fed->link(linkId); + if (link) { + // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it + // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme + PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; + for (unsigned int iRoc = 1; iRoc <= link->numberOfROCs(); iRoc++) { + const sipixelobjects::PixelROC * roc = link->roc(iRoc); + if (roc->idInDetUnit() < ch.roc_first) ch.roc_first = roc->idInDetUnit(); + if (roc->idInDetUnit() > ch.roc_last) ch.roc_last = roc->idInDetUnit(); + } + if (ch.roc_first& errorDetSet = errorcollection.find_or_insert(dummydetid); + errorDetSet.data = nodeterrors; + + iEvent.emplace(errorPutToken_, std::move(errorcollection)); + iEvent.emplace(tkErrorPutToken_, std::move(tkerror_detidcollection)); + iEvent.emplace(userErrorPutToken_, std::move(usererror_detidcollection)); + iEvent.emplace(disabledChannelPutToken_, std::move(disabled_channelcollection)); +} + +DEFINE_FWK_MODULE(SiPixelDigiErrorsFromSoA); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc deleted file mode 100644 index c3522445ac7e5..0000000000000 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisFromSoA.cc +++ /dev/null @@ -1,198 +0,0 @@ 
-#include "CondFormats/DataRecord/interface/SiPixelFedCablingMapRcd.h" -#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingMap.h" -#include "CondFormats/SiPixelObjects/interface/SiPixelFedCablingTree.h" -#include "DataFormats/Common/interface/DetSetVector.h" -#include "DataFormats/Common/interface/Handle.h" -#include "DataFormats/DetId/interface/DetIdCollection.h" -#include "DataFormats/SiPixelDetId/interface/PixelFEDChannel.h" -#include "DataFormats/SiPixelDigi/interface/PixelDigi.h" -#include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" -#include "EventFilter/SiPixelRawToDigi/interface/PixelDataFormatter.h" -#include "FWCore/Framework/interface/ESTransientHandle.h" -#include "FWCore/Framework/interface/ESWatcher.h" -#include "FWCore/Framework/interface/EventSetup.h" -#include "FWCore/Framework/interface/Event.h" -#include "FWCore/Framework/interface/MakerMacros.h" -#include "FWCore/Framework/interface/stream/EDProducer.h" -#include "FWCore/ParameterSet/interface/ConfigurationDescriptions.h" -#include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "FWCore/ParameterSet/interface/ParameterSet.h" - -#include - -class SiPixelDigisFromSoA: public edm::stream::EDProducer<> { -public: - explicit SiPixelDigisFromSoA(const edm::ParameterSet& iConfig); - ~SiPixelDigisFromSoA() override = default; - - static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); - -private: - void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; - - edm::EDGetTokenT digiErrorSoAGetToken_; - edm::EDGetTokenT> digiGetToken_; // for a copy - - edm::EDPutTokenT> digiPutToken_; - edm::EDPutTokenT> errorPutToken_; - edm::EDPutTokenT tkErrorPutToken_; - edm::EDPutTokenT userErrorPutToken_; - edm::EDPutTokenT> disabledChannelPutToken_; - - edm::ESWatcher cablingWatcher_; - std::unique_ptr cabling_; - const std::string cablingMapLabel_; - - const std::vector tkerrorlist_; - const std::vector usererrorlist_; - - const 
bool includeErrors_; - const bool usePhase1_; -}; - -SiPixelDigisFromSoA::SiPixelDigisFromSoA(const edm::ParameterSet& iConfig): - digiGetToken_(consumes>(iConfig.getParameter("digiSrc"))), - digiPutToken_(produces>()), - cablingMapLabel_(iConfig.getParameter("CablingMapLabel")), - tkerrorlist_(iConfig.getParameter>("ErrorList")), - usererrorlist_(iConfig.getParameter>("UserErrorList")), - includeErrors_(iConfig.getParameter("IncludeErrors")), - usePhase1_(iConfig.getParameter ("UsePhase1")) -{ - if(includeErrors_) { - digiErrorSoAGetToken_ = consumes(iConfig.getParameter("digiErrorSoASrc")); - errorPutToken_ = produces>(); - tkErrorPutToken_ = produces(); - userErrorPutToken_ = produces("UserErrorModules"); - disabledChannelPutToken_ = produces>(); - } -} - -void SiPixelDigisFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { - edm::ParameterSetDescription desc; - desc.add("digiSrc", edm::InputTag("siPixelClusters")); - desc.add("digiErrorSoASrc", edm::InputTag("siPixelDigiErrorsSoA")); - desc.add("CablingMapLabel","")->setComment("CablingMap label"); - desc.add("IncludeErrors", true); - desc.add("UsePhase1",false)->setComment("## Use phase1"); - desc.add >("ErrorList", std::vector{29})->setComment("## ErrorList: list of error codes used by tracking to invalidate modules"); - desc.add >("UserErrorList", std::vector{40})->setComment("## UserErrorList: list of error codes used by Pixel experts for investigation"); - descriptions.addWithDefaultLabel(desc); -} - -void SiPixelDigisFromSoA::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - // How could we avoid the copy? 
- iEvent.emplace(digiPutToken_, iEvent.get(digiGetToken_)); - - // pack errors into collection - if (includeErrors_) { - // initialize cabling map or update if necessary - if (cablingWatcher_.check(iSetup)) { - // cabling map, which maps online address (fed->link->ROC->local pixel) to offline (DetId->global pixel) - edm::ESTransientHandle cablingMap; - iSetup.get().get(cablingMapLabel_, cablingMap); - cabling_ = cablingMap->cablingTree(); - LogDebug("map version:")<< cabling_->version(); - } - - const auto& digiErrors = iEvent.get(digiErrorSoAGetToken_); - - auto errorcollection = std::make_unique>(); - auto tkerror_detidcollection = std::make_unique(); - auto usererror_detidcollection = std::make_unique(); - auto disabled_channelcollection = std::make_unique< edmNew::DetSetVector>(); - - PixelDataFormatter formatter(cabling_.get(), usePhase1_); // for phase 1 & 0 - const PixelDataFormatter::Errors *formatterErrors = digiErrors.formatterErrors(); - assert(formatterErrors != nullptr); - auto errors = *formatterErrors; // make a copy - PixelDataFormatter::DetErrors nodeterrors; - - auto size = digiErrors.size(); - for (auto i = 0U; i < size; i++) { - PixelErrorCompact err = digiErrors.error(i); - if (err.errorType != 0) { - SiPixelRawDataError error(err.word, err.errorType, err.fedId + 1200); - errors[err.rawId].push_back(error); - } - } - - constexpr uint32_t dummydetid = 0xffffffff; - typedef PixelDataFormatter::Errors::iterator IE; - for (IE is = errors.begin(); is != errors.end(); is++) { - - uint32_t errordetid = is->first; - if (errordetid == dummydetid) {// errors given dummy detId must be sorted by Fed - nodeterrors.insert( nodeterrors.end(), errors[errordetid].begin(), errors[errordetid].end() ); - } - else { - edm::DetSet& errorDetSet = errorcollection->find_or_insert(errordetid); - errorDetSet.data.insert(errorDetSet.data.end(), is->second.begin(), is->second.end()); - // Fill detid of the detectors where there is error AND the error number is listed - // 
in the configurable error list in the job option cfi. - // Code needs to be here, because there can be a set of errors for each - // entry in the for loop over PixelDataFormatter::Errors - - std::vector disabledChannelsDetSet; - - for (auto const& aPixelError : errorDetSet) { - // For the time being, we extend the error handling functionality with ErrorType 25 - // In the future, we should sort out how the usage of tkerrorlist can be generalized - if (aPixelError.getType() == 25) { - int fedId = aPixelError.getFedId(); - const sipixelobjects::PixelFEDCabling* fed = cabling_->fed(fedId); - if (fed) { - cms_uint32_t linkId = formatter.linkId(aPixelError.getWord32()); - const sipixelobjects::PixelFEDLink* link = fed->link(linkId); - if (link) { - // The "offline" 0..15 numbering is fixed by definition, also, the FrameConversion depends on it - // in contrast, the ROC-in-channel numbering is determined by hardware --> better to use the "offline" scheme - PixelFEDChannel ch = {fed->id(), linkId, 25, 0}; - for (unsigned int iRoc = 1; iRoc <= link->numberOfROCs(); iRoc++) { - const sipixelobjects::PixelROC * roc = link->roc(iRoc); - if (roc->idInDetUnit() < ch.roc_first) ch.roc_first = roc->idInDetUnit(); - if (roc->idInDetUnit() > ch.roc_last) ch.roc_last = roc->idInDetUnit(); - } - if (ch.roc_firstpush_back(errordetid); - } - } - } - - // fill list of detIds with errors to be studied - if (!usererrorlist_.empty()) { - auto it_find = std::find(usererrorlist_.begin(), usererrorlist_.end(), aPixelError.getType()); - if (it_find != usererrorlist_.end()) { - usererror_detidcollection->push_back(errordetid); - } - } - - } // loop on DetSet of errors - - if (!disabledChannelsDetSet.empty()) { - disabled_channelcollection->insert(errordetid, disabledChannelsDetSet.data(), disabledChannelsDetSet.size()); - } - - } // if error assigned to a real DetId - } // loop on errors in event for this FED - - edm::DetSet& errorDetSet = errorcollection->find_or_insert(dummydetid); - 
errorDetSet.data = nodeterrors; - - iEvent.put(errorPutToken_, std::move(errorcollection)); - iEvent.put(tkErrorPutToken_, std::move(tkerror_detidcollection)); - iEvent.put(userErrorPutToken_, std::move(usererror_detidcollection)); - iEvent.put(disabledChannelPutToken_, std::move(disabled_channelcollection)); - - } // if errors to be included in the event -} - -DEFINE_FWK_MODULE(SiPixelDigisFromSoA); diff --git a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py index 5be42aa22ad2a..50c8f0fcabd3c 100644 --- a/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py +++ b/EventFilter/SiPixelRawToDigi/python/SiPixelRawToDigi_cfi.py @@ -9,8 +9,16 @@ from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel phase1Pixel.toModify(siPixelDigis.cpu, UsePhase1=True) -from EventFilter.SiPixelRawToDigi.siPixelDigisFromSoA_cfi import siPixelDigisFromSoA as _siPixelDigisFromSoA from Configuration.ProcessModifiers.gpu_cff import gpu gpu.toModify(siPixelDigis, - cuda = _siPixelDigisFromSoA.clone(digiSrc = "siPixelClustersPreSplitting") + cuda = cms.EDAlias( + siPixelDigiErrors = cms.VPSet( + cms.PSet(type = cms.string("DetIdedmEDCollection")), + cms.PSet(type = cms.string("SiPixelRawDataErroredmDetSetVector")), + cms.PSet(type = cms.string("PixelFEDChanneledmNewDetSetVector")) + ), + siPixelDigisClustersPreSplitting = cms.VPSet( + cms.PSet(type = cms.string("PixelDigiedmDetSetVector")) + ) + ) ) diff --git a/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py index be7e7193926fc..a60dd5de6d0a4 100644 --- a/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py +++ b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py @@ -3,6 +3,7 @@ from EventFilter.SiPixelRawToDigi.SiPixelRawToDigi_cfi import siPixelDigis from EventFilter.SiPixelRawToDigi.siPixelDigisSoAFromCUDA_cfi import siPixelDigisSoAFromCUDA as _siPixelDigisSoAFromCUDA from 
EventFilter.SiPixelRawToDigi.siPixelDigiErrorsSoAFromCUDA_cfi import siPixelDigiErrorsSoAFromCUDA as _siPixelDigiErrorsSoAFromCUDA +from EventFilter.SiPixelRawToDigi.siPixelDigiErrorsFromSoA_cfi import siPixelDigiErrorsFromSoA as _siPixelDigiErrorsFromSoA siPixelDigisTask = cms.Task(siPixelDigis) @@ -12,7 +13,12 @@ siPixelDigiErrorsSoA = _siPixelDigiErrorsSoAFromCUDA.clone( src = "siPixelClustersCUDAPreSplitting" ) -siPixelDigisTaskCUDA = cms.Task(siPixelDigisSoA, siPixelDigiErrorsSoA) +siPixelDigiErrors = _siPixelDigiErrorsFromSoA.clone() +siPixelDigisTaskCUDA = cms.Task( + siPixelDigisSoA, + siPixelDigiErrorsSoA, + siPixelDigiErrors +) from Configuration.ProcessModifiers.gpu_cff import gpu _siPixelDigisTask_gpu = siPixelDigisTask.copy() diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc similarity index 91% rename from RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc rename to RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc index c07d2496a2f9c..4c405a8c85afd 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelClustersFromSoA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelDigisClustersFromSoA.cc @@ -48,10 +48,10 @@ namespace { constexpr uint32_t dummydetid = 0xffffffff; } -class SiPixelClustersFromSoA: public edm::global::EDProducer<> { +class SiPixelDigisClustersFromSoA: public edm::global::EDProducer<> { public: - explicit SiPixelClustersFromSoA(const edm::ParameterSet& iConfig); - ~SiPixelClustersFromSoA() override = default; + explicit SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig); + ~SiPixelDigisClustersFromSoA() override = default; static void fillDescriptions(edm::ConfigurationDescriptions& descriptions); @@ -65,19 +65,19 @@ class SiPixelClustersFromSoA: public edm::global::EDProducer<> { }; -SiPixelClustersFromSoA::SiPixelClustersFromSoA(const edm::ParameterSet& 
iConfig): +SiPixelDigisClustersFromSoA::SiPixelDigisClustersFromSoA(const edm::ParameterSet& iConfig): digiGetToken_(consumes(iConfig.getParameter("src"))), digiPutToken_(produces>()), clusterPutToken_(produces()) {} -void SiPixelClustersFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { +void SiPixelDigisClustersFromSoA::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; desc.add("src", edm::InputTag("siPixelDigisSoA")); descriptions.addWithDefaultLabel(desc); } -void SiPixelClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { +void SiPixelDigisClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { const auto& digis = iEvent.get(digiGetToken_); edm::ESHandle trackerTopologyHandle; @@ -155,4 +155,4 @@ void SiPixelClustersFromSoA::produce(edm::StreamID, edm::Event& iEvent, const ed iEvent.put(clusterPutToken_, std::move(outputClusters)); } -DEFINE_FWK_MODULE(SiPixelClustersFromSoA); +DEFINE_FWK_MODULE(SiPixelDigisClustersFromSoA); diff --git a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py index 661d82f41c54a..b9c6862b015bf 100644 --- a/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py +++ b/RecoLocalTracker/SiPixelClusterizer/python/SiPixelClusterizerPreSplitting_cfi.py @@ -8,7 +8,10 @@ ) from Configuration.ProcessModifiers.gpu_cff import gpu -from RecoLocalTracker.SiPixelClusterizer.siPixelClustersFromSoA_cfi import siPixelClustersFromSoA as _siPixelClustersFromSoA gpu.toModify(siPixelClustersPreSplitting, - cuda = _siPixelClustersFromSoA.clone() + cuda = cms.EDAlias( + siPixelDigisClustersPreSplitting = cms.VPSet( + cms.PSet(type = cms.string("SiPixelClusteredmNewDetSetVector")) + ) + ) ) diff --git 
a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py index 547fb061f96bf..c80f3b16b3a43 100644 --- a/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py +++ b/RecoLocalTracker/SiPixelClusterizer/python/siPixelClustersPreSplitting_cff.py @@ -2,14 +2,17 @@ from RecoLocalTracker.SiPixelClusterizer.SiPixelClusterizerPreSplitting_cfi import siPixelClustersPreSplitting from RecoLocalTracker.SiPixelClusterizer.siPixelRawToClusterCUDA_cfi import siPixelRawToClusterCUDA as _siPixelRawToClusterCUDA +from RecoLocalTracker.SiPixelClusterizer.siPixelDigisClustersFromSoA_cfi import siPixelDigisClustersFromSoA as _siPixelDigisClustersFromSoA from RecoLocalTracker.SiPixelClusterizer.siPixelFedCablingMapGPUWrapper_cfi import * from CalibTracker.SiPixelESProducers.siPixelGainCalibrationForHLTGPU_cfi import * siPixelClustersPreSplittingTask = cms.Task(siPixelClustersPreSplitting) siPixelClustersCUDAPreSplitting = _siPixelRawToClusterCUDA.clone() +siPixelDigisClustersPreSplitting = _siPixelDigisClustersFromSoA.clone() siPixelClustersPreSplittingTaskCUDA = cms.Task( siPixelClustersCUDAPreSplitting, + siPixelDigisClustersPreSplitting, ) from Configuration.ProcessModifiers.gpu_cff import gpu From 13365285f8e5dd1124bb3c44dcd8deb7986eb717 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 11 Feb 2019 20:57:42 +0100 Subject: [PATCH 34/49] Pass device and stream to CUDA constructor instead of CUDAScopedContext Simplifies the structure by removing the circular dependence between CUDA and CUDAScopedContext. 
--- CUDADataFormats/Common/interface/CUDA.h | 11 ++++------- .../CUDACore/interface/CUDAScopedContext.h | 8 ++++---- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/CUDADataFormats/Common/interface/CUDA.h b/CUDADataFormats/Common/interface/CUDA.h index f7a08b0751f92..8a8085b9351ae 100644 --- a/CUDADataFormats/Common/interface/CUDA.h +++ b/CUDADataFormats/Common/interface/CUDA.h @@ -49,15 +49,13 @@ class CUDA { friend class CUDAScopedContext; friend class edm::Wrapper>; - // Using template to break circular dependency - template - explicit CUDA(const Context& ctx, T data): - stream_(ctx.streamPtr()), - event_(std::make_unique(cuda::event::create(ctx.device(), + explicit CUDA(int device, std::shared_ptr> stream, T data): + stream_(std::move(stream)), + event_(std::make_unique(cuda::event::create(device, cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much? cuda::event::dont_record_timings))), // it should be a bit faster to ignore timings data_(std::move(data)), - device_(ctx.device()) + device_(device) { // Record CUDA event to the CUDA stream. The event will become // "occurred" after all work queued to the stream before this @@ -65,7 +63,6 @@ class CUDA { event_->record(stream_->id()); } -private: // The cuda::stream_t is really shared among edm::Event products, so // using shared_ptr also here std::shared_ptr> stream_; //! diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index cb716e4007099..8ba2e2295d379 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -91,18 +91,18 @@ class CUDAScopedContext { } template - std::unique_ptr > wrap(T data) { + std::unique_ptr > wrap(T data) const { // make_unique doesn't work because of private constructor // // CUDA constructor records CUDA event to the CUDA stream. 
The // event will become "occurred" after all work queued to the // stream before this point has been finished. - return std::unique_ptr >(new CUDA(*this, std::move(data))); + return std::unique_ptr >(new CUDA(device(), streamPtr(), std::move(data))); } template - auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) { - return iEvent.emplace(token, *this, std::forward(args)...); + auto emplace(edm::Event& iEvent, edm::EDPutTokenT token, Args&&... args) const { + return iEvent.emplace(token, device(), streamPtr(), std::forward(args)...); } private: From 289fff3ad783dfddab36423a3aeed6879371c6c0 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 11 Feb 2019 21:54:22 +0100 Subject: [PATCH 35/49] Move template-independent members to a CUDABase base class --- CUDADataFormats/Common/BuildFile.xml | 5 +++ CUDADataFormats/Common/interface/CUDA.h | 41 ++++----------------- CUDADataFormats/Common/interface/CUDABase.h | 41 +++++++++++++++++++++ CUDADataFormats/Common/src/CUDABase.cc | 16 ++++++++ HeterogeneousCore/CUDACore/BuildFile.xml | 1 + 5 files changed, 70 insertions(+), 34 deletions(-) create mode 100644 CUDADataFormats/Common/BuildFile.xml create mode 100644 CUDADataFormats/Common/interface/CUDABase.h create mode 100644 CUDADataFormats/Common/src/CUDABase.cc diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml new file mode 100644 index 0000000000000..060edc3875ac1 --- /dev/null +++ b/CUDADataFormats/Common/BuildFile.xml @@ -0,0 +1,5 @@ + + + + + diff --git a/CUDADataFormats/Common/interface/CUDA.h b/CUDADataFormats/Common/interface/CUDA.h index 8a8085b9351ae..ca0446a1a5bd2 100644 --- a/CUDADataFormats/Common/interface/CUDA.h +++ b/CUDADataFormats/Common/interface/CUDA.h @@ -1,10 +1,12 @@ #ifndef CUDADataFormats_Common_CUDA_h #define CUDADataFormats_Common_CUDA_h -#include +#include #include +#include "CUDADataFormats/Common/interface/CUDABase.h" + namespace edm { template class Wrapper; } @@ -25,7 +27,7 
@@ namespace edm { * use them only where synchronization between streams is needed. */ template -class CUDA { +class CUDA: public CUDABase { public: CUDA() = default; // Needed only for ROOT dictionary generation @@ -34,45 +36,16 @@ class CUDA { CUDA(CUDA&&) = default; CUDA& operator=(CUDA&&) = default; - bool isValid() const { return stream_.get() != nullptr; } - - int device() const { return device_; } - - const cuda::stream_t<>& stream() const { return *stream_; } - cuda::stream_t<>& stream() { return *stream_; } - const std::shared_ptr>& streamPtr() const { return stream_; } - - const cuda::event_t& event() const { return *event_; } - cuda::event_t& event() { return *event_; } - private: friend class CUDAScopedContext; friend class edm::Wrapper>; explicit CUDA(int device, std::shared_ptr> stream, T data): - stream_(std::move(stream)), - event_(std::make_unique(cuda::event::create(device, - cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much? - cuda::event::dont_record_timings))), // it should be a bit faster to ignore timings - data_(std::move(data)), - device_(device) - { - // Record CUDA event to the CUDA stream. The event will become - // "occurred" after all work queued to the stream before this - // point has been finished. - event_->record(stream_->id()); - } - - // The cuda::stream_t is really shared among edm::Event products, so - // using shared_ptr also here - std::shared_ptr> stream_; //! - // Using unique_ptr to support the default constructor. Tried - // std::optional, but cuda::event_t has its move assignment - // operators deleted. - std::unique_ptr event_; //! + CUDABase(device, std::move(stream)), + data_(std::move(data)) + {} T data_; //! - int device_ = -1; //! 
}; #endif diff --git a/CUDADataFormats/Common/interface/CUDABase.h b/CUDADataFormats/Common/interface/CUDABase.h new file mode 100644 index 0000000000000..b851e4c6a0144 --- /dev/null +++ b/CUDADataFormats/Common/interface/CUDABase.h @@ -0,0 +1,41 @@ +#ifndef CUDADataFormats_Common_CUDABase_h +#define CUDADataFormats_Common_CUDABase_h + +#include + +#include + +/** + * Base class for all instantiations of CUDA to hold the + * non-T-dependent members. + */ +class CUDABase { +public: + CUDABase() = default; // Needed only for ROOT dictionary generation + + bool isValid() const { return stream_.get() != nullptr; } + + int device() const { return device_; } + + const cuda::stream_t<>& stream() const { return *stream_; } + cuda::stream_t<>& stream() { return *stream_; } + const std::shared_ptr>& streamPtr() const { return stream_; } + + const cuda::event_t& event() const { return *event_; } + cuda::event_t& event() { return *event_; } + +protected: + explicit CUDABase(int device, std::shared_ptr> stream); + + // The cuda::stream_t is really shared among edm::Event products, so + // using shared_ptr also here + std::shared_ptr> stream_; //! + // Using unique_ptr to support the default constructor. Tried + // std::optional, but cuda::event_t has its move assignment + // operators deleted. + std::unique_ptr event_; //! + + int device_ = -1; //! +}; + +#endif diff --git a/CUDADataFormats/Common/src/CUDABase.cc b/CUDADataFormats/Common/src/CUDABase.cc new file mode 100644 index 0000000000000..21c84e87a1ddd --- /dev/null +++ b/CUDADataFormats/Common/src/CUDABase.cc @@ -0,0 +1,16 @@ +#include "CUDADataFormats/Common/interface/CUDABase.h" + +CUDABase::CUDABase(int device, std::shared_ptr> stream): + stream_(std::move(stream)), + event_(std::make_unique(cuda::event::create(device, + cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much? 
+ cuda::event::dont_record_timings))), // it should be a bit faster to ignore timings + device_(device) +{ + // Record CUDA event to the CUDA stream. The event will become + // "occurred" after all work queued to the stream before this + // point has been finished. + event_->record(stream_->id()); +} + + diff --git a/HeterogeneousCore/CUDACore/BuildFile.xml b/HeterogeneousCore/CUDACore/BuildFile.xml index 9f4b814ad644a..dc0066701ece3 100644 --- a/HeterogeneousCore/CUDACore/BuildFile.xml +++ b/HeterogeneousCore/CUDACore/BuildFile.xml @@ -2,6 +2,7 @@ + From 20fc7b6b8699a6e826213373bc03b63e50784c7c Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 22 Feb 2019 19:08:16 +0100 Subject: [PATCH 36/49] Rename CUDA to CUDAProduct --- .../interface/{CUDA.h => CUDAProduct.h} | 24 +++++++++---------- .../{CUDABase.h => CUDAProductBase.h} | 10 ++++---- .../src/{CUDABase.cc => CUDAProductBase.cc} | 4 ++-- .../{test_CUDA.cc => test_CUDAProduct.cc} | 12 +++++----- CUDADataFormats/SiPixelCluster/src/classes.h | 2 +- .../SiPixelCluster/src/classes_def.xml | 4 ++-- CUDADataFormats/SiPixelDigi/src/classes.h | 2 +- .../SiPixelDigi/src/classes_def.xml | 8 +++---- .../plugins/SiPixelDigiErrorsSoAFromCUDA.cc | 6 ++--- .../plugins/SiPixelDigisSoAFromCUDA.cc | 6 ++--- .../CUDACore/interface/CUDAScopedContext.h | 18 +++++++------- .../CUDACore/test/test_CUDAScopedContext.cc | 14 +++++------ .../CUDATest/plugins/TestCUDAProducerGPU.cc | 14 +++++------ .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 10 ++++---- .../plugins/TestCUDAProducerGPUFirst.cc | 6 ++--- .../plugins/TestCUDAProducerGPUtoCPU.cc | 8 +++---- HeterogeneousCore/CUDATest/src/classes.h | 2 +- .../CUDATest/src/classes_def.xml | 4 ++-- .../test/test_TestCUDAProducerGPUFirst.cc | 4 ++-- .../plugins/SiPixelRawToClusterCUDA.cc | 14 +++++------ .../plugins/SiPixelRecHitHeterogeneous.cc | 14 +++++------ .../ClusterTPAssociationHeterogeneous.cc | 8 +++---- 22 files changed, 97 insertions(+), 97 deletions(-) rename 
CUDADataFormats/Common/interface/{CUDA.h => CUDAProduct.h} (59%) rename CUDADataFormats/Common/interface/{CUDABase.h => CUDAProductBase.h} (77%) rename CUDADataFormats/Common/src/{CUDABase.cc => CUDAProductBase.cc} (81%) rename CUDADataFormats/Common/test/{test_CUDA.cc => test_CUDAProduct.cc} (85%) diff --git a/CUDADataFormats/Common/interface/CUDA.h b/CUDADataFormats/Common/interface/CUDAProduct.h similarity index 59% rename from CUDADataFormats/Common/interface/CUDA.h rename to CUDADataFormats/Common/interface/CUDAProduct.h index ca0446a1a5bd2..ca07a344ba2d5 100644 --- a/CUDADataFormats/Common/interface/CUDA.h +++ b/CUDADataFormats/Common/interface/CUDAProduct.h @@ -1,11 +1,11 @@ -#ifndef CUDADataFormats_Common_CUDA_h -#define CUDADataFormats_Common_CUDA_h +#ifndef CUDADataFormats_Common_CUDAProduct_h +#define CUDADataFormats_Common_CUDAProduct_h #include #include -#include "CUDADataFormats/Common/interface/CUDABase.h" +#include "CUDADataFormats/Common/interface/CUDAProductBase.h" namespace edm { template class Wrapper; @@ -27,21 +27,21 @@ namespace edm { * use them only where synchronization between streams is needed. 
*/ template -class CUDA: public CUDABase { +class CUDAProduct: public CUDAProductBase { public: - CUDA() = default; // Needed only for ROOT dictionary generation + CUDAProduct() = default; // Needed only for ROOT dictionary generation - CUDA(const CUDA&) = delete; - CUDA& operator=(const CUDA&) = delete; - CUDA(CUDA&&) = default; - CUDA& operator=(CUDA&&) = default; + CUDAProduct(const CUDAProduct&) = delete; + CUDAProduct& operator=(const CUDAProduct&) = delete; + CUDAProduct(CUDAProduct&&) = default; + CUDAProduct& operator=(CUDAProduct&&) = default; private: friend class CUDAScopedContext; - friend class edm::Wrapper>; + friend class edm::Wrapper>; - explicit CUDA(int device, std::shared_ptr> stream, T data): - CUDABase(device, std::move(stream)), + explicit CUDAProduct(int device, std::shared_ptr> stream, T data): + CUDAProductBase(device, std::move(stream)), data_(std::move(data)) {} diff --git a/CUDADataFormats/Common/interface/CUDABase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h similarity index 77% rename from CUDADataFormats/Common/interface/CUDABase.h rename to CUDADataFormats/Common/interface/CUDAProductBase.h index b851e4c6a0144..9b0f931d9744d 100644 --- a/CUDADataFormats/Common/interface/CUDABase.h +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -1,5 +1,5 @@ -#ifndef CUDADataFormats_Common_CUDABase_h -#define CUDADataFormats_Common_CUDABase_h +#ifndef CUDADataFormats_Common_CUDAProductBase_h +#define CUDADataFormats_Common_CUDAProductBase_h #include @@ -9,9 +9,9 @@ * Base class for all instantiations of CUDA to hold the * non-T-dependent members. 
*/ -class CUDABase { +class CUDAProductBase { public: - CUDABase() = default; // Needed only for ROOT dictionary generation + CUDAProductBase() = default; // Needed only for ROOT dictionary generation bool isValid() const { return stream_.get() != nullptr; } @@ -25,7 +25,7 @@ class CUDABase { cuda::event_t& event() { return *event_; } protected: - explicit CUDABase(int device, std::shared_ptr> stream); + explicit CUDAProductBase(int device, std::shared_ptr> stream); // The cuda::stream_t is really shared among edm::Event products, so // using shared_ptr also here diff --git a/CUDADataFormats/Common/src/CUDABase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc similarity index 81% rename from CUDADataFormats/Common/src/CUDABase.cc rename to CUDADataFormats/Common/src/CUDAProductBase.cc index 21c84e87a1ddd..b09a1d0445c81 100644 --- a/CUDADataFormats/Common/src/CUDABase.cc +++ b/CUDADataFormats/Common/src/CUDAProductBase.cc @@ -1,6 +1,6 @@ -#include "CUDADataFormats/Common/interface/CUDABase.h" +#include "CUDADataFormats/Common/interface/CUDAProductBase.h" -CUDABase::CUDABase(int device, std::shared_ptr> stream): +CUDAProductBase::CUDAProductBase(int device, std::shared_ptr> stream): stream_(std::move(stream)), event_(std::make_unique(cuda::event::create(device, cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much? 
diff --git a/CUDADataFormats/Common/test/test_CUDA.cc b/CUDADataFormats/Common/test/test_CUDAProduct.cc similarity index 85% rename from CUDADataFormats/Common/test/test_CUDA.cc rename to CUDADataFormats/Common/test/test_CUDAProduct.cc index 7754cf2509cf2..2e4da69b4ee26 100644 --- a/CUDADataFormats/Common/test/test_CUDA.cc +++ b/CUDADataFormats/Common/test/test_CUDAProduct.cc @@ -1,6 +1,6 @@ #include "catch.hpp" -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -17,9 +17,9 @@ namespace cudatest { }; } -TEST_CASE("Use of CUDA template", "[CUDACore]") { +TEST_CASE("Use of CUDAProduct template", "[CUDACore]") { SECTION("Default constructed") { - auto foo = CUDA(); + auto foo = CUDAProduct(); REQUIRE(!foo.isValid()); auto bar = std::move(foo); @@ -37,7 +37,7 @@ TEST_CASE("Use of CUDA template", "[CUDACore]") { constexpr int defaultDevice = 0; { auto ctx = cudatest::TestCUDAScopedContext::make(defaultDevice); - std::unique_ptr> dataPtr = ctx.wrap(10); + std::unique_ptr> dataPtr = ctx.wrap(10); auto& data = *dataPtr; SECTION("Construct from CUDAScopedContext") { @@ -48,13 +48,13 @@ TEST_CASE("Use of CUDA template", "[CUDACore]") { } SECTION("Move constructor") { - auto data2 = CUDA(std::move(data)); + auto data2 = CUDAProduct(std::move(data)); REQUIRE(data2.isValid()); REQUIRE(!data.isValid()); } SECTION("Move assignment") { - CUDA data2; + CUDAProduct data2; data2 = std::move(data); REQUIRE(data2.isValid()); REQUIRE(!data.isValid()); diff --git a/CUDADataFormats/SiPixelCluster/src/classes.h b/CUDADataFormats/SiPixelCluster/src/classes.h index c04bec77a3d02..08d46244adc7d 100644 --- a/CUDADataFormats/SiPixelCluster/src/classes.h +++ b/CUDADataFormats/SiPixelCluster/src/classes.h @@ -1,7 +1,7 @@ #ifndef CUDADataFormats_SiPixelCluster_classes_h #define 
CUDADataFormats_SiPixelCluster_classes_h -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/SiPixelCluster/src/classes_def.xml b/CUDADataFormats/SiPixelCluster/src/classes_def.xml index 562783f0db308..ba0706ac4b8aa 100644 --- a/CUDADataFormats/SiPixelCluster/src/classes_def.xml +++ b/CUDADataFormats/SiPixelCluster/src/classes_def.xml @@ -1,4 +1,4 @@ - - + + diff --git a/CUDADataFormats/SiPixelDigi/src/classes.h b/CUDADataFormats/SiPixelDigi/src/classes.h index 188d51277abe5..41b135640b883 100644 --- a/CUDADataFormats/SiPixelDigi/src/classes.h +++ b/CUDADataFormats/SiPixelDigi/src/classes.h @@ -1,7 +1,7 @@ #ifndef CUDADataFormats_SiPixelDigi_classes_h #define CUDADataFormats_SiPixelDigi_classes_h -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" #include "DataFormats/Common/interface/Wrapper.h" diff --git a/CUDADataFormats/SiPixelDigi/src/classes_def.xml b/CUDADataFormats/SiPixelDigi/src/classes_def.xml index b17f57470cc42..9d6816ed3b14c 100644 --- a/CUDADataFormats/SiPixelDigi/src/classes_def.xml +++ b/CUDADataFormats/SiPixelDigi/src/classes_def.xml @@ -1,7 +1,7 @@ - - + + - - + + diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc index 950cf59b53820..de5a1d21ba3c8 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include 
"CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" #include "DataFormats/SiPixelDigi/interface/SiPixelDigiErrorsSoA.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -22,7 +22,7 @@ class SiPixelDigiErrorsSoAFromCUDA: public edm::stream::EDProducer> digiErrorGetToken_; + edm::EDGetTokenT> digiErrorGetToken_; edm::EDPutTokenT digiErrorPutToken_; cudautils::host::unique_ptr data_; @@ -31,7 +31,7 @@ class SiPixelDigiErrorsSoAFromCUDA: public edm::stream::EDProducer>(iConfig.getParameter("src"))), + digiErrorGetToken_(consumes>(iConfig.getParameter("src"))), digiErrorPutToken_(produces()) {} diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index 7f5481b31cd47..ce561f36f477c 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/SiPixelDigi/interface/SiPixelDigisSoA.h" #include "FWCore/Framework/interface/EventSetup.h" @@ -23,7 +23,7 @@ class SiPixelDigisSoAFromCUDA: public edm::stream::EDProducer void acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; - edm::EDGetTokenT> digiGetToken_; + edm::EDGetTokenT> digiGetToken_; edm::EDPutTokenT digiPutToken_; cudautils::host::unique_ptr pdigi_; @@ -35,7 +35,7 @@ class SiPixelDigisSoAFromCUDA: public edm::stream::EDProducer }; SiPixelDigisSoAFromCUDA::SiPixelDigisSoAFromCUDA(const edm::ParameterSet& iConfig): - digiGetToken_(consumes>(iConfig.getParameter("src"))), + digiGetToken_(consumes>(iConfig.getParameter("src"))), digiPutToken_(produces()) {} diff --git 
a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index 8ba2e2295d379..4333a9d072566 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -6,7 +6,7 @@ #include "FWCore/Utilities/interface/Exception.h" #include "FWCore/Utilities/interface/StreamID.h" #include "FWCore/Utilities/interface/EDPutToken.h" -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" #include @@ -35,7 +35,7 @@ class CUDAScopedContext { {} template - explicit CUDAScopedContext(const CUDA& data): + explicit CUDAScopedContext(const CUDAProduct& data): currentDevice_(data.device()), setDeviceForThisScope_(currentDevice_), stream_(data.streamPtr()) @@ -48,7 +48,7 @@ class CUDAScopedContext { } template - explicit CUDAScopedContext(const CUDA& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder): + explicit CUDAScopedContext(const CUDAProduct& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder): CUDAScopedContext(data) { waitingTaskHolder_ = waitingTaskHolder; @@ -67,7 +67,7 @@ class CUDAScopedContext { } template - const T& get(const CUDA& data) { + const T& get(const CUDAProduct& data) { if(data.device() != currentDevice_) { // Eventually replace with prefetch to current device (assuming unified memory works) // If we won't go to unified memory, need to figure out something else... @@ -91,13 +91,13 @@ class CUDAScopedContext { } template - std::unique_ptr > wrap(T data) const { + std::unique_ptr > wrap(T data) const { // make_unique doesn't work because of private constructor // - // CUDA constructor records CUDA event to the CUDA stream. The - // event will become "occurred" after all work queued to the - // stream before this point has been finished. 
- return std::unique_ptr >(new CUDA(device(), streamPtr(), std::move(data))); + // CUDAProduct constructor records CUDA event to the CUDA + // stream. The event will become "occurred" after all work queued + // to the stream before this point has been finished. + return std::unique_ptr >(new CUDAProduct(device(), streamPtr(), std::move(data))); } template diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 824784e0a77c1..885ae4747c359 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -1,6 +1,6 @@ #include "catch.hpp" -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" @@ -18,7 +18,7 @@ namespace cudatest { } namespace { - std::unique_ptr > produce(int device, int *d, int *h) { + std::unique_ptr > produce(int device, int *d, int *h) { auto ctx = cudatest::TestCUDAScopedContext::make(device); cuda::memory::async::copy(d, h, sizeof(int), ctx.stream().id()); @@ -45,15 +45,15 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { REQUIRE(cuda::device::current::get().id() == defaultDevice); } - SECTION("Wrap T to CUDA") { - std::unique_ptr > dataPtr = ctx.wrap(10); + SECTION("Wrap T to CUDAProduct") { + std::unique_ptr > dataPtr = ctx.wrap(10); REQUIRE(dataPtr.get() != nullptr); REQUIRE(dataPtr->device() == ctx.device()); REQUIRE(dataPtr->stream().id() == ctx.stream().id()); } - SECTION("Construct from from CUDA") { - std::unique_ptr> dataPtr = ctx.wrap(10); + SECTION("Construct from from CUDAProduct") { + std::unique_ptr> dataPtr = ctx.wrap(10); const auto& data = *dataPtr; CUDAScopedContext ctx2{data}; @@ -64,7 +64,7 @@ TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { SECTION("Storing state as 
CUDAContextToken") { CUDAContextToken ctxtok; { // acquire - std::unique_ptr> dataPtr = ctx.wrap(10); + std::unique_ptr> dataPtr = ctx.wrap(10); const auto& data = *dataPtr; CUDAScopedContext ctx2{data}; ctxtok = ctx2.toToken(); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc index 0c578b4288664..0fea3c6fb0c93 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc @@ -5,7 +5,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" @@ -21,22 +21,22 @@ class TestCUDAProducerGPU: public edm::global::EDProducer<> { void produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const override; private: std::string label_; - edm::EDGetTokenT> srcToken_; - edm::EDPutTokenT> dstToken_; + edm::EDGetTokenT> srcToken_; + edm::EDPutTokenT> dstToken_; TestCUDAProducerGPUKernel gpuAlgo_; }; TestCUDAProducerGPU::TestCUDAProducerGPU(const edm::ParameterSet& iConfig): label_(iConfig.getParameter("@module_label")), - srcToken_(consumes>(iConfig.getParameter("src"))), - dstToken_(produces>()) + srcToken_(consumes>(iConfig.getParameter("src"))), + dstToken_(produces>()) {} void TestCUDAProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag())->setComment("Source of CUDA."); + desc.add("src", edm::InputTag())->setComment("Source of CUDAProduct."); descriptions.addWithDefaultLabel(desc); - descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. 
It models a GPU algorithm this is not the first algorithm in the chain of the GPU EDProducers. Produces CUDA."); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this is not the first algorithm in the chain of the GPU EDProducers. Produces CUDAProduct."); } void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index 3d63a1cb960c5..e52f43208b0b8 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -5,7 +5,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" @@ -23,8 +23,8 @@ class TestCUDAProducerGPUEW: public edm::stream::EDProducer { void produce(edm::Event& iEvent, const edm::EventSetup& iSetup) override; private: std::string label_; - edm::EDGetTokenT> srcToken_; - edm::EDPutTokenT> dstToken_; + edm::EDGetTokenT> srcToken_; + edm::EDPutTokenT> dstToken_; TestCUDAProducerGPUKernel gpuAlgo_; CUDAContextToken ctxTmp_; cudautils::device::unique_ptr devicePtr_; @@ -33,8 +33,8 @@ class TestCUDAProducerGPUEW: public edm::stream::EDProducer { TestCUDAProducerGPUEW::TestCUDAProducerGPUEW(const edm::ParameterSet& iConfig): label_{iConfig.getParameter("@module_label")}, - srcToken_{consumes>(iConfig.getParameter("src"))}, - dstToken_{produces>()} + srcToken_{consumes>(iConfig.getParameter("src"))}, + dstToken_{produces>()} {} void 
TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc index 187c89ea885f7..b8b533691bde2 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -5,7 +5,7 @@ #include "FWCore/ParameterSet/interface/ParameterSet.h" #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" @@ -27,13 +27,13 @@ class TestCUDAProducerGPUFirst: public edm::global::EDProducer<> { TestCUDAProducerGPUFirst::TestCUDAProducerGPUFirst(const edm::ParameterSet& iConfig): label_(iConfig.getParameter("@module_label")) { - produces>(); + produces>(); } void TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; descriptions.addWithDefaultLabel(desc); - descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this the first algorithm in the chain of the GPU EDProducers. Produces CUDA."); + descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models a GPU algorithm this the first algorithm in the chain of the GPU EDProducers. 
Produces CUDAProduct."); } void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index df71dd2c81001..79d4368b22468 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -6,7 +6,7 @@ #include "FWCore/ParameterSet/interface/ParameterSetDescription.h" #include "FWCore/ServiceRegistry/interface/Service.h" -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" @@ -25,20 +25,20 @@ class TestCUDAProducerGPUtoCPU: public edm::stream::EDProducer> srcToken_; + edm::EDGetTokenT> srcToken_; edm::EDPutTokenT dstToken_; cudautils::host::unique_ptr buffer_; }; TestCUDAProducerGPUtoCPU::TestCUDAProducerGPUtoCPU(const edm::ParameterSet& iConfig): label_{iConfig.getParameter("@module_label")}, - srcToken_{consumes>(iConfig.getParameter("src"))}, + srcToken_{consumes>(iConfig.getParameter("src"))}, dstToken_{produces()} {} void TestCUDAProducerGPUtoCPU::fillDescriptions(edm::ConfigurationDescriptions& descriptions) { edm::ParameterSetDescription desc; - desc.add("src", edm::InputTag())->setComment("Source for CUDA."); + desc.add("src", edm::InputTag())->setComment("Source for CUDAProduct."); descriptions.addWithDefaultLabel(desc); descriptions.setComment("This EDProducer is part of the TestCUDAProducer* family. It models the GPU->CPU data transfer and formatting of the data to legacy data format. 
Produces int, to be compatible with TestCUDAProducerCPU."); } diff --git a/HeterogeneousCore/CUDATest/src/classes.h b/HeterogeneousCore/CUDATest/src/classes.h index 333529467d409..33d9bba2bb9b2 100644 --- a/HeterogeneousCore/CUDATest/src/classes.h +++ b/HeterogeneousCore/CUDATest/src/classes.h @@ -1,3 +1,3 @@ #include "DataFormats/Common/interface/Wrapper.h" -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" diff --git a/HeterogeneousCore/CUDATest/src/classes_def.xml b/HeterogeneousCore/CUDATest/src/classes_def.xml index 5ae2e0f8117fb..bece1ece62a7b 100644 --- a/HeterogeneousCore/CUDATest/src/classes_def.xml +++ b/HeterogeneousCore/CUDATest/src/classes_def.xml @@ -1,4 +1,4 @@ - - + + diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index 4d2eb7c68aebc..450ec98abaf72 100644 --- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -2,7 +2,7 @@ #include "FWCore/TestProcessor/interface/TestProcessor.h" #include "FWCore/Utilities/interface/Exception.h" -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" @@ -76,7 +76,7 @@ process.moduleToTest(process.toTest) SECTION("Produce") { edm::test::TestProcessor tester{config}; auto event = tester.test(); - auto prod = event.get >(); + auto prod = event.get >(); REQUIRE(prod->device() == defaultDevice); auto ctx = CUDAScopedContext(*prod); const CUDAThing& thing = ctx.get(*prod); diff --git a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc index 
659da17f46889..5dc04009f4832 100644 --- a/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc +++ b/RecoLocalTracker/SiPixelClusterizer/plugins/SiPixelRawToClusterCUDA.cc @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigiErrorsCUDA.h" @@ -47,9 +47,9 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer edm::EDGetTokenT rawGetToken_; - edm::EDPutTokenT> digiPutToken_; - edm::EDPutTokenT> digiErrorPutToken_; - edm::EDPutTokenT> clusterPutToken_; + edm::EDPutTokenT> digiPutToken_; + edm::EDPutTokenT> digiErrorPutToken_; + edm::EDPutTokenT> clusterPutToken_; CUDAContextToken ctxTmp_; @@ -72,8 +72,8 @@ class SiPixelRawToClusterCUDA: public edm::stream::EDProducer SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfig): rawGetToken_(consumes(iConfig.getParameter("InputLabel"))), - digiPutToken_(produces>()), - clusterPutToken_(produces>()), + digiPutToken_(produces>()), + clusterPutToken_(produces>()), cablingMapLabel_(iConfig.getParameter("CablingMapLabel")), includeErrors_(iConfig.getParameter("IncludeErrors")), useQuality_(iConfig.getParameter("UseQualityInfo")), @@ -81,7 +81,7 @@ SiPixelRawToClusterCUDA::SiPixelRawToClusterCUDA(const edm::ParameterSet& iConfi convertADCtoElectrons_(iConfig.getParameter("ConvertADCtoElectrons")) { if(includeErrors_) { - digiErrorPutToken_ = produces>(); + digiErrorPutToken_ = produces>(); } // regions diff --git a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc index 40b80ff6ca4f6..d8e07667f976b 100644 --- a/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc +++ 
b/RecoLocalTracker/SiPixelRecHits/plugins/SiPixelRecHitHeterogeneous.cc @@ -1,4 +1,4 @@ -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "CUDADataFormats/SiPixelCluster/interface/SiPixelClustersCUDA.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" @@ -66,8 +66,8 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer tBeamSpot; // The mess with inputs will be cleaned up when migrating to the new framework - edm::EDGetTokenT> token_; - edm::EDGetTokenT> tokenDigi_; + edm::EDGetTokenT> token_; + edm::EDGetTokenT> tokenDigi_; edm::EDGetTokenT clusterToken_; std::string cpeName_; @@ -83,8 +83,8 @@ class SiPixelRecHitHeterogeneous: public HeterogeneousEDProducer(iConfig.getParameter("beamSpot"))), - token_(consumes>(iConfig.getParameter("heterogeneousSrc"))), - tokenDigi_(consumes>(iConfig.getParameter("heterogeneousSrc"))), + token_(consumes>(iConfig.getParameter("heterogeneousSrc"))), + tokenDigi_(consumes>(iConfig.getParameter("heterogeneousSrc"))), cpeName_(iConfig.getParameter("CPE")) { enableConversion_ = iConfig.getParameter("gpuEnableConversion"); @@ -171,7 +171,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i throw cms::Exception("Configuration") << "too bad, not a fast cpe gpu processing not possible...."; } - edm::Handle> hclusters; + edm::Handle> hclusters; iEvent.getByToken(token_, hclusters); // temporary check (until the migration) edm::Service cs; @@ -179,7 +179,7 @@ void SiPixelRecHitHeterogeneous::acquireGPUCuda(const edm::HeterogeneousEvent& i CUDAScopedContext ctx{*hclusters}; auto const& clusters = ctx.get(*hclusters); - edm::Handle> hdigis; + edm::Handle> hdigis; iEvent.getByToken(tokenDigi_, hdigis); auto const& digis = ctx.get(*hdigis); diff --git a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc 
b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc index 188573b6d6d26..e9e271e1e58cc 100644 --- a/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc +++ b/SimTracker/TrackerHitAssociation/plugins/ClusterTPAssociationHeterogeneous.cc @@ -4,7 +4,7 @@ #include -#include "CUDADataFormats/Common/interface/CUDA.h" +#include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "CUDADataFormats/SiPixelDigi/interface/SiPixelDigisCUDA.h" #include "DataFormats/Common/interface/DetSetVector.h" #include "DataFormats/Common/interface/DetSetVectorNew.h" @@ -91,7 +91,7 @@ class ClusterTPAssociationHeterogeneous : public HeterogeneousEDProducer> phase2OTClustersToken_; edm::EDGetTokenT trackingParticleToken_; - edm::EDGetTokenT> tGpuDigis; + edm::EDGetTokenT> tGpuDigis; edm::EDGetTokenT tGpuHits; std::unique_ptr gpuAlgo; @@ -113,7 +113,7 @@ ClusterTPAssociationHeterogeneous::ClusterTPAssociationHeterogeneous(const edm:: stripClustersToken_(consumes>(cfg.getParameter("stripClusterSrc"))), phase2OTClustersToken_(consumes>(cfg.getParameter("phase2OTClusterSrc"))), trackingParticleToken_(consumes(cfg.getParameter("trackingParticleSrc"))), - tGpuDigis(consumes>(cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), + tGpuDigis(consumes>(cfg.getParameter("heterogeneousPixelDigiClusterSrc"))), tGpuHits(consumesHeterogeneous(cfg.getParameter("heterogeneousPixelRecHitSrc"))), doDump(cfg.getParameter("dumpCSV")) { @@ -186,7 +186,7 @@ void ClusterTPAssociationHeterogeneous::acquireGPUCuda(const edm::HeterogeneousE // gpu stuff ------------------------ - edm::Handle> gd; + edm::Handle> gd; iEvent.getByToken(tGpuDigis, gd); // temporary check (until the migration) edm::Service cs; From 5b47953acca3f4a3ef8f2f76a1c888048a3c7204 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 22 Feb 2019 23:06:50 +0100 Subject: [PATCH 37/49] Add overload to get() taking event and token --- 
HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index 4333a9d072566..ae26244da4536 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -5,6 +5,7 @@ #include "FWCore/Framework/interface/Event.h" #include "FWCore/Utilities/interface/Exception.h" #include "FWCore/Utilities/interface/StreamID.h" +#include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/EDPutToken.h" #include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAContextToken.h" @@ -90,6 +91,11 @@ class CUDAScopedContext { return data.data_; } + template + const T& get(const edm::Event& iEvent, edm::EDGetTokenT> token) { + return get(iEvent.get(token)); + } + template std::unique_ptr > wrap(T data) const { // make_unique doesn't work because of private constructor From 781a97e506d1a3e4a4305d01ef23657d3bf50351 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 22 Feb 2019 23:07:22 +0100 Subject: [PATCH 38/49] Use the new get() overload --- .../SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc | 2 +- EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc index de5a1d21ba3c8..d47542528ed86 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigiErrorsSoAFromCUDA.cc @@ -45,7 +45,7 @@ void SiPixelDigiErrorsSoAFromCUDA::acquire(const edm::Event& iEvent, const edm:: // Do the transfer in a CUDA stream parallel to the computation CUDA stream CUDAScopedContext 
ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - const auto& gpuDigiErrors = ctx.get(iEvent.get(digiErrorGetToken_)); + const auto& gpuDigiErrors = ctx.get(iEvent, digiErrorGetToken_); auto tmp = gpuDigiErrors.dataErrorToHostAsync(ctx.stream()); error_ = std::move(tmp.first); diff --git a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc index ce561f36f477c..068701f0bcf07 100644 --- a/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc +++ b/EventFilter/SiPixelRawToDigi/plugins/SiPixelDigisSoAFromCUDA.cc @@ -49,7 +49,7 @@ void SiPixelDigisSoAFromCUDA::acquire(const edm::Event& iEvent, const edm::Event // Do the transfer in a CUDA stream parallel to the computation CUDA stream CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; - const auto& gpuDigis = ctx.get(iEvent.get(digiGetToken_)); + const auto& gpuDigis = ctx.get(iEvent, digiGetToken_); nDigis_ = gpuDigis.nDigis(); pdigi_ = gpuDigis.pdigiToHostAsync(ctx.stream()); From 83ff7cce46005a4124aa78287aa78c3af56a24be Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Thu, 7 Feb 2019 17:21:24 +0100 Subject: [PATCH 39/49] Update README --- HeterogeneousCore/CUDACore/README.md | 326 ++++++++++++++++++++++++++- 1 file changed, 315 insertions(+), 11 deletions(-) diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md index e59fca3e3ed22..a538e3dc53ccb 100644 --- a/HeterogeneousCore/CUDACore/README.md +++ b/HeterogeneousCore/CUDACore/README.md @@ -1,16 +1,71 @@ -# Next iteration of the prototype for CMSSW interface to heterogeneous algorithms +# Prototype for CMSSW interface to CUDA algorithms -## Introduction +## Outline + +* [Introduction](#introduction) + * [Design goals](#design-goals) + * [Overall guidelines](#overall-guidelines) +* [Sub-packages](#sub-packages) +* Examples + * Isolated producer (no CUDA input nor output) + * Producer with CUDA input + * Producer with CUDA output + * Producer with CUDA input 
and output (with ExternalWork) + * Producer with CUDA input and output (without ExternalWork) + * Configuration +* More details -The current prototype with `HeterogeneousEDProducer` and -`HeterogeneousProduct` is documented [here](../Producer/README.md). -The main differences wrt. that are -* Split device-specific code to different EDProducers -* Plug components together in the configuration -This page documents the CUDA integration, and discusses briefly on how -to extend to other devices. It will be extended if/when it gets -deployed and `HeterogeneousEDProducer` retired. +## Introduction + +This page documents the CUDA integration within CMSSW + +### Design goals + +1. Provide a mechanism for a chain of modules to share a resource + * Resource can be e.g. CUDA device memory or a CUDA stream +2. Minimize data movements between the CPU and the device +3. Support multi devices +4. Allow the same job configuration to be used on all hardware combinations + +### Overall guidelines + +1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e. + * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc. + * Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event + * Occasional calls are permitted through a caching allocator + * Avoid `assert()` in device functions, or use `#include HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h` + * With the latter the `assert()`s in CUDA code are disabled by + default, but can be enabled by defining `GPU_DEBUG` macro + (before the aforementioned include) +2. Synchronization needs should be fulfilled with + [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork) + extension to EDProducers + * `ExternalWork` can be used to replace one synchronization point + (e.g. between device kernels and copying a known amount of data + back to CPU). 
+ * For further synchronization points (e.g. copying data whose + amount is known only at the device side), split the work to + multiple `ExternalWork` producers. This approach has the added + benefit that e.g. data transfers to CPU become on-demand automatically + * A general breakdown of the possible steps: + * Convert input legacy CPU data format to CPU SoA + * Transfer input CPU SoA to GPU + * Run kernels + * Transfer the number of output elements to CPU + * Transfer the output data from GPU to CPU SoA + * Convert the output SoA to legacy GPU data formats +3. Within `acquire()`/`produce()`, the CUDA device is set implicitly + and the CUDA stream is provided by the system (with + `CUDAScopedContext`) + * It is strongly recommended to use the provided CUDA stream for all operations + * If that is not feasible for some reason, the provided CUDA + stream must synchronize with the work queued on other CUDA + streams (with CUDA events and `cudaStreamWaitEvent()`) +4. Outside of `acquire()`/`produce()`, CUDA API functions may be + called only if `CUDAService::enabled()` returns `true`. + * With point 3 it follows that in these cases multiple devices have + to be dealt with explicitly, as well as CUDA streams ## Sub-packages * [`HeterogeneousCore/CUDACore`](#cuda-integration) CUDA-specific core components @@ -19,7 +74,256 @@ deployed and `HeterogeneousEDProducer` retired. * [`HeterogeneousCore/CUDATest`](../CUDATest) Test modules and configurations * [`CUDADataFormats/Common`](../../CUDADataFormats/Common) Utilities for event products with CUDA data -# CUDA integration +## Examples + +### Isolated producer (no CUDA input nor output) + +```cpp +class IsolatedProducerCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + ... +private: + ... 
+ IsolatedProducerGPUAlgo gpuAlgo_; + edm::EDGetTokenT inputToken_; + edm::EDPutTokenT outputToken_; +}; +... +void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Sets the current device and creates a CUDA stream + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + auto const& inputData = iEvent.get(inputToken_); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContext::stream() + gpuAlgo_.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Real life is likely more complex than this simple example. Here + // getResult() returns some data in CPU memory that is passed + // directly to the OutputData constructor. + iEvent.emplace(outputToken_, gpuAlgo_.getResult()); +} +``` + +### Producer with CUDA output + +```cpp +class ProducerOutputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + ... +private: + ... + ProducerOutputGPUAlgo gpuAlgo_; + edm::EDGetTokenT inputToken_; + edm::EDPutTokenT> outputToken_; + CUDAContextToken ctxTmp_; +}; +... 
+void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + // Sets the current device and creates a CUDA stream + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + auto const& inputData = iEvent.get(inputToken_); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContext::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Passes the current device and CUDA stream to produce() + // Feels a bit silly, and will hopefully get improved in the future + ctxTmp_ = ctx.toToken(); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Sets again the current device, uses the CUDA stream created in the acquire() + CUDAScopedContext ctx{std::move(ctxTmp_)}; + + // Now getResult() returns data in GPU memory that is passed to the + // constructor of OutputData. CUDAScopedContext::emplace() wraps the + // OutputData to CUDAProduct. CUDAProduct stores also + // the current device and the CUDA stream since those will be needed + // in the consumer side. + ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); +} +``` + +### Producer with CUDA input + +```cpp +class ProducerInputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + ... +private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT outputToken_; +}; +... 
+void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and also use the same CUDA stream + CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + + // Alternatively, if e.g. there is another module queuing + // independent work to the CUDA stream, a new CUDA stream can also be + // created here with + CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContext holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContext::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void ProducerInputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Real life is likely more complex than this simple example. Here + // getResult() returns some data in CPU memory that is passed + // directly to the OutputData constructor. + iEvent.emplace(outputToken_, gpuAlgo_.getResult()); +} +``` + +### Producer with CUDA input and output (with ExternalWork) + +```cpp +class ProducerInputOutputCUDA: public edm::stream::EDProducer { +public: + ... + void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + ... 
+private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; + CUDAContextToken ctxTmp_; +}; +... +void ProducerInputOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and also use the same CUDA stream + CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContext holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContext::stream() + gpuAlgo.makeAsync(inputData, ctx.stream()); + + // Passes the current device and CUDA stream to produce() + // Feels a bit silly, and will hopefully get improved in the future + ctxTmp_ = ctx.toToken(); + +// Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} + +// Called after the asynchronous work has finished +void ProducerInputOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { + // Sets again the current device, uses the CUDA stream created in the acquire() + CUDAScopedContext ctx{std::move(ctxTmp_)}; + + // Now getResult() returns data in GPU memory that is passed to the + // constructor of OutputData. CUDAScopedContext::emplace() wraps the + // OutputData to CUDAProduct. CUDAProduct stores also + // the current device and the CUDA stream since those will be needed + // in the consumer side. 
+ ctx.emplace(iEvent, outputToken_, gpuAlgo.getResult()); +} +``` + +### Producer with CUDA input and output (without ExternalWork) + +If the producer does not need to transfer anything back to CPU (like +the number of output elements), the `ExternalWork` extension is not +needed as there is no need to synchronize. + +```cpp +class ProducerInputOutputCUDA: public edm::global::EDProducer<> { +public: + ... + void produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const override; + ... +private: + ... + ProducerInputGPUAlgo gpuAlgo_; + edm::EDGetTokenT> inputToken_; + edm::EDPutTokenT> outputToken_; +}; +... +void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent, edm::EventSetup& iSetup) const { + CUDAProduct const& inputDataWrapped = iEvent.get(inputToken_); + + // Set the current device to the same that was used to produce + // InputData, and also use the same CUDA stream + CUDAScopedContext ctx{streamID}; + + // Grab the real input data. Checks that the input data is on the + // current device. If the input data was produced in a different CUDA + // stream than the CUDAScopedContext holds, create an inter-stream + // synchronization point with CUDA event and cudaStreamWaitEvent() + auto const& inputData = ctx.get(inputDataWrapped); + + // Queues asynchronous data transfers and kernels to the CUDA stream + // returned by CUDAScopedContext::stream(). Here makeAsync() also + // returns data in GPU memory that is passed to the constructor of + // OutputData. CUDAScopedContext::emplace() wraps the OutputData to + // CUDAProduct. CUDAProduct stores also the current + // device and the CUDA stream since those will be needed in the + // consumer side. 
+ ctx.emplace(iEvent, outputToken_, gpuAlgo.makeAsync(inputData, ctx.stream())); + + // Destructor of ctx queues a callback to the CUDA stream notifying + // waitingTaskHolder when the queued asynchronous work has finished +} +``` + +### Configuration + +```python +``` + + +################################################## ## Choosing device From b3f9dbf3d454764ecd34983067510a076bcc47f9 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Tue, 5 Mar 2019 16:11:00 +0100 Subject: [PATCH 40/49] Use exitSansCUDADevices --- .../Common/test/test_CUDAProduct.cc | 10 ++----- .../CUDACore/test/test_CUDAScopedContext.cc | 10 ++----- .../test/test_TestCUDAProducerGPUFirst.cc | 26 +++++++------------ .../CUDAUtilities/test/copyAsync_t.cpp | 10 ++----- .../CUDAUtilities/test/memsetAsync_t.cpp | 12 +++------ 5 files changed, 19 insertions(+), 49 deletions(-) diff --git a/CUDADataFormats/Common/test/test_CUDAProduct.cc b/CUDADataFormats/Common/test/test_CUDAProduct.cc index 2e4da69b4ee26..bd5ddf7f512fe 100644 --- a/CUDADataFormats/Common/test/test_CUDAProduct.cc +++ b/CUDADataFormats/Common/test/test_CUDAProduct.cc @@ -3,6 +3,7 @@ #include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include @@ -25,14 +26,7 @@ TEST_CASE("Use of CUDAProduct template", "[CUDACore]") { auto bar = std::move(foo); } - int deviceCount = 0; - auto ret = cudaGetDeviceCount( &deviceCount ); - if( ret != cudaSuccess ) { - WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" - << ret << ") " << cudaGetErrorString( ret ) - << ".
Ignoring tests requiring device to be present."); - return; - } + exitSansCUDADevices(); constexpr int defaultDevice = 0; { diff --git a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc index 885ae4747c359..eda2b94f5dfb4 100644 --- a/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/test/test_CUDAScopedContext.cc @@ -3,6 +3,7 @@ #include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include "test_CUDAScopedContextKernels.h" @@ -28,14 +29,7 @@ namespace { } TEST_CASE("Use of CUDAScopedContext", "[CUDACore]") { - int deviceCount = 0; - auto ret = cudaGetDeviceCount( &deviceCount ); - if( ret != cudaSuccess ) { - WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" - << ret << ") " << cudaGetErrorString( ret ) - << ". 
Ignoring tests requiring device to be present."); - return; - } + exitSansCUDADevices(); constexpr int defaultDevice = 0; { diff --git a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc index 450ec98abaf72..f19bd24813fbf 100644 --- a/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/test/test_TestCUDAProducerGPUFirst.cc @@ -5,6 +5,7 @@ #include "CUDADataFormats/Common/interface/CUDAProduct.h" #include "HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h" #include "HeterogeneousCore/CUDATest/interface/CUDAThing.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" #include @@ -19,18 +20,18 @@ process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst") process.moduleToTest(process.toTest) )_" }; - - edm::test::TestProcessor::Config config{ baseConfig }; + + edm::test::TestProcessor::Config config{ baseConfig }; SECTION("base configuration is OK") { REQUIRE_NOTHROW(edm::test::TestProcessor(config)); } - + SECTION("No event data") { edm::test::TestProcessor tester(config); REQUIRE_NOTHROW(tester.test()); } - + SECTION("beginJob and endJob only") { edm::test::TestProcessor tester(config); @@ -39,13 +40,13 @@ process.moduleToTest(process.toTest) SECTION("Run with no LuminosityBlocks") { edm::test::TestProcessor tester(config); - + REQUIRE_NOTHROW(tester.testRunWithNoLuminosityBlocks()); } SECTION("LuminosityBlock with no Events") { edm::test::TestProcessor tester(config); - + REQUIRE_NOTHROW(tester.testLuminosityBlockWithNoEvents()); } @@ -60,16 +61,9 @@ process.toTest = cms.EDProducer("TestCUDAProducerGPUFirst") process.moduleToTest(process.toTest) )_" }; - edm::test::TestProcessor::Config config{ baseConfig }; - - int deviceCount = 0; - auto ret = cudaGetDeviceCount( &deviceCount ); - if( ret != cudaSuccess ) { - WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" - << ret << ") " << 
cudaGetErrorString( ret ) - << ". Ignoring tests requiring device to be present."); - return; - } + edm::test::TestProcessor::Config config{ baseConfig }; + + exitSansCUDADevices(); constexpr int defaultDevice = 0; diff --git a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp index 385e2925ac923..adfe833c5dcb4 100644 --- a/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/copyAsync_t.cpp @@ -5,6 +5,7 @@ #include "FWCore/ServiceRegistry/interface/ActivityRegistry.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" namespace { CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) { @@ -16,14 +17,7 @@ namespace { } TEST_CASE("copyAsync", "[cudaMemTools]") { - int deviceCount = 0; - auto ret = cudaGetDeviceCount( &deviceCount ); - if( ret != cudaSuccess ) { - WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" - << ret << ") " << cudaGetErrorString( ret ) - << ". 
Ignoring tests requiring device to be present."); - return; - } + exitSansCUDADevices(); edm::ActivityRegistry ar; edm::ParameterSet ps; diff --git a/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp b/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp index 06ce3dc1ad726..dc116007ae925 100644 --- a/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp +++ b/HeterogeneousCore/CUDAUtilities/test/memsetAsync_t.cpp @@ -6,6 +6,7 @@ #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/copyAsync.h" #include "HeterogeneousCore/CUDAUtilities/interface/memsetAsync.h" +#include "HeterogeneousCore/CUDAUtilities/interface/exitSansCUDADevices.h" namespace { CUDAService makeCUDAService(edm::ParameterSet ps, edm::ActivityRegistry& ar) { @@ -17,14 +18,7 @@ namespace { } TEST_CASE("memsetAsync", "[cudaMemTools]") { - int deviceCount = 0; - auto ret = cudaGetDeviceCount( &deviceCount ); - if( ret != cudaSuccess ) { - WARN("Unable to query the CUDA capable devices from the CUDA runtime API: (" - << ret << ") " << cudaGetErrorString( ret ) - << ". 
Ignoring tests requiring device to be present."); - return; - } + exitSansCUDADevices(); edm::ActivityRegistry ar; edm::ParameterSet ps; @@ -61,7 +55,7 @@ TEST_CASE("memsetAsync", "[cudaMemTools]") { cudautils::memsetAsync(device, 0, N, stream); cudautils::copyAsync(host, device, N, stream); stream.synchronize(); - + for(int i=0; i < N; ++i) { CHECK(host[i] == 0); } From 770df81032e9fc38d1112ed3dcfa28768aa9b1d0 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 6 Mar 2019 16:02:55 +0100 Subject: [PATCH 41/49] Disable running PixelTriplets_InvPrbl_prec as it expects input from stdin --- RecoPixelVertexing/PixelTriplets/test/BuildFile.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml index 9f5d10ad020e9..767d140a5d5ed 100644 --- a/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml +++ b/RecoPixelVertexing/PixelTriplets/test/BuildFile.xml @@ -18,6 +18,7 @@ + From 2bb5412dce1bb8fce7d98ba013e5882451b02d95 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 6 Mar 2019 16:20:51 +0100 Subject: [PATCH 42/49] Run testCUDASwitch_cfg.py as a unit test --- .../CUDATest/plugins/TestCUDAProducerCPU.cc | 6 +++--- .../CUDATest/plugins/TestCUDAProducerGPU.cc | 4 ++-- .../CUDATest/plugins/TestCUDAProducerGPUEW.cc | 8 ++++---- .../CUDATest/plugins/TestCUDAProducerGPUFirst.cc | 4 ++-- .../CUDATest/plugins/TestCUDAProducerGPUKernel.cu | 4 ++-- .../CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc | 8 ++++---- HeterogeneousCore/CUDATest/test/BuildFile.xml | 5 +++++ HeterogeneousCore/CUDATest/test/TestCUDATest.cc | 3 +++ HeterogeneousCore/CUDATest/test/runtests.sh | 11 +++++++++++ .../CUDATest/test/testCUDASwitch_cfg.py | 12 ++++++++++-- 10 files changed, 46 insertions(+), 19 deletions(-) create mode 100644 HeterogeneousCore/CUDATest/test/TestCUDATest.cc create mode 100755 HeterogeneousCore/CUDATest/test/runtests.sh diff --git 
a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc index 79eb6eeead4a4..30131f796a32c 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerCPU.cc @@ -41,7 +41,7 @@ void TestCUDAProducerCPU::fillDescriptions(edm::ConfigurationDescriptions& descr } void TestCUDAProducerCPU::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::LogPrint("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce begin event " << iEvent.id().event() << " stream " << id; + edm::LogVerbatim("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce begin event " << iEvent.id().event() << " stream " << id; int input = 0; if(!srcToken_.isUninitialized()) { @@ -52,14 +52,14 @@ void TestCUDAProducerCPU::produce(edm::StreamID id, edm::Event& iEvent, const ed std::mt19937 gen(r()); auto dist = std::uniform_real_distribution<>(0.2, 1.5); auto dur = dist(gen); - edm::LogPrint("TestCUDAProducerCPU") << " Task (CPU) for event " << iEvent.id().event() << " in stream " << id << " will take " << dur << " seconds"; + edm::LogVerbatim("TestCUDAProducerCPU") << " Task (CPU) for event " << iEvent.id().event() << " in stream " << id << " will take " << dur << " seconds"; std::this_thread::sleep_for(std::chrono::seconds(1)*dur); const unsigned int output = input + id*100 + iEvent.id().event(); iEvent.emplace(dstToken_, output); - edm::LogPrint("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce end event " << iEvent.id().event() << " stream " << id << " result " << output; + edm::LogVerbatim("TestCUDAProducerCPU") << label_ << " TestCUDAProducerCPU::produce end event " << iEvent.id().event() << " stream " << id << " result " << output; } DEFINE_FWK_MODULE(TestCUDAProducerCPU); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc 
b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc index 0fea3c6fb0c93..fc74b714f22c5 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPU.cc @@ -40,7 +40,7 @@ void TestCUDAProducerGPU::fillDescriptions(edm::ConfigurationDescriptions& descr } void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::LogPrint("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + edm::LogVerbatim("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); const auto& in = iEvent.get(srcToken_); CUDAScopedContext ctx{in}; @@ -48,7 +48,7 @@ void TestCUDAProducerGPU::produce(edm::StreamID streamID, edm::Event& iEvent, co ctx.emplace(iEvent, dstToken_, CUDAThing{gpuAlgo_.runAlgo(label_, input.get(), ctx.stream())}); - edm::LogPrint("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); + edm::LogVerbatim("TestCUDAProducerGPU") << label_ << " TestCUDAProducerGPU::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } DEFINE_FWK_MODULE(TestCUDAProducerGPU); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc index e52f43208b0b8..b084ad8920c20 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUEW.cc @@ -44,7 +44,7 @@ void TestCUDAProducerGPUEW::fillDescriptions(edm::ConfigurationDescriptions& des } void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " 
TestCUDAProducerGPUEW::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); const auto& in = iEvent.get(srcToken_); CUDAScopedContext ctx{in, std::move(waitingTaskHolder)}; @@ -56,19 +56,19 @@ void TestCUDAProducerGPUEW::acquire(const edm::Event& iEvent, const edm::EventSe // event. cuda::memory::async::copy(&hostData_, devicePtr_.get()+10, sizeof(float), ctx.stream().id()); - edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); ctxTmp_ = ctx.toToken(); } void TestCUDAProducerGPUEW::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << hostData_; + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID() << " 10th element " << hostData_; CUDAScopedContext ctx{std::move(ctxTmp_)}; ctx.emplace(iEvent, dstToken_, std::move(devicePtr_)); - edm::LogPrint("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); + edm::LogVerbatim("TestCUDAProducerGPUEW") << label_ << " TestCUDAProducerGPUEW::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } DEFINE_FWK_MODULE(TestCUDAProducerGPUEW); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc 
index b8b533691bde2..2c78e2730be8b 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUFirst.cc @@ -37,14 +37,14 @@ void TestCUDAProducerGPUFirst::fillDescriptions(edm::ConfigurationDescriptions& } void TestCUDAProducerGPUFirst::produce(edm::StreamID streamID, edm::Event& iEvent, const edm::EventSetup& iSetup) const { - edm::LogPrint("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); CUDAScopedContext ctx{streamID}; cudautils::device::unique_ptr output = gpuAlgo_.runAlgo(label_, ctx.stream()); iEvent.put(ctx.wrap(CUDAThing(std::move(output)))); - edm::LogPrint("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); + edm::LogVerbatim("TestCUDAProducerGPUFirst") << label_ << " TestCUDAProducerGPUFirst::produce end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } DEFINE_FWK_MODULE(TestCUDAProducerGPUFirst); diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu index b24dfb6642a41..0bffb6656f31c 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUKernel.cu @@ -87,7 +87,7 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const auto d_c = cs->make_device_unique(NUM_VALUES, stream); auto current_device = cuda::device::current::get(); - edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label << " GPU launching kernels device " << current_device.id() << " CUDA stream " << stream.id(); + 
edm::LogVerbatim("TestHeterogeneousEDProducerGPU") << " " << label << " GPU launching kernels device " << current_device.id() << " CUDA stream " << stream.id(); vectorAdd<<>>(d_a.get(), d_b.get(), d_c.get(), NUM_VALUES); auto d_ma = cs->make_device_unique(NUM_VALUES*NUM_VALUES, stream); @@ -107,6 +107,6 @@ cudautils::device::unique_ptr TestCUDAProducerGPUKernel::runAlgo(const matrixMulVector<<>>(d_mc.get(), d_b.get(), d_c.get(), NUM_VALUES); - edm::LogPrint("TestHeterogeneousEDProducerGPU") << " " << label << " GPU kernels launched, returning return pointer device " << current_device.id() << " CUDA stream " << stream.id(); + edm::LogVerbatim("TestHeterogeneousEDProducerGPU") << " " << label << " GPU kernels launched, returning return pointer device " << current_device.id() << " CUDA stream " << stream.id(); return d_a; } diff --git a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc index 79d4368b22468..b5653d0af366d 100644 --- a/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc +++ b/HeterogeneousCore/CUDATest/plugins/TestCUDAProducerGPUtoCPU.cc @@ -44,7 +44,7 @@ void TestCUDAProducerGPUtoCPU::fillDescriptions(edm::ConfigurationDescriptions& } void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, const edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - edm::LogPrint("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); const auto& in = iEvent.get(srcToken_); CUDAScopedContext ctx{in, std::move(waitingTaskHolder)}; @@ -55,11 +55,11 @@ void TestCUDAProducerGPUtoCPU::acquire(const edm::Event& iEvent, const edm::Even // Enqueue async copy, continue in produce once finished 
cuda::memory::async::copy(buffer_.get(), device.get(), TestCUDAProducerGPUKernel::NUM_VALUES*sizeof(float), ctx.stream().id()); - edm::LogPrint("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); + edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::acquire end event " << iEvent.id().event() << " stream " << iEvent.streamID(); } void TestCUDAProducerGPUtoCPU::produce(edm::Event& iEvent, const edm::EventSetup& iSetup) { - edm::LogPrint("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); + edm::LogVerbatim("TestCUDAProducerGPUtoCPU") << label_ << " TestCUDAProducerGPUtoCPU::produce begin event " << iEvent.id().event() << " stream " << iEvent.streamID(); int counter = 0; for(int i=0; i + + + + + diff --git a/HeterogeneousCore/CUDATest/test/TestCUDATest.cc b/HeterogeneousCore/CUDATest/test/TestCUDATest.cc new file mode 100644 index 0000000000000..b2991bd18ae57 --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/TestCUDATest.cc @@ -0,0 +1,3 @@ +#include "FWCore/Utilities/interface/TestHelper.h" + +RUNTEST() diff --git a/HeterogeneousCore/CUDATest/test/runtests.sh b/HeterogeneousCore/CUDATest/test/runtests.sh new file mode 100755 index 0000000000000..6817aa8d7ffab --- /dev/null +++ b/HeterogeneousCore/CUDATest/test/runtests.sh @@ -0,0 +1,11 @@ +#!/bin/bash + +function die { echo Failure $1: status $2 ; exit $2 ; } + +pushd ${LOCAL_TMP_DIR} + + echo "*************************************************" + echo "CUDA producer configuration with SwitchProducer" + cmsRun ${LOCAL_TEST_DIR}/testCUDASwitch_cfg.py || die "cmsRun testCUDASwitch_cfg.py 1" $? 
+ +popd diff --git a/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py index 3ae16b4c45b49..6fb75d96ddcc0 100644 --- a/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py +++ b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py @@ -1,5 +1,8 @@ import FWCore.ParameterSet.Config as cms +silent = True +#silent = False + from Configuration.ProcessModifiers.gpu_cff import gpu process = cms.Process("Test") process.load("FWCore.MessageService.MessageLogger_cfi") @@ -7,7 +10,12 @@ process.source = cms.Source("EmptySource") -process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(10) ) +process.maxEvents = cms.untracked.PSet( input = cms.untracked.int32(3) ) +if not silent: + process.maxEvents.input = 10 + process.MessageLogger.cerr.threshold = cms.untracked.string("INFO") + process.MessageLogger.cerr.INFO.limit = process.MessageLogger.cerr.default.limit + process.options = cms.untracked.PSet( # numberOfThreads = cms.untracked.uint32(4), @@ -39,7 +47,7 @@ process.prod3CUDA = testCUDAProducerGPU.clone(src = "prod2CUDA") process.prod4CUDA = testCUDAProducerGPUEW.clone(src = "prod1CUDA") -# CPU producers, ssiwtched with modules to copy data from GPU to CPU +# CPU producers, switched with modules to copy data from GPU to CPU # (as "on demand" as any other EDProducer, i.e. according to # consumes() and prefetching). 
If a separate conversion step is needed # to get the same data formats as the CPU modules, those are then ones From a01c307959f51ae5899fe7bdbc6de59dca0464f6 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 6 Mar 2019 20:11:57 +0100 Subject: [PATCH 43/49] Make member data of CUDAProductBase private --- CUDADataFormats/Common/interface/CUDAProductBase.h | 1 + 1 file changed, 1 insertion(+) diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h index 9b0f931d9744d..61baaea6111d6 100644 --- a/CUDADataFormats/Common/interface/CUDAProductBase.h +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -27,6 +27,7 @@ class CUDAProductBase { protected: explicit CUDAProductBase(int device, std::shared_ptr> stream); +private: // The cuda::stream_t is really shared among edm::Event products, so // using shared_ptr also here std::shared_ptr> stream_; //! From 42e82b9e8a18cdb160b94df17ae111e1d9c2fa2e Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 6 Mar 2019 20:25:03 +0100 Subject: [PATCH 44/49] Customize UsePhase1 flag of siPixelDigiErrors --- EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py index a60dd5de6d0a4..31ba8596bddc6 100644 --- a/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py +++ b/EventFilter/SiPixelRawToDigi/python/siPixelDigis_cff.py @@ -14,6 +14,10 @@ src = "siPixelClustersCUDAPreSplitting" ) siPixelDigiErrors = _siPixelDigiErrorsFromSoA.clone() + +from Configuration.Eras.Modifier_phase1Pixel_cff import phase1Pixel +phase1Pixel.toModify(siPixelDigiErrors, UsePhase1=True) + siPixelDigisTaskCUDA = cms.Task( siPixelDigisSoA, siPixelDigiErrorsSoA, From 78811028a67ebf8ad83dd098b05f745d1b7412f5 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 6 Mar 2019 20:26:45 +0100 Subject: [PATCH 45/49] 
Cleanup CUDAScopedContext --- .../CUDACore/interface/CUDAScopedContext.h | 29 ++++--------------- .../CUDACore/src/CUDAScopedContext.cc | 22 ++++++++++++++ 2 files changed, 28 insertions(+), 23 deletions(-) diff --git a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h index ae26244da4536..ef87d017373f8 100644 --- a/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h +++ b/HeterogeneousCore/CUDACore/interface/CUDAScopedContext.h @@ -3,7 +3,6 @@ #include "FWCore/Concurrency/interface/WaitingTaskWithArenaHolder.h" #include "FWCore/Framework/interface/Event.h" -#include "FWCore/Utilities/interface/Exception.h" #include "FWCore/Utilities/interface/StreamID.h" #include "FWCore/Utilities/interface/EDGetToken.h" #include "FWCore/Utilities/interface/EDPutToken.h" @@ -45,14 +44,14 @@ class CUDAScopedContext { explicit CUDAScopedContext(edm::StreamID streamID, edm::WaitingTaskWithArenaHolder waitingTaskHolder): CUDAScopedContext(streamID) { - waitingTaskHolder_ = waitingTaskHolder; + waitingTaskHolder_ = std::move(waitingTaskHolder); } template explicit CUDAScopedContext(const CUDAProduct& data, edm::WaitingTaskWithArenaHolder waitingTaskHolder): CUDAScopedContext(data) { - waitingTaskHolder_ = waitingTaskHolder; + waitingTaskHolder_ = std::move(waitingTaskHolder); } ~CUDAScopedContext(); @@ -61,7 +60,7 @@ class CUDAScopedContext { cuda::stream_t<>& stream() { return *stream_; } const cuda::stream_t<>& stream() const { return *stream_; } - const std::shared_ptr> streamPtr() const { return stream_; } + const std::shared_ptr>& streamPtr() const { return stream_; } CUDAContextToken toToken() { return CUDAContextToken(currentDevice_, stream_); @@ -69,25 +68,7 @@ class CUDAScopedContext { template const T& get(const CUDAProduct& data) { - if(data.device() != currentDevice_) { - // Eventually replace with prefetch to current device (assuming unified memory works) - // If we won't go to unified memory, need to 
figure out something else... - throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported"; - } - - if(data.stream().id() != stream_->id()) { - // Different streams, need to synchronize - if(!data.event().has_occurred()) { - // Event not yet occurred, so need to add synchronization - // here. Sychronization is done by making the CUDA stream to - // wait for an event, so all subsequent work in the stream - // will run only after the event has "occurred" (i.e. data - // product became available). - auto ret = cudaStreamWaitEvent(stream_->id(), data.event().id(), 0); - cuda::throw_if_error(ret, "Failed to make a stream to wait for an event"); - } - } - + synchronizeStreams(data.device(), data.stream(), data.event()); return data.data_; } @@ -117,6 +98,8 @@ class CUDAScopedContext { // This construcor is only meant for testing explicit CUDAScopedContext(int device, std::unique_ptr> stream); + void synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, const cuda::event_t& dataEvent); + int currentDevice_; std::optional waitingTaskHolder_; cuda::device::current::scoped_override_t<> setDeviceForThisScope_; diff --git a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc index bbb890063fdae..a29fbee36865f 100644 --- a/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc +++ b/HeterogeneousCore/CUDACore/src/CUDAScopedContext.cc @@ -2,6 +2,7 @@ #include "FWCore/MessageLogger/interface/MessageLogger.h" #include "FWCore/ServiceRegistry/interface/Service.h" +#include "FWCore/Utilities/interface/Exception.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "chooseCUDADevice.h" @@ -43,3 +44,24 @@ CUDAScopedContext::~CUDAScopedContext() { }); } } + +void CUDAScopedContext::synchronizeStreams(int dataDevice, const cuda::stream_t<>& dataStream, const cuda::event_t& dataEvent) { + if(dataDevice != currentDevice_) { + // Eventually replace with prefetch 
to current device (assuming unified memory works) + // If we won't go to unified memory, need to figure out something else... + throw cms::Exception("LogicError") << "Handling data from multiple devices is not yet supported"; + } + + if(dataStream.id() != stream_->id()) { + // Different streams, need to synchronize + if(!dataEvent.has_occurred()) { + // Event not yet occurred, so need to add synchronization + // here. Sychronization is done by making the CUDA stream to + // wait for an event, so all subsequent work in the stream + // will run only after the event has "occurred" (i.e. data + // product became available). + auto ret = cudaStreamWaitEvent(stream_->id(), dataEvent.id(), 0); + cuda::throw_if_error(ret, "Failed to make a stream to wait for an event"); + } + } +} From 6d02251a405f24987af50e00d3839b43f374565f Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Wed, 6 Mar 2019 16:46:56 -0600 Subject: [PATCH 46/49] Finalize the README --- HeterogeneousCore/CUDACore/README.md | 331 ++++++++++++++------------- 1 file changed, 178 insertions(+), 153 deletions(-) diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md index a538e3dc53ccb..9cf36521e44cb 100644 --- a/HeterogeneousCore/CUDACore/README.md +++ b/HeterogeneousCore/CUDACore/README.md @@ -1,19 +1,35 @@ -# Prototype for CMSSW interface to CUDA algorithms +# CUDA algorithms in CMSSW ## Outline -* [Introduction](introduction) - * [Design goals]() - * [Overall guidelines]() -* [Sub-packages]() -* Examples - * Isolated producer (no CUDA input nor output) - * Producer with CUDA input - * Producer with CUDA output - * Produder with CUDA input and output (with ExternalWork) - * Producer with CUDA input and output (without ExternalWork) - * Configuration -* More details +* [Introduction](#introduction) + * [Design goals](#design-goals) + * [Overall guidelines](#overall-guidelines) +* [Sub-packages](#sub-packages) +* [Examples](#examples) + * [Isolated producer (no CUDA input 
nor output)](#isolated-producer-no-cuda-input-nor-output) + * [Producer with CUDA input](#producer-with-cuda-input) + * [Producer with CUDA output](#producer-with-cuda-output) + * [Producer with CUDA input and output (with ExternalWork)](#producer-with-cuda-input-and-output-with-externalwork) + * [Producer with CUDA input and output (without ExternalWork)](#producer-with-cuda-input-and-output-without-externalwork) + * [Configuration](#configuration) + * [GPU-only configuration](#gpu-only-configuration) + * [Automatic switching between CPU and GPU modules](#automatic-switching-between-cpu-and-gpu-modules) +* [More details](#more-details) + * [Device choice](#device-choice) + * [Data model](#data-model) + * [CUDA EDProducer](#cuda-edproducer) + * [Class declaration](#class-declaration) + * [Memory allocation](#memory-allocation) + * [Caching allocator](#caching-allocator) + * [CUDA API](#cuda-api) + * [Setting the current device](#setting-the-current-device) + * [Getting input](#getting-input) + * [Calling the CUDA kernels](#calling-the-cuda-kernels) + * [Putting output](#putting-output) + * [`ExternalWork` extension](#externalwork-extension) + * [Transferring GPU data to CPU](#transferring-gpu-data-to-cpu) + * [Synchronizing between CUDA streams](#synchronizing-between-cuda-streams) ## Introduction @@ -25,7 +41,7 @@ This page documents the CUDA integration within CMSSW 1. Provide a mechanism for a chain of modules to share a resource * Resource can be e.g. CUDA device memory or a CUDA stream 2. Minimize data movements between the CPU and the device -3. Support multi devices +3. Support multiple devices 4. Allow the same job configuration to be used on all hardware combinations ### Overall guidelines @@ -33,10 +49,10 @@ This page documents the CUDA integration within CMSSW 1. Within the `acquire()`/`produce()` functions all CUDA operations should be asynchronous, i.e. * Use `cudaMemcpyAsync()`, `cudaMemsetAsync()`, `cudaMemPrefetchAsync()` etc.
* Avoid `cudaMalloc*()`, `cudaHostAlloc()`, `cudaFree*()`, `cudaHostRegister()`, `cudaHostUnregister()` on every event - * Occasional calls are permitted through a caching allocator + * Occasional calls are permitted through a caching mechanism that amortizes the cost (see also [Caching allocator](#caching-allocator)) * Avoid `assert()` in device functions, or use `#include HeterogeneousCore/CUDAUtilities/interface/cuda_assert.h` - * With the latter the `assert()`s in CUDA code are disabled by - default, but can be enabled by defining `GPU_DEBUG` macro + * With the latter the `assert()` calls in CUDA code are disabled by + default, but can be enabled by defining a `GPU_DEBUG` macro (before the aforementioned include) 2. Synchronization needs should be fulfilled with [`ExternalWork`](https://twiki.cern.ch/twiki/bin/view/CMSPublic/FWMultithreadedFrameworkStreamModuleInterface#edm_ExternalWork) @@ -51,12 +67,12 @@ This page documents the CUDA integration within CMSSW * A general breakdown of the possible steps: * Convert input legacy CPU data format to CPU SoA * Transfer input CPU SoA to GPU - * Run kernels + * Launch kernels * Transfer the number of output elements to CPU * Transfer the output data from GPU to CPU SoA - * Convert the output SoA to legacy GPU data formats -3. Within `acquire()`/`produce()`, the CUDA device is set implicitly - and the CUDA stream is provided by the system (with + * Convert the output SoA to legacy CPU data formats +3. Within `acquire()`/`produce()`, the current CUDA device is set + implicitly and the CUDA stream is provided by the system (with `CUDAScopedContext`) * It is strongly recommended to use the provided CUDA stream for all operations * If that is not feasible for some reason, the provided CUDA @@ -82,8 +98,8 @@ This page documents the CUDA integration within CMSSW class IsolatedProducerCUDA: public edm::stream::EDProducer { public: ... 
- void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; ... private: ... @@ -92,7 +108,7 @@ private: edm::EDPutTokenT outputToken_; }; ... -void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { +void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { // Sets the current device and creates a CUDA stream CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; @@ -107,7 +123,7 @@ void IsolatedProducerCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iS } // Called after the asynchronous work has finished -void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { +void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // Real life is likely more complex than this simple example. Here // getResult() returns some data in CPU memory that is passed // directly to the OutputData constructor. @@ -121,8 +137,8 @@ void IsolatedProducerCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) class ProducerOutputCUDA: public edm::stream::EDProducer { public: ... - void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; ... private: ... 
@@ -132,7 +148,7 @@ private: CUDAContextToken ctxTmp_; }; ... -void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { +void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) { // Sets the current device and creates a CUDA stream CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder)}; @@ -151,7 +167,7 @@ void ProducerOutputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSet } // Called after the asynchronous work has finished -void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { +void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup const& iSetup) { // Sets again the current device, uses the CUDA stream created in the acquire() CUDAScopedContext ctx{std::move(ctxTmp_)}; @@ -170,8 +186,8 @@ void ProducerOutputCUDA::produce(edm::Event& iEvent, edm::EventSetup& iSetup) { class ProducerInputCUDA: public edm::stream::EDProducer { public: ... - void acquire(edm::Event const& iEvent, edm::EventSetup& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; - void produce(edm::Event& iEvent, edm::EventSetup& iSetup) override; + void acquire(edm::Event const& iEvent, edm::EventSetup const& iSetup, edm::WaitingTaskWithArenaHolder waitingTaskHolder) override; + void produce(edm::Event& iEvent, edm::EventSetup const& iSetup) override; ... private: ... @@ -187,7 +203,7 @@ void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetu // InputData, and also use the same CUDA stream CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - // Alternatively, if e.g. there is another module queuing // + // Alternatively, if e.g. 
there is another module queuing // independent work to the CUDA stream, a new CUDA stream can also be // created here with CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder); @@ -319,69 +335,107 @@ void ProducerInputOutputCUDA::produce(edm::StreamID streamID, edm::Event& iEvent ### Configuration -```python -``` +#### GPU-only configuration +For a GPU-only configuration there is nothing special to be done, just +construct the Paths/Sequences/Tasks from the GPU modules. -################################################## +#### Automatic switching between CPU and GPU modules -## Choosing device +The `SwitchProducer` mechanism can be used to switch automatically +between CPU and GPU modules based on the availability of GPUs on the +machine where the configuration is done. Framework decides at the +beginning of the job which of the modules to run for a given module +label. -### GPU and CPU +Framework requires that the modules in the switch must produce the +same types of output products (the closer the actual results are the +better, but the framework can not enforce that). This means that for a +chain of GPU modules, it is the module that transforms the SoA data +format back to the legacy data formats (possibly, but not necessarily, +transferring the SoA data from GPU to CPU) that should be switched +between the legacy CPU module. The rest of the GPU modules should be +placed to a `Task`, in which case framework runs them only if their +output is needed by another module. -Currently the device type choice (CPU vs. GPU) is done at the -configuration level with `cms.Modifier`. In the near future this will -be changed to a decision made at the beginning of the job with a -[`SwitchProducer`](https://github.com/cms-sw/cmssw/pull/25439). 
+```python +from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA +process.foo = SwitchProducerCUDA( + cpu = cms.EDProducer("FooProducer"), # legacy CPU + cuda = cms.EDProducer("FooProducerFromCUDA", src="fooCUDA") +) +process.fooCUDA = cms.EDProducer("FooProducerCUDA") -For multi-GPU setup the device is chosen in the first CUDA module in a -chain of modules by one of the constructors of -`CUDAScopedContext` -```cpp -auto ctx = CUDAScopedContext(iEvent.streamID()); +process.fooTaskCUDA = cms.Task(process.fooCUDA) +process.fooTask = cms.Task( + process.foo, + process.fooTaskCUDA +) ``` -As the choice is still the static EDM stream to device assignment, the -EDM stream ID is needed. The logic will likely evolve in the future. -### Always on GPU +For a more complete example, see [here](../CUDATest/test/testCUDASwitch_cfg.py). + -In case the chain of modules should always be run on a GPU, the -configuration should be built only with the GPU modules. -## Data model -The GPU data should be a class/struct containing smart pointer(s) to -device data (see [Memory allocation](#memory-allocation)). When -putting the data to event, the data is wrapped to `CUDA` template, -which holds -* the GPU data - * must be movable, but no other restrictions +## More details + +### Device choice + +As discussed above, with `SwitchProducer` the choice between CPU and +GPU modules is done at the beginning of the job. + +For multi-GPU setup the device is chosen in the first CUDA module in a +chain of modules by one of the constructors of `CUDAScopedContext` +```cpp +CUDAScopedContext ctx{iEvent.streamID()}; +``` +As the choice is still the static EDM stream to device assignment, the +EDM stream ID is needed. The logic will likely evolve in the future to +be more dynamic, and likely the device choice has to be made for the +full event. 
+ +### Data model + +The "GPU data product" should be a class/struct containing smart +pointer(s) to device data (see [Memory allocation](#memory-allocation)). +When putting the data to event, the data is wrapped to +`CUDAProduct` template, which holds +* the GPU data product + * must be moveable, but no other restrictions * the current device where the data was produced, and the CUDA stream the data was produced with * [CUDA event for synchronization between multiple CUDA streams](#synchronizing-between-cuda-streams) -Note that the `CUDA` wrapper can be constructed only with +Note that the `CUDAProduct` wrapper can be constructed only with `CUDAScopedContext::wrap()`, and the data `T` can be obtained from it only with `CUDAScopedContext::get()`, as described further below. When putting the data product directly to `edm::Event`, also `CUDASCopedContext::emplace()` can be used. -## CUDA EDProducer +The GPU data products that depend on the CUDA runtime should be placed +under `CUDADataFormats` package, using the same name for sub-package +that would be used in `DataFormats`. Everything else, e.g. SoA for +CPU, should go under `DataFormats` as usual. + + +### CUDA EDProducer -### Class declaration +#### Class declaration -The CUDA producers are normal EDProducers. Contrary to -`HeterogeneousEDProducer`, the `ExternalWork` extension is **not** -required. Its use is recommended though when transferring data from -GPU to CPU. +The CUDA producers are normal EDProducers. The `ExternalWork` +extension should be used if a synchronization between the GPU and CPU +is needed, e.g. when transferring data from GPU to CPU. 
-### Memory allocation +#### Memory allocation + +##### Caching allocator The memory allocations should be done dynamically with `CUDAService` ```cpp edm::Service cs; -edm::cuda::device::unique_ptr device_buffer = cs->make_device_unique(50, cudaStream); -edm::cuda::host::unique_ptr host_buffer = cs->make_host_unique(50, cudaStream); +cudautils::device::unique_ptr device_buffer = cs->make_device_unique(50, cudaStream); +cudautils::host::unique_ptr host_buffer = cs->make_host_unique(50, cudaStream); ``` in the `acquire()` and `produce()` functions. The same @@ -393,47 +447,56 @@ guaranteed to be reserved * for the host: up to the destructor of the `unique_ptr` * for the device: until all work queued in the `cudaStream` up to the point when the `unique_ptr` destructor is called has finished -### Setting the current device +##### CUDA API + +The `cudaMalloc()` etc may be used outside of the event loop, but that +should be limited to only relatively small allocations in order to +allow as much re-use of device memory as possible. + +If really needed, the `cudaMalloc()` etc may be used also within the +event loop, but then the cost of allocation and implicit +synchronization should be explicitly amortized e.g. by caching. + +#### Setting the current device A CUDA producer should construct `CUDAScopedContext` in `acquire()` -either with `edm::StreamID`, or with a `CUDA` read as an input. +(`produce()` if not using `ExternalWork`) either with `edm::StreamID`, +or with a `CUDAProduct` read as an input. -A CUDA producer should read either `CUDAToken` (from -`CUDADeviceChooser`) or one or more `CUDA` products. 
Then, in the -`acquire()`/`produce()`, it should construct `CUDAScopedContext` from -one of them ```cpp // From edm::StreamID -auto ctx = CUDAScopedContext(iEvent.streamID()); +CUDAScopedContext ctx{iEvent.streamID()}; -/// From CUDA -edm::Handle> handle; -iEvent.getByToken(srctoken_, handle); -auto ctx = CUDAScopedContext(*handle); +// From CUDAProduct +CUDAProduct cclus = iEvent.get(srcToken_); +CUDAScopedContext ctx{cclus}; ``` `CUDAScopedContext` works in the RAII way and does the following * Sets the current device for the current scope - - If constructed from the `edm::StreamID`, makes the device choice and creates a new CUDA stream - - If constructed from the `CUDA`, uses the same device and CUDA stream as was used to produce the `CUDA` + - If constructed from the `edm::StreamID`, chooses the device and creates a new CUDA stream + - If constructed from the `CUDAProduct`, uses the same device and CUDA stream as was used to produce the `CUDAProduct` * Gives access to the CUDA stream the algorithm should use to queue asynchronous work * Calls `edm::WaitingTaskWithArenaHolder::doneWaiting()` when necessary * [Synchronizes between CUDA streams if necessary](#synchronizing-between-cuda-streams) -* Needed to get/put `CUDA` from/to the event +* Needed to get/put `CUDAProduct` from/to the event In case of multiple input products, from possibly different CUDA streams and/or CUDA devices, this approach gives the developer full control in which of them the kernels of the algorithm should be run. -### Getting input +#### Getting input -The real product (`T`) can be obtained from `CUDA` only with the -help of `CUDAScopedContext`. +The real product (`T`) can be obtained from `CUDAProduct` only with +the help of `CUDAScopedContext`. 
```cpp -edm::Handle> hclus; -iEvent.getByToken(srctoken_, hclus); -GPUClusters const& clus = ctx.get(*hclus); +// From CUDAProduct +CUDAProduct cclus = iEvent.get(srcToken_); +GPUClusters const& clus = ctx.get(cclus); + +// Directly from Event +GPUClusters const& clus = ctx.get(iEvent, srcToken_); ``` This step is needed to @@ -441,18 +504,25 @@ This step is needed to * if not, throw an exception (with unified memory could prefetch instead) * if the CUDA streams are different, synchronize between them -### Calling the CUDA kernels +#### Calling the CUDA kernels -There is nothing special, except the CUDA stream should be obtained from -the `CUDAScopedContext` +It is usually best to wrap the CUDA kernel calls to a separate class, +and then call methods of that class from the EDProducer. The only +requirement is that the CUDA stream where to queue the operations +should be the one from the `CUDAScopedContext` ```cpp gpuAlgo.makeClustersAsync(..., ctx.stream()); ``` -### Putting output +If necessary, different CUDA streams may be used internally, but they +should to be made to synchronize with the provided CUDA stream with +CUDA events and `cudaStreamWaitEvent()`. + + +#### Putting output -The GPU data needs to be wrapped to `CUDA` template with +The GPU data needs to be wrapped to `CUDAProduct` template with `CUDAScopedContext::wrap()` or `CUDAScopedContext::emplace()` ```cpp @@ -465,34 +535,34 @@ iEvent.put(ctx.wrap(gpuAlgo.makeClustersAsync(ctx.stream()))); // or avoid one unique_ptr with emplace edm::PutTokenT> putToken_ = produces>(); // in constructor +... ctx.emplace(iEvent, putToken_, gpuAlgo.makeClustersAsync(ctx.stream())); ``` This step is needed to -* store the current device and CUDA stream into `CUDA` +* store the current device and CUDA stream into `CUDAProduct` * record the CUDA event needed for CUDA stream synchronization -### `ExternalWork` extension +#### `ExternalWork` extension Everything above works both with and without `ExternalWork`. 
Without `ExternalWork` the `EDProducer`s act similar to TBB flowgraph's "streaming node". In other words, they just queue more -asynchronous work in their `produce()`. +asynchronous work to the CUDA stream in their `produce()`. The `ExternalWork` is needed when one would otherwise call `cudeStreamSynchronize()`. For example transferring something to CPU needed for downstream DQM, or queueing more asynchronous work. With `ExternalWork` an `acquire()` method needs to be implemented that gets an `edm::WaitingTaskWithArenaHolder` parameter. The -`WaitingTaskWithArenaHolder` should then be passed to the constructor -of `CUDAScopedContext` along +`edm::WaitingTaskWithArenaHolder` should then be passed to the +constructor of `CUDAScopedContext` along ```cpp void acquire(..., edm::WaitingTaskWithArenaHolder waitingTaskHolder) { - edm::Handle> handle; - iEvent.getByToken(token_, handle); - auto ctx = CUDAScopedContext(*handle, std::move(waitingTaskHolder)); // can also copy instead of move if waitingTaskHolder is needed for something else as well + CUDAProduct const& cclus = iEvent.get(token_); + CUDAScopedContext ctx{cclus, std::move(waitingTaskHolder)}; // can also copy instead of move if waitingTaskHolder is needed for something else as well ... ``` @@ -517,14 +587,14 @@ void acquire(...) { void produce(...( { ... - auto ctx = CUDAScopedContext(std::move(ctxTmp_)); + CUDAScopedContext ctx{std::move(ctxTmp_)}; } ``` Ideas for improvements are welcome. -### Transferring GPU data to CPU +#### Transferring GPU data to CPU The GPU->CPU data transfer needs synchronization to ensure the CPU memory to have all data before putting that to the event. 
This means @@ -537,65 +607,20 @@ the `ExternalWork` needs to be used along * Reformat data back to legacy data formats * Note: `CUDAScopedContext` is **not** needed in `produce()` -### Synchronizing between CUDA streams +#### Synchronizing between CUDA streams In case the producer needs input data that were produced in two (or -more) CUDA streams, these streams have to be synchronized (since CMSSW -framework no longer guarantees the synchronization as was the case -with `HeterogeneousEDProducer`). Here this synchronization is achieved -with CUDA events. +more) CUDA streams, these streams have to be synchronized. Here this +synchronization is achieved with CUDA events. -Each `CUDA` constains also a CUDA event object. The call to +Each `CUDAProduct` constains also a CUDA event object. The call to `CUDAScopedContext::wrap()` will *record* the event in the CUDA stream. This means that when all work queued to the CUDA stream up to that point has been finished, the CUDA event becomes *occurred*. Then, in -`CUDAScopedContext::get()`, if the `CUDA` to get from has a +`CUDAScopedContext::get()`, if the `CUDAProduct` to get from has a different CUDA stream than the `CUDAScopedContext`, `cudaStreamWaitEvent(stream, event)` is called. This means that all subsequent work queued to the CUDA stream will wait for the CUDA event to become occurred. Therefore this subsequent work can assume that the to-be-getted CUDA product exists. -## Configuration - -### With `cms.Modifier` - -```python -process.foo = cms.EDProducer("FooProducer") # legacy CPU - -from Configuration.ProcessModifiers.gpu_cff import gpu -process.fooCUDA = cms.EDProducer("FooProducerCUDA") -gpu.toReplaceWith(process.foo, cms.EDProducer("FooProducerFromCUDA", src="fooCUDA")) - -process.fooTaskCUDA = cms.Task(process.fooCUDA) -process.fooTask = cms.Task( - process.foo, - process.fooTaskCUDA -) -``` - -For a more complete example, see [here](../CUDATests/test/testCUDA_cfg.py). 
- -### With `SwitchProducer` - -```python -from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA -process.foo = SwitchProducerCUDA( - cpu = cms.EDProducer("FooProducer"), # legacy CPU - cuda = cms.EDProducer("FooProducerFromCUDA", src="fooCUDA") -) -process.fooCUDA = cms.EDProducer("FooProducerCUDA") - -process.fooTaskCUDA = cms.Task(process.fooCUDA) -process.fooTask = cms.Task( - process.foo, - process.fooTaskCUDA -) -``` - -# Extension to other devices - -The C++ side extends in a straightforward way. One has to add classes -similar to `CUDAToken`, `CUDA`, and `CUDAScopedContext`. Of course, -much depends on the exact details. The python configuration side -extends as well. From f0383bae71fd703acb4a20912660c545b5bb6df4 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 11 Mar 2019 17:30:37 +0100 Subject: [PATCH 47/49] Improve README --- HeterogeneousCore/CUDACore/README.md | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md index 9cf36521e44cb..a2bdb67cfba11 100644 --- a/HeterogeneousCore/CUDACore/README.md +++ b/HeterogeneousCore/CUDACore/README.md @@ -193,6 +193,7 @@ private: ... ProducerInputGPUAlgo gpuAlgo_; edm::EDGetTokenT> inputToken_; + edm::EDGetTokenT> otherInputToken_; edm::EDPutTokenT outputToken_; }; ... @@ -203,9 +204,10 @@ void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetu // InputData, and also use the same CUDA stream CUDAScopedContext ctx{inputDataWrapped, std::move(waitingTaskHolder)}; - // Alternatively, if e.g. there is another module queuing - // independent work to the CUDA stream, a new CUDA stream can also be - // created here with + // Alternatively a new CUDA stream can be created here. This is for + // a case where there are two (or more) consumers of + // CUDAProduct whose work is independent and thus can be run + // in parallel. 
CUDAScopedContext ctx{iEvent.streamID(), std::move(waitingTaskHolder); // Grab the real input data. Checks that the input data is on the @@ -214,9 +216,15 @@ void ProducerInputCUDA::acquire(edm::Event const& iEvent, edm::EventSetup& iSetu // synchronization point with CUDA event and cudaStreamWaitEvent() auto const& inputData = ctx.get(inputDataWrapped); + // Input data from another producer + auto const& otherInputData = ctx.get(iEvent.get(otherInputToken_)); + // or + auto const& otherInputData = ctx.get(iEvent, otherInputToken_); + + // Queues asynchronous data transfers and kernels to the CUDA stream // returned by CUDAScopedContext::stream() - gpuAlgo.makeAsync(inputData, ctx.stream()); + gpuAlgo.makeAsync(inputData, otherInputData, ctx.stream()); // Destructor of ctx queues a callback to the CUDA stream notifying // waitingTaskHolder when the queued asynchronous work has finished @@ -362,7 +370,9 @@ output is needed by another module. from HeterogeneousCore.CUDACore.SwitchProducerCUDA import SwitchProducerCUDA process.foo = SwitchProducerCUDA( cpu = cms.EDProducer("FooProducer"), # legacy CPU - cuda = cms.EDProducer("FooProducerFromCUDA", src="fooCUDA") + cuda = cms.EDProducer("FooProducerFromCUDA", + src="fooCUDA" + ) ) process.fooCUDA = cms.EDProducer("FooProducerCUDA") From 0c4d2db173037624cda43abae1c250738938c6dc Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Mon, 11 Mar 2019 20:24:02 +0100 Subject: [PATCH 48/49] Remove references to CUDAService.numberOfStreamsPerDevice --- HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py | 3 --- HeterogeneousCore/CUDATest/test/testCUDA_cfg.py | 3 --- 2 files changed, 6 deletions(-) diff --git a/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py index 6fb75d96ddcc0..8bac73608065d 100644 --- a/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py +++ b/HeterogeneousCore/CUDATest/test/testCUDASwitch_cfg.py @@ -90,6 +90,3 @@ process.p = cms.Path() 
process.p.associate(process.t) process.ep = cms.EndPath(process.out) - -# Example of limiting the number of EDM streams per device -#process.CUDAService.numberOfStreamsPerDevice = 1 diff --git a/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py index 626eef7b207e1..5cb6c678402b4 100644 --- a/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py +++ b/HeterogeneousCore/CUDATest/test/testCUDA_cfg.py @@ -77,6 +77,3 @@ process.p = cms.Path() process.p.associate(process.t) process.ep = cms.EndPath(process.out) - -# Example of limiting the number of EDM streams per device -#process.CUDAService.numberOfStreamsPerDevice = 1 From dd27ab8df5dff1fda6cc8e4f230cd8865eecded5 Mon Sep 17 00:00:00 2001 From: Matti Kortelainen Date: Fri, 8 Mar 2019 23:35:29 +0100 Subject: [PATCH 49/49] Recycle CUDA events via a cache in CUDAService --- CUDADataFormats/Common/BuildFile.xml | 2 ++ .../Common/interface/CUDAProductBase.h | 6 ++---- CUDADataFormats/Common/src/CUDAProductBase.cc | 9 ++++++--- .../CUDAServices/interface/CUDAService.h | 8 ++++++++ .../CUDAServices/src/CUDAService.cc | 19 +++++++++++++++++++ 5 files changed, 37 insertions(+), 7 deletions(-) diff --git a/CUDADataFormats/Common/BuildFile.xml b/CUDADataFormats/Common/BuildFile.xml index 060edc3875ac1..1046b76eef0f7 100644 --- a/CUDADataFormats/Common/BuildFile.xml +++ b/CUDADataFormats/Common/BuildFile.xml @@ -1,4 +1,6 @@ + + diff --git a/CUDADataFormats/Common/interface/CUDAProductBase.h b/CUDADataFormats/Common/interface/CUDAProductBase.h index 61baaea6111d6..eb6fdae0e5abf 100644 --- a/CUDADataFormats/Common/interface/CUDAProductBase.h +++ b/CUDADataFormats/Common/interface/CUDAProductBase.h @@ -31,10 +31,8 @@ class CUDAProductBase { // The cuda::stream_t is really shared among edm::Event products, so // using shared_ptr also here std::shared_ptr> stream_; //! - // Using unique_ptr to support the default constructor. 
Tried - // std::optional, but cuda::event_t has its move assignment - // operators deleted. - std::unique_ptr event_; //! + // shared_ptr because of caching in CUDAService + std::shared_ptr event_; //! int device_ = -1; //! }; diff --git a/CUDADataFormats/Common/src/CUDAProductBase.cc b/CUDADataFormats/Common/src/CUDAProductBase.cc index b09a1d0445c81..c034b4f7295f8 100644 --- a/CUDADataFormats/Common/src/CUDAProductBase.cc +++ b/CUDADataFormats/Common/src/CUDAProductBase.cc @@ -1,12 +1,15 @@ #include "CUDADataFormats/Common/interface/CUDAProductBase.h" +#include "FWCore/ServiceRegistry/interface/Service.h" +#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" + CUDAProductBase::CUDAProductBase(int device, std::shared_ptr> stream): stream_(std::move(stream)), - event_(std::make_unique(cuda::event::create(device, - cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much? - cuda::event::dont_record_timings))), // it should be a bit faster to ignore timings device_(device) { + edm::Service cs; + event_ = cs->getCUDAEvent(); + // Record CUDA event to the CUDA stream. The event will become // "occurred" after all work queued to the stream before this // point has been finished. diff --git a/HeterogeneousCore/CUDAServices/interface/CUDAService.h b/HeterogeneousCore/CUDAServices/interface/CUDAService.h index 22e03a61f37d2..a7c416c17ed63 100644 --- a/HeterogeneousCore/CUDAServices/interface/CUDAService.h +++ b/HeterogeneousCore/CUDAServices/interface/CUDAService.h @@ -136,6 +136,10 @@ class CUDAService { // will be returned to the cache by the shared_ptr destructor. std::shared_ptr> getCUDAStream(); + // Gets a (cached) CUDA event for the current device. The event + // will be returned to the cache by the shared_ptr destructor. 
+ std::shared_ptr getCUDAEvent(); + private: // PIMPL to hide details of allocator struct Allocator; @@ -147,6 +151,10 @@ class CUDAService { struct CUDAStreamCache; std::unique_ptr cudaStreamCache_; + // PIMPL to hide details of the CUDA event cache + struct CUDAEventCache; + std::unique_ptr cudaEventCache_; + int numberOfDevices_ = 0; unsigned int numberOfStreamsTotal_ = 0; std::vector> computeCapabilities_; diff --git a/HeterogeneousCore/CUDAServices/src/CUDAService.cc b/HeterogeneousCore/CUDAServices/src/CUDAService.cc index 1902f3ff843a2..e776c349f2e6c 100644 --- a/HeterogeneousCore/CUDAServices/src/CUDAService.cc +++ b/HeterogeneousCore/CUDAServices/src/CUDAService.cc @@ -338,6 +338,7 @@ CUDAService::CUDAService(edm::ParameterSet const& config, edm::ActivityRegistry& } cudaStreamCache_ = std::make_unique(numberOfDevices_); + cudaEventCache_ = std::make_unique(numberOfDevices_); log << "\n"; @@ -355,6 +356,7 @@ CUDAService::~CUDAService() { if(allocator_) { allocator_.reset(); } + cudaEventCache_.reset(); cudaStreamCache_.reset(); for (int i = 0; i < numberOfDevices_; ++i) { @@ -512,3 +514,20 @@ std::shared_ptr> CUDAService::getCUDAStream() { return std::make_unique>(current_device.create_stream(cuda::stream::implicitly_synchronizes_with_default_stream)); }); } + +// CUDA event cache +struct CUDAService::CUDAEventCache { + explicit CUDAEventCache(int ndev): cache(ndev) {} + + // Separate caches for each device for fast lookup + std::vector> cache; +}; + +std::shared_ptr CUDAService::getCUDAEvent() { + return cudaEventCache_->cache[getCurrentDevice()].makeOrGet([](){ + auto current_device = cuda::device::current::get(); + // We should not return a recorded, but not-yet-occurred event + return std::make_unique(current_device.create_event(cuda::event::sync_by_busy_waiting, // default; we should try to avoid explicit synchronization, so maybe the value doesn't matter much? + cuda::event::dont_record_timings)); // it should be a bit faster to ignore timings + }); +}