From 02a882978eeaa7c00707742c3030ff15823b48a9 Mon Sep 17 00:00:00 2001
From: Matti Kortelainen <matti.kortelainen@cern.ch>
Date: Fri, 3 Aug 2018 15:21:18 +0200
Subject: [PATCH] Merge CUDADeviceChooser and CUDADeviceFilter to
 CUDADeviceChooserFilter, add CUDADeviceChooserProducer

Developments

Fix unit test

Unit test for CUDADeviceChooserProducer
---
 HeterogeneousCore/CUDACore/README.md          |  50 +++++----
 .../CUDACore/plugins/CUDADeviceChooser.cc     |  89 ---------------
 .../plugins/CUDADeviceChooserFilter.cc        |  76 +++++++++++++
 .../plugins/CUDADeviceChooserProducer.cc      |  68 ++++++++++++
 .../CUDACore/plugins/CUDADeviceFilter.cc      |  40 -------
 .../CUDACore/plugins/chooseCUDADevice.cc      |  22 ++++
 .../CUDACore/plugins/chooseCUDADevice.h       |  10 ++
 .../CUDACore/test/testCUDA_cfg.py             |  16 +--
 ...ser.cc => test_CUDADeviceChooserFilter.cc} |  14 +--
 .../test/test_CUDADeviceChooserProducer.cc    | 103 ++++++++++++++++++
 10 files changed, 318 insertions(+), 170 deletions(-)
 delete mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc
 create mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc
 create mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc
 delete mode 100644 HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc
 create mode 100644 HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.cc
 create mode 100644 HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.h
 rename HeterogeneousCore/CUDACore/test/{test_CUDADeviceChooser.cc => test_CUDADeviceChooserFilter.cc} (84%)
 create mode 100644 HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc

diff --git a/HeterogeneousCore/CUDACore/README.md b/HeterogeneousCore/CUDACore/README.md
index 05e524a08a87f..28e24f648e7eb 100644
--- a/HeterogeneousCore/CUDACore/README.md
+++ b/HeterogeneousCore/CUDACore/README.md
@@ -21,38 +21,42 @@ deployed and `HeterogeneousEDProducer` retired.
 
 ## Choosing device
 
-The device choosing logic is split to an EDProducer, an EDFilter, and
-use of Paths in the configuration.
-
-First, a `CUDADeviceChooser` EDProducer is run. It has the logic to
-device whether the following chain of EDModules should run on a CUDA
-device or not, and if yes, on which CUDA device. If it decides "yes",
-it produces a `CUDAToken`, which contains the device id and a CUDA
-stream. If it decides "no", it does not produce anything.
-
-Next step is a `CUDADeviceFilter` EDFilter. It checks whether the
-`CUDADeviceChooser` produced a product or not. If "yes", it returns
-`true`, and if "no", it returns `false`.
-
-Finally, the pieces need to be put together in the configuration. The
-`CUDADeviceChooser` can be "anywhere", but the `CUDADeviceFilter`
-should be the first module on a `cms.Path`, followed by the CUDA
-EDProducers (in the future it may become sufficient to have only the
-first EDProducer of a chain in the `Path`).
+### Dynamically between GPU and CPU
+
+The device choosing (CPU vs. GPU, which GPU) logic is done by an
+EDFilter and using Paths in the configuration.
+
+First, a `CUDADeviceChooserFilter` EDFilter is run. It has the logic
+to decide whether the following chain of EDModules should run on a
+CUDA device or not, and if yes, on which CUDA device. If it decides
+"yes", it returns `true` and produces a `CUDAToken`, which contains
+the device id and a CUDA stream. If it decides "no", it returns
+`false` and does not produce anything.
+
+Then, the pieces need to be put together in the configuration. The
+`CUDADeviceChooserFilter` should be put as the first module on a
+`cms.Path`, followed by the CUDA EDProducers (in the future it may
+become sufficient to have only the first EDProducer of a chain in the
+`Path`).
 ```python
-process.fooCUDADevice = cms.EDProducer("CUDADeviceChooser")
-process.fooCUDADeviceFilter = cms.EDFilter("CUDADeviceFilter",
+process.fooCUDADeviceFilter = cms.EDFilter("CUDADeviceChooserFilter",
     src = cms.InputTag("fooCUDADevice")
 )
 process.fooCUDA = cms.EDProducer("FooProducerCUDA")
 process.fooPathCUDA = cms.Path(
     process.fooCUDADeviceFilter + process.fooCUDA
 )
-process.fooTask = cms.Task(
-    process.fooDevice
-)
 ```
 
+### Always on GPU
+
+In case the chain of modules should always be run on a GPU, the
+EDFilter and Paths are not needed. In this case, a
+`CUDADeviceChooserProducer` should be used to produce the `CUDAToken`.
+If the machine has no GPUs or `CUDAService` is disabled, the producer
+throws an exception.
+
+
 ## Data model
 
 The GPU data can be a single pointer to device data, or a class/struct
diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc
deleted file mode 100644
index 58411f749d60e..0000000000000
--- a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooser.cc
+++ /dev/null
@@ -1,89 +0,0 @@
-#include "FWCore/Framework/interface/global/EDProducer.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-#include "FWCore/ServiceRegistry/interface/Service.h"
-#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h"
-#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
-
-#include <cuda/api_wrappers.h>
-
-#include <memory>
-
-namespace {
-  struct DeviceCache {
-    int device;
-    bool enabled;
-  };
-}
-
-class CUDADeviceChooser: public edm::global::EDProducer<edm::StreamCache<::DeviceCache> > {
-public:
-  explicit CUDADeviceChooser(const edm::ParameterSet& iConfig);
-  ~CUDADeviceChooser() override = default;
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-  std::unique_ptr<::DeviceCache> beginStream(edm::StreamID id) const;
-
-  void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const;
-
-private:
-  bool enabled_;
-};
-
-CUDADeviceChooser::CUDADeviceChooser(const edm::ParameterSet& iConfig):
-  enabled_(iConfig.getParameter<bool>("enabled"))
-{
-  produces<CUDAToken>();
-}
-
-void CUDADeviceChooser::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  desc.add<bool>("enabled", true)->setComment("This parameter is intended for debugging purposes only. If disabling some CUDA chains is needed for production, it is better to remove the CUDA modules altogether from the configuration.");
-  descriptions.addWithDefaultLabel(desc);
-  descriptions.setComment("This EDProducer chooses whether a chain of CUDA EDModules depending on it should run or not. The decision is communicated downstream by the existence of a 'CUDAToken' event product. Intended to be used with CUDADeviceFilter.");
-}
-
-std::unique_ptr<::DeviceCache> CUDADeviceChooser::beginStream(edm::StreamID id) const {
-  auto ret = std::make_unique<::DeviceCache>();
-
-  edm::Service<CUDAService> cudaService;
-  ret->enabled = (enabled_ && cudaService->enabled(id));
-  if(!ret->enabled) {
-    return ret;
-  }
-
-  // For startes we "statically" assign the device based on
-  // edm::Stream number. This is suboptimal if the number of
-  // edm::Streams is not a multiple of the number of CUDA devices
-  // (and even then there is no load balancing).
-  //
-  // TODO: improve. Possible ideas include
-  // - allocate M (< N(edm::Streams)) buffers per device per "chain of modules", choose dynamically which (buffer, device) to use
-  // - our own CUDA memory allocator
-  //   * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
-  //   * would probably still need some buffer space/device to hold e.g. conditions data
-  //     - for conditions, how to handle multiple lumis per job?
-  ret->device = id % cudaService->numberOfDevices();
-
-  LogDebug("CUDADeviceChooser") << "EDM stream " << id << " set to CUDA device " << ret->device;
-
-  return ret;
-}
-
-void CUDADeviceChooser::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
-  auto cache = streamCache(id);
-  if(!cache->enabled) {
-    return;
-  }
-
-  auto ret = std::make_unique<CUDAToken>(cache->device);
-  LogDebug("CUDADeviceChooser") << "EDM stream " << id << " CUDA device " << ret->device() << " with CUDA stream " << ret->stream().id();
-  iEvent.put(std::move(ret));
-}
-
-
-DEFINE_FWK_MODULE(CUDADeviceChooser);
diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc
new file mode 100644
index 0000000000000..15216edb020c5
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserFilter.cc
@@ -0,0 +1,76 @@
+#include "FWCore/Framework/interface/global/EDFilter.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "chooseCUDADevice.h"
+
+namespace {
+  struct DeviceCache {
+    int device;    // CUDA device index chosen for the EDM stream; assigned in beginStream() only when enabled
+    bool enabled;  // true when both the module's 'enabled' parameter and CUDAService allow CUDA on the stream
+  };
+}
+
+class CUDADeviceChooserFilter: public edm::global::EDFilter<edm::StreamCache<::DeviceCache>> {
+public:
+  explicit CUDADeviceChooserFilter(const edm::ParameterSet& iConfig);
+  ~CUDADeviceChooserFilter() override = default;
+  // Describes the module's configuration (the single boolean 'enabled' parameter).
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+  // Builds the per-stream cache: whether CUDA is usable on this EDM stream and, if so, the chosen device.
+  std::unique_ptr<::DeviceCache> beginStream(edm::StreamID id) const override;
+  // Puts a CUDAToken for the cached device and returns true; returns false without a product when disabled.
+  bool filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+
+private:
+  bool enabled_;  // value of the 'enabled' configuration parameter
+};
+
+CUDADeviceChooserFilter::CUDADeviceChooserFilter(const edm::ParameterSet& iConfig):
+  enabled_(iConfig.getParameter<bool>("enabled"))  // module-level switch read from the configuration
+{
+  produces<CUDAToken>();  // the token is only put into the event when filter() returns true
+}
+
+void CUDADeviceChooserFilter::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;  // 'enabled' (default: true) is the only parameter
+  desc.add<bool>("enabled", true)->setComment("This parameter is intended for debugging purposes only. If disabling some CUDA chains is needed for production, it is better to remove the CUDA modules altogether from the configuration.");
+  descriptions.addWithDefaultLabel(desc);
+  descriptions.setComment("This EDFilter chooses whether a chain of CUDA EDModules depending on it should run or not, and on which CUDA device they should run. The decision is communicated downstream with the filter decision. In addition, if the filter returns true, a 'CUDAToken' is produced into the event (for false nothing is produced).");
+}
+
+std::unique_ptr<::DeviceCache> CUDADeviceChooserFilter::beginStream(edm::StreamID id) const {
+  auto ret = std::make_unique<::DeviceCache>();
+  // CUDA is used on this stream only if both the module's 'enabled' parameter and CUDAService allow it.
+  edm::Service<CUDAService> cudaService;
+  ret->enabled = (enabled_ && cudaService->enabled(id));
+  if(!ret->enabled) {
+    return ret;  // disabled: 'device' keeps its value-initialized state and is not read by filter()
+  }
+  // Enabled: choose the device once here; filter() reuses it for every event on this stream.
+  ret->device = cudacore::chooseCUDADevice(id);
+
+  LogDebug("CUDADeviceChooserFilter") << "EDM stream " << id << " set to CUDA device " << ret->device;
+
+  return ret;
+}
+
+bool CUDADeviceChooserFilter::filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+  auto cache = streamCache(id);
+  if(!cache->enabled) {
+    return false;  // CUDA not usable on this stream: reject the event and put nothing
+  }
+  // CUDA usable: publish the cached device wrapped in a CUDAToken and accept the event.
+  auto ret = std::make_unique<CUDAToken>(cache->device);
+  LogDebug("CUDADeviceChooserFilter") << "EDM stream " << id << " CUDA device " << ret->device() << " with CUDA stream " << ret->stream().id();
+  iEvent.put(std::move(ret));
+  return true;
+}
+
+DEFINE_FWK_MODULE(CUDADeviceChooserFilter);
diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc
new file mode 100644
index 0000000000000..13a4d6b34e521
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/plugins/CUDADeviceChooserProducer.cc
@@ -0,0 +1,68 @@
+#include "FWCore/Framework/interface/global/EDProducer.h"
+#include "FWCore/Framework/interface/Event.h"
+#include "FWCore/Framework/interface/Frameworkfwd.h"
+#include "FWCore/Framework/interface/MakerMacros.h"
+#include "FWCore/ParameterSet/interface/ParameterSet.h"
+#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+#include "chooseCUDADevice.h"
+
+#include <memory>
+
+namespace {
+  struct DeviceCache {
+    int device;  // CUDA device index chosen for the EDM stream in beginStream()
+  };
+}
+
+class CUDADeviceChooserProducer: public edm::global::EDProducer<edm::StreamCache<::DeviceCache>> {
+public:
+  explicit CUDADeviceChooserProducer(const edm::ParameterSet& iConfig);
+  ~CUDADeviceChooserProducer() override = default;
+  // Describes the module's configuration (it has no parameters).
+  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
+  // Chooses the CUDA device for this EDM stream; throws if CUDAService has CUDA disabled for the stream.
+  std::unique_ptr<::DeviceCache> beginStream(edm::StreamID id) const override;
+  // Puts a CUDAToken for the device cached for this EDM stream into the event.
+  void produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
+};
+
+CUDADeviceChooserProducer::CUDADeviceChooserProducer(const edm::ParameterSet& iConfig) {
+  edm::Service<CUDAService> cudaService;
+  if(!cudaService->enabled()) {  // this module has no non-CUDA fallback, so fail already at construction
+    throw cms::Exception("Configuration") << "CUDAService is disabled so CUDADeviceChooserProducer is unable to make decisions on which CUDA device to run. If you need to run without CUDA devices, please use CUDADeviceChooserFilter for conditional execution, or remove all CUDA modules from your configuration.";
+  }
+  produces<CUDAToken>();
+}
+
+void CUDADeviceChooserProducer::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
+  edm::ParameterSetDescription desc;  // intentionally empty: the module has no configurable parameters
+  descriptions.addWithDefaultLabel(desc);
+  descriptions.setComment("This EDProducer chooses on which CUDA device the chain of CUDA EDModules depending on it should run. The decision is communicated downstream with the 'CUDAToken' event product. It is an error if there are no CUDA devices, or CUDAService is disabled.");
+}
+
+std::unique_ptr<::DeviceCache> CUDADeviceChooserProducer::beginStream(edm::StreamID id) const {
+  auto ret = std::make_unique<::DeviceCache>();
+  // Per-stream setup: CUDA must be enabled for this EDM stream (there is no fallback here), otherwise throw.
+  edm::Service<CUDAService> cudaService;
+  if(!cudaService->enabled(id)) {
+    throw cms::Exception("LogicError") << "CUDA is disabled for EDM stream " << id << " in CUDAService, so CUDADeviceChooserProducer is unable to decide the CUDA device for this EDM stream. If you need to dynamically decide whether a chain of CUDA EDModules is run or not, please use CUDADeviceChooserFilter instead.";
+  }
+  ret->device = cudacore::chooseCUDADevice(id);
+
+  LogDebug("CUDADeviceChooserProducer") << "EDM stream " << id << " set to CUDA device " << ret->device;
+
+  return ret;
+}
+
+void CUDADeviceChooserProducer::produce(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
+  auto ret = std::make_unique<CUDAToken>(streamCache(id)->device);  // device was chosen for this stream in beginStream()
+  LogDebug("CUDADeviceChooserProducer") << "EDM stream " << id << " CUDA device " << ret->device() << " with CUDA stream " << ret->stream().id();
+  iEvent.put(std::move(ret));
+}
+
+
+DEFINE_FWK_MODULE(CUDADeviceChooserProducer);
diff --git a/HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc b/HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc
deleted file mode 100644
index cf4a8093a94bd..0000000000000
--- a/HeterogeneousCore/CUDACore/plugins/CUDADeviceFilter.cc
+++ /dev/null
@@ -1,40 +0,0 @@
-#include "FWCore/Framework/interface/global/EDFilter.h"
-#include "FWCore/Framework/interface/Event.h"
-#include "FWCore/Framework/interface/Frameworkfwd.h"
-#include "FWCore/Framework/interface/MakerMacros.h"
-#include "FWCore/ParameterSet/interface/ParameterSet.h"
-#include "FWCore/ParameterSet/interface/ParameterSetDescription.h"
-
-#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h"
-
-class CUDADeviceFilter: public edm::global::EDFilter<> {
-public:
-  explicit CUDADeviceFilter(const edm::ParameterSet& iConfig);
-  ~CUDADeviceFilter() override = default;
-
-  static void fillDescriptions(edm::ConfigurationDescriptions& descriptions);
-
-  bool filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const override;
-
-private:
-  edm::EDGetTokenT<CUDAToken> token_;
-};
-
-CUDADeviceFilter::CUDADeviceFilter(const edm::ParameterSet& iConfig):
-  token_(consumes<CUDAToken>(iConfig.getParameter<edm::InputTag>("src")))
-{}
-
-void CUDADeviceFilter::fillDescriptions(edm::ConfigurationDescriptions& descriptions) {
-  edm::ParameterSetDescription desc;
-  desc.add<edm::InputTag>("src", edm::InputTag("cudaDeviceChooser"))->setComment("Source of the 'CUDAToken'.");
-  descriptions.addWithDefaultLabel(desc);
-  descriptions.setComment("This EDFilter filters based on the existence of a 'CUDAToken' event product. Intended to be used together with CUDADeviceChooser. Returns 'true' if the product exists, and 'false' if not.");
-}
-
-bool CUDADeviceFilter::filter(edm::StreamID id, edm::Event& iEvent, const edm::EventSetup& iSetup) const {
-  edm::Handle<CUDAToken> handle;
-  iEvent.getByToken(token_, handle);
-  return handle.isValid();
-}
-
-DEFINE_FWK_MODULE(CUDADeviceFilter);
diff --git a/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.cc b/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.cc
new file mode 100644
index 0000000000000..a54ac5ac54bf7
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.cc
@@ -0,0 +1,22 @@
+#include "chooseCUDADevice.h"
+#include "FWCore/ServiceRegistry/interface/Service.h"
+#include "HeterogeneousCore/CUDAServices/interface/CUDAService.h"
+
+namespace cudacore {
+  int chooseCUDADevice(edm::StreamID id) {
+    edm::Service<CUDAService> cudaService;
+    // Returns the CUDA device index for the given EDM stream: the stream id modulo the number of devices.
+    // For starters we "statically" assign the device based on
+    // edm::Stream number. This is suboptimal if the number of
+    // edm::Streams is not a multiple of the number of CUDA devices
+    // (and even then there is no load balancing).
+    //
+    // TODO: improve. Possible ideas include
+    // - allocate M (< N(edm::Streams)) buffers per device per "chain of modules", choose dynamically which (buffer, device) to use
+    // - our own CUDA memory allocator
+    //   * being able to cheaply allocate+deallocate scratch memory allows to make the execution fully dynamic e.g. based on current load
+    //   * would probably still need some buffer space/device to hold e.g. conditions data
+    //     - for conditions, how to handle multiple lumis per job?
+    return id % cudaService->numberOfDevices();  // NOTE(review): modulo by zero if numberOfDevices() is 0 - confirm callers guarantee > 0
+  }
+}
diff --git a/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.h b/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.h
new file mode 100644
index 0000000000000..bb09c302af7f5
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/plugins/chooseCUDADevice.h
@@ -0,0 +1,10 @@
+#ifndef HeterogeneousCore_CUDACore_chooseCUDADevice_h
+#define HeterogeneousCore_CUDACore_chooseCUDADevice_h
+
+#include "FWCore/Utilities/interface/StreamID.h"
+
+namespace cudacore {
+  int chooseCUDADevice(edm::StreamID id);  // returns the CUDA device index to use for the given EDM stream
+}
+
+#endif
diff --git a/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py b/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py
index 28f9f2539b854..6ea678259f245 100644
--- a/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py
+++ b/HeterogeneousCore/CUDACore/test/testCUDA_cfg.py
@@ -30,13 +30,9 @@
 process.prod4CPU = testCUDAProducerCPU.clone(src = "prod1CPU")
 process.prod5CPU = testCUDAProducerCPU.clone()
 
-# Module to decide whether the chain of CUDA modules are run
-from HeterogeneousCore.CUDACore.cudaDeviceChooser_cfi import cudaDeviceChooser
-process.prodCUDADevice = cudaDeviceChooser.clone()
-
-# Filter to disable a Path in case we don't run on CUDA
-from HeterogeneousCore.CUDACore.cudaDeviceFilter_cfi import cudaDeviceFilter
-process.prodCUDADeviceFilter = cudaDeviceFilter.clone(src = "prodCUDADevice")
+# Module to decide whether the chain of CUDA modules are run, and to disable a Path in case we don't run on CUDA
+from HeterogeneousCore.CUDACore.cudaDeviceChooserFilter_cfi import cudaDeviceChooserFilter
+process.prodCUDADeviceFilter = cudaDeviceChooserFilter.clone()
 
 from HeterogeneousCore.CUDACore.testCUDAProducerGPUFirst_cfi import testCUDAProducerGPUFirst
 from HeterogeneousCore.CUDACore.testCUDAProducerGPU_cfi import testCUDAProducerGPU
@@ -44,11 +40,11 @@
 from HeterogeneousCore.CUDACore.testCUDAProducerGPUtoCPU_cfi import testCUDAProducerGPUtoCPU
 
 # GPU producers
-process.prod1CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADevice")
+process.prod1CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADeviceFilter")
 process.prod2CUDA = testCUDAProducerGPU.clone(src = "prod1CUDA")
 process.prod3CUDA = testCUDAProducerGPU.clone(src = "prod2CUDA")
 process.prod4CUDA = testCUDAProducerGPUEW.clone(src = "prod1CUDA")
-process.prod5CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADevice")
+process.prod5CUDA = testCUDAProducerGPUFirst.clone(src = "prodCUDADeviceFilter")
 
 # Modules to copy data from GPU to CPU (as "on demand" as any other
 # EDProducer, i.e. according to consumes() and prefetching)
@@ -102,8 +98,6 @@
 )
 
 process.t = cms.Task(
-    process.prodCUDADevice,
-
     # Eventually the goal is to specify these as part of a Task,
     # but (at least) as long as the fallback mechanism is implemented
     # with an EDProducer, they must be in a Path.
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooser.cc b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserFilter.cc
similarity index 84%
rename from HeterogeneousCore/CUDACore/test/test_CUDADeviceChooser.cc
rename to HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserFilter.cc
index 40518eea7a330..09f419c33357e 100644
--- a/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooser.cc
+++ b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserFilter.cc
@@ -6,14 +6,14 @@
 
 #include <cuda_runtime_api.h>
 
-static constexpr auto s_tag = "[CUDADeviceChooser]";
+static constexpr auto s_tag = "[CUDADeviceChooserFilter]";
 
-TEST_CASE("Standard checks of CUDADeviceChooser", s_tag) {
+TEST_CASE("Standard checks of CUDADeviceChooserFilter", s_tag) {
   const std::string baseConfig{
 R"_(from FWCore.TestProcessor.TestProcess import *
 process = TestProcess()
 process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi")
-process.toTest = cms.EDProducer("CUDADeviceChooser")
+process.toTest = cms.EDFilter("CUDADeviceChooserFilter")
 process.moduleToTest(process.toTest)
 )_"
   };
@@ -49,12 +49,12 @@ process.moduleToTest(process.toTest)
 
 }
 
-TEST_CASE("CUDADeviceChooser enabled", s_tag) {
+TEST_CASE("CUDADeviceChooserFilter enabled", s_tag) {
   const std::string config{
 R"_(from FWCore.TestProcessor.TestProcess import *
 process = TestProcess()
 process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi")
-process.toTest = cms.EDProducer("CUDADeviceChooser")
+process.toTest = cms.EDFilter("CUDADeviceChooserFilter")
 process.moduleToTest(process.toTest)
 )_"
   };
@@ -77,12 +77,12 @@ process.moduleToTest(process.toTest)
   }
 }
 
-TEST_CASE("CUDADeviceChooser disabled", s_tag) {
+TEST_CASE("CUDADeviceChooserFilter disabled", s_tag) {
   const std::string config{
 R"_(from FWCore.TestProcessor.TestProcess import *
 process = TestProcess()
 process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi")
-process.toTest = cms.EDProducer("CUDADeviceChooser", enabled=cms.bool(False))
+process.toTest = cms.EDFilter("CUDADeviceChooserFilter", enabled=cms.bool(False))
 process.moduleToTest(process.toTest)
 )_"
   };
diff --git a/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc
new file mode 100644
index 0000000000000..f567838730c41
--- /dev/null
+++ b/HeterogeneousCore/CUDACore/test/test_CUDADeviceChooserProducer.cc
@@ -0,0 +1,103 @@
+#include "catch.hpp"
+#include "FWCore/TestProcessor/interface/TestProcessor.h"
+#include "FWCore/Utilities/interface/Exception.h"
+
+#include "HeterogeneousCore/CUDACore/interface/CUDAToken.h"
+
+#include <cuda_runtime_api.h>
+
+static constexpr auto s_tag = "[CUDADeviceChooserProducer]";  // Catch tag shared by all test cases in this file
+
+TEST_CASE("Standard checks of CUDADeviceChooserProducer", s_tag) {
+  const std::string baseConfig{
+R"_(from FWCore.TestProcessor.TestProcess import *
+process = TestProcess()
+process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi")
+process.toTest = cms.EDProducer("CUDADeviceChooserProducer")
+process.moduleToTest(process.toTest)
+)_"
+  };
+  // Skip (rather than fail) all sections when the CUDA runtime cannot enumerate devices.
+  int deviceCount = 0;
+  auto ret = cudaGetDeviceCount( &deviceCount );
+  if( ret != cudaSuccess ) {
+    WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
+         << ret << ") " << cudaGetErrorString( ret )
+         << ". Ignoring tests requiring device to be present.");
+    return;
+  }
+
+  edm::test::TestProcessor::Config config{ baseConfig };
+  SECTION("base configuration is OK") {
+    REQUIRE_NOTHROW(edm::test::TestProcessor(config));
+  }
+
+  SECTION("No event data") {
+    edm::test::TestProcessor tester(config);
+
+    REQUIRE_NOTHROW(tester.test());
+  }
+
+  SECTION("beginJob and endJob only") {
+    edm::test::TestProcessor tester(config);
+
+    REQUIRE_NOTHROW(tester.testBeginAndEndJobOnly());
+  }
+
+  SECTION("Run with no LuminosityBlocks") {
+    edm::test::TestProcessor tester(config);
+
+    REQUIRE_NOTHROW(tester.testRunWithNoLuminosityBlocks());
+  }
+
+  SECTION("LuminosityBlock with no Events") {
+    edm::test::TestProcessor tester(config);
+
+    REQUIRE_NOTHROW(tester.testLuminosityBlockWithNoEvents());
+  }
+
+}
+
+TEST_CASE("CUDAService enabled", s_tag) {
+  const std::string config{
+R"_(from FWCore.TestProcessor.TestProcess import *
+process = TestProcess()
+process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi")
+process.toTest = cms.EDProducer("CUDADeviceChooserProducer")
+process.moduleToTest(process.toTest)
+)_"
+  };
+  // Skip (rather than fail) when the CUDA runtime cannot enumerate devices.
+  int deviceCount = 0;
+  auto ret = cudaGetDeviceCount( &deviceCount );
+  if( ret != cudaSuccess ) {
+    WARN("Unable to query the CUDA capable devices from the CUDA runtime API: ("
+         << ret << ") " << cudaGetErrorString( ret )
+         << ". Ignoring tests requiring device to be present.");
+    return;
+  }
+
+  SECTION("CUDAToken") {
+    edm::test::TestProcessor tester{config};
+    auto event = tester.test();
+    // The produced CUDAToken must carry a non-negative device index and a non-null CUDA stream id.
+    REQUIRE(event.get<CUDAToken>()->device() >= 0);
+    REQUIRE(event.get<CUDAToken>()->stream().id() != nullptr);
+  }
+}
+
+TEST_CASE("CUDAService disabled", s_tag) {
+  const std::string config{
+R"_(from FWCore.TestProcessor.TestProcess import *
+process = TestProcess()
+process.load("HeterogeneousCore.CUDAServices.CUDAService_cfi")
+process.CUDAService.enabled = False
+process.toTest = cms.EDProducer("CUDADeviceChooserProducer")
+process.moduleToTest(process.toTest)
+)_"
+  };
+  // With CUDAService disabled in the configuration, setting up the TestProcessor is expected to throw.
+  SECTION("Construction") {
+    REQUIRE_THROWS_AS(edm::test::TestProcessor{config}, cms::Exception);
+  }
+}