Fix H2H copy in HW NVJPEG. (#4458)
* Fix H2H copy in HW NVJPEG.

Signed-off-by: Michal Zientkiewicz <[email protected]>
mzient authored and JanuszL committed Nov 22, 2022
1 parent 6ffd2e1 commit b0c2e72
Showing 1 changed file with 25 additions and 34 deletions.
dali/operators/decoder/nvjpeg/nvjpeg_decoder_decoupled_api.h (25 additions, 34 deletions)
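The bulk of the change is mechanical: raw cudaStream_t/cudaEvent_t members, created with cudaStreamCreateWithPriority/cudaEventCreate and destroyed by hand, are replaced with DALI's RAII wrappers. A minimal sketch of the pattern, assuming the semantics implied by the diff (CUDAStreamPool::instance().Get() returns a CUDAStreamLease that hands the stream back to the pool on destruction, CUDAEvent::Create() returns an owning event handle, and both convert implicitly to the raw CUDA handles; CUDA_CALL is DALI's error-checking macro used throughout this file):

    #include "dali/core/cuda_stream_pool.h"  // added by this commit
    #include "dali/core/cuda_event.h"        // added by this commit

    void Example() {
      // Borrow a stream from the process-wide pool; the lease returns it
      // to the pool automatically when it goes out of scope.
      dali::CUDAStreamLease stream = dali::CUDAStreamPool::instance().Get();

      // Owning event handle; no explicit cudaEventDestroy is needed.
      dali::CUDAEvent event = dali::CUDAEvent::Create();

      // Used exactly where cudaStream_t / cudaEvent_t appeared before.
      CUDA_CALL(cudaEventRecord(event, stream));
      CUDA_CALL(cudaEventSynchronize(event));
    }

With ownership expressed in the types, the matching cudaEventDestroy/cudaStreamDestroy calls disappear from the destructor further down.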
@@ -28,14 +28,16 @@
 #include "dali/operators/decoder/nvjpeg/nvjpeg2k_helper.h"
 #include "dali/operators/decoder/cache/cached_decoder_impl.h"
 #include "dali/core/mm/memory.h"
+#include "dali/core/cuda_stream_pool.h"
+#include "dali/core/cuda_event.h"
+#include "dali/core/dev_buffer.h"
+#include "dali/core/static_switch.h"
 #include "dali/util/image.h"
 #include "dali/util/ocv.h"
 #include "dali/util/nvml.h"
 #include "dali/image/image_factory.h"
 #include "dali/pipeline/util/thread_pool.h"
 #include "dali/core/device_guard.h"
-#include "dali/core/dev_buffer.h"
-#include "dali/core/static_switch.h"
 #include "dali/operators/decoder/nvjpeg/permute_layout.h"
 
 #define NVJPEG_FLAT_VERSION(major, minor, patch) ((major)*1000000+(minor)*1000+(patch))
@@ -224,19 +226,19 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
       CUDA_CALL(nvjpegBufferDeviceCreate(handle_, device_allocator_ptr, &buffer));
     }
     for (auto &stream : streams_) {
-      CUDA_CALL(cudaStreamCreateWithPriority(&stream, cudaStreamNonBlocking,
-                                             default_cuda_stream_priority_));
+      stream = CUDAStreamPool::instance().Get();
     }
-    CUDA_CALL(cudaStreamCreateWithPriority(
-        &hw_decode_stream_, cudaStreamNonBlocking, default_cuda_stream_priority_));
+    hw_decode_stream_ = CUDAStreamPool::instance().Get();
+
+    if (hw_decoder_images_staging_.is_pinned())
+      hw_decoder_images_staging_.set_order(hw_decode_stream_);
+
 
     for (auto &event : decode_events_) {
-      CUDA_CALL(cudaEventCreate(&event));
-      CUDA_CALL(cudaEventRecord(event, streams_[0]));
+      event = CUDAEvent::Create();
     }
 
-    CUDA_CALL(cudaEventCreate(&hw_decode_event_));
-    CUDA_CALL(cudaEventRecord(hw_decode_event_, hw_decode_stream_));
+    hw_decode_event_ = CUDAEvent::Create();
 
 #if NVJPEG2K_ENABLED
     auto nvjpeg2k_thread_id = nvjpeg2k_thread_.GetThreadIds()[0];
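One addition in the hunk above is new behavior rather than a mechanical swap: when the staging buffer is pinned, its order is tied to the HW decode stream. A hedged reading of why (general CUDA reasoning, not stated in the commit message): pinned host memory is the source of async copies ordered on hw_decode_stream_, so the buffer's lifecycle should follow that stream's order rather than host order:

    // Assumed semantics of set_order: from here on, reallocation and
    // release of hw_decoder_images_staging_ are ordered with respect to
    // hw_decode_stream_, so the memory cannot be freed while a copy
    // enqueued on that stream might still be reading it.
    if (hw_decoder_images_staging_.is_pinned())
      hw_decoder_images_staging_.set_order(hw_decode_stream_);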
@@ -269,10 +271,8 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
     });
     nvjpeg2k_thread_.RunAll();
 
-    CUDA_CALL(cudaStreamCreateWithPriority(&nvjpeg2k_cu_stream_, cudaStreamNonBlocking,
-                                           default_cuda_stream_priority_));
-    CUDA_CALL(cudaEventCreate(&nvjpeg2k_decode_event_));
-    CUDA_CALL(cudaEventRecord(nvjpeg2k_decode_event_, nvjpeg2k_cu_stream_));
+    nvjpeg2k_cu_stream_ = CUDAStreamPool::instance().Get();
+    nvjpeg2k_decode_event_ = CUDAEvent::Create();
 
     for (auto &stream : nvjpeg2k_streams_) {
       stream = NvJPEG2KStream::Create();
@@ -291,6 +291,10 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
 #if NVML_ENABLED
     nvml::Shutdown();
 #endif
+    if (hw_decode_stream_)
+      CUDA_CALL(cudaStreamSynchronize(hw_decode_stream_));
+
+    hw_decoder_images_staging_.Reset();
 
     sample_data_.clear();
 
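The destructor gains the mirror image of the setup above: drain the HW decode stream, then release the pinned staging memory explicitly, before the RAII members hand their streams back to the pool. A sketch of the assumed teardown ordering (rationale inferred from the diff):

    if (hw_decode_stream_)                                   // skip if no stream was acquired
      CUDA_CALL(cudaStreamSynchronize(hw_decode_stream_));   // 1. drain pending work
    hw_decoder_images_staging_.Reset();                      // 2. free the pinned staging buffer
    // 3. member destructors then return streams to the pool and destroy events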
@@ -311,15 +315,6 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
     for (auto &buffer : device_buffers_) {
       CUDA_CALL(nvjpegBufferDeviceDestroy(buffer));
     }
-    for (auto &event : decode_events_) {
-      CUDA_CALL(cudaEventDestroy(event));
-    }
-    CUDA_CALL(cudaEventDestroy(hw_decode_event_));
-
-    for (auto &stream : streams_) {
-      CUDA_CALL(cudaStreamDestroy(stream));
-    }
-    CUDA_CALL(cudaStreamDestroy(hw_decode_stream_));
 
     if (state_hw_batched_) {
       CUDA_CALL(nvjpegJpegStateDestroy(state_hw_batched_));
@@ -333,8 +328,6 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
     }
 
 #if NVJPEG2K_ENABLED
-    CUDA_CALL(cudaEventDestroy(nvjpeg2k_decode_event_));
-    CUDA_CALL(cudaStreamDestroy(nvjpeg2k_cu_stream_));
     for (auto thread_id : nvjpeg2k_thread_.GetThreadIds()) {
       nvjpeg_memory::DeleteAllBuffers(thread_id);
     }
@@ -893,9 +886,7 @@
         in_data_[k] = static_cast<unsigned char*>(tv.raw_mutable_tensor(k));
       }
     } else {
-      // it is H2H copy so the stream doesn't matter much as we don't use cudaMemcpy but
-      // maybe someday...
-      hw_decoder_images_staging_.Copy(tv, hw_decode_stream_);
+      hw_decoder_images_staging_.Copy(tv);
       for (size_t k = 0; k < samples_hw_batched_.size(); ++k) {
         in_data_[k] = hw_decoder_images_staging_.mutable_tensor<uint8_t>(k);
       }
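This hunk is the fix named in the commit title. The copy into the staging buffer is host-to-host (decoded CPU input into pinned host memory), so the stream argument was never meaningful for the copy itself; it is now dropped, and ordering with the HW decode stream is instead established once by the set_order call added in the constructor. A before/after sketch (rationale inferred from the removed comment and the commit message):

    // Before: an H2H copy issued with a device stream it did not use.
    hw_decoder_images_staging_.Copy(tv, hw_decode_stream_);

    // After: a plain host-to-host copy; the pinned buffer's order is
    // already tied to hw_decode_stream_, so later device-side consumers
    // of the staging memory stay correctly ordered.
    hw_decoder_images_staging_.Copy(tv);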
@@ -1087,8 +1078,8 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
   NvJPEG2KHandle nvjpeg2k_handle_{};
   NvJPEG2KDecodeState nvjpeg2k_decoder_{};
   DeviceBuffer<uint8_t> nvjpeg2k_intermediate_buffer_;
-  cudaStream_t nvjpeg2k_cu_stream_;
-  cudaEvent_t nvjpeg2k_decode_event_;
+  CUDAStreamLease nvjpeg2k_cu_stream_;
+  CUDAEvent nvjpeg2k_decode_event_;
   nvjpeg2kDeviceAllocator_t nvjpeg2k_dev_alloc_;
   nvjpeg2kPinnedAllocator_t nvjpeg2k_pin_alloc_;
@@ -1099,10 +1090,10 @@ class nvJPEGDecoder : public Operator<MixedBackend>, CachedDecoderImpl {
   // GPU
   // Per thread
   std::vector<nvjpegBufferDevice_t> device_buffers_;
-  std::vector<cudaStream_t> streams_;
-  cudaStream_t hw_decode_stream_;
-  std::vector<cudaEvent_t> decode_events_;
-  cudaEvent_t hw_decode_event_;
+  std::vector<CUDAStreamLease> streams_;
+  CUDAStreamLease hw_decode_stream_;
+  std::vector<CUDAEvent> decode_events_;
+  CUDAEvent hw_decode_event_;
   std::vector<int> thread_page_ids_;  // page index for double-buffering
 
   int device_id_;
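With the members converted to CUDAStreamLease and CUDAEvent, ownership is expressed in the types themselves: the explicit cudaEventDestroy/cudaStreamDestroy loops removed from the destructor above become unnecessary, and the member destructors return the pooled streams for reuse instead of destroying them.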