Skip to content

Commit

Permalink
Improve Yolo, mobilenet and shelfnet preprocessing using GPU #56
Browse files Browse the repository at this point in the history
Signed-off-by: Micaela Verucchi <[email protected]>
  • Loading branch information
mive93 committed Jan 27, 2022
1 parent c2825cc commit a0f54cd
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 76 deletions.
5 changes: 5 additions & 0 deletions include/tkDNN/DetectionNN.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef DETECTIONNN_H
#define DETECTIONNN_H

#include "kernels.h"

#include <iostream>
#include <signal.h>
#include <stdlib.h>
Expand All @@ -16,6 +18,7 @@
#include <opencv2/imgproc/imgproc.hpp>

#include "tkdnn.h"
#include "utilsNN.h"

//#define OPENCV_CUDACONTRIB //if OPENCV has been compiled with CUDA and contrib.

Expand All @@ -31,13 +34,15 @@ class DetectionNN {

protected:
tk::dnn::NetworkRT *netRT = nullptr;
uint8_t *frame_d = nullptr;
dnnType *input_d;

std::vector<cv::Size> originalSize;

cv::Scalar colors[256];

int nBatches = 1;
int frame_size = 0;

#ifdef OPENCV_CUDACONTRIB
cv::cuda::GpuMat bgr[3];
Expand Down
18 changes: 6 additions & 12 deletions include/tkDNN/SegmentationNN.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#ifndef SEGMENTATIONNN_H
#define SEGMENTATIONNN_H

#include "kernels.h"

#include <iostream>
#include <signal.h>
#include <stdlib.h>
Expand All @@ -16,14 +18,17 @@
#include "tkdnn.h"
#include "NetworkViz.h"
#include "kernelsThrust.h"
#include "utilsNN.h"

namespace tk { namespace dnn {

class SegmentationNN {

protected:
tk::dnn::NetworkRT *netRT = nullptr;
uint8_t *frame_d = nullptr;
int nBatches = 1;
int frame_size = 0;

std::vector<cv::Size> originalSize;
cv::Mat bgr[3];
Expand Down Expand Up @@ -76,18 +81,7 @@ class SegmentationNN {
cv::copyMakeBorder(frame, frame_cropped, top, bottom, left, right, cv::BORDER_CONSTANT, cv::Scalar(0,0,0) );

tk::dnn::dataDim_t idim = netRT->input_dim;

resize(frame_cropped, frame_cropped, cv::Size(idim.w, idim.h));

cv::split(frame_cropped, bgr);
for (int i = 0; i < idim.c; i++){
int idx = i * frame_cropped.rows * frame_cropped.cols;
int ch = idim.c-1 -i;
memcpy((void *)&input[idx + idim.tot()*bi], (void *)bgr[ch].data, frame_cropped.rows * frame_cropped.cols * sizeof(dnnType));
}

checkCuda(cudaMemcpyAsync(input_d+ idim.tot()*bi, input + idim.tot()*bi, idim.tot() * sizeof(dnnType), cudaMemcpyHostToDevice, netRT->stream));

resizeAndSplit(frame_cropped, &frame_d, frame_size, input_d, netRT, bi, true);
normalize(input_d + idim.tot()*bi, idim.c, idim.h, idim.w, mean_d, stddev_d);
}

Expand Down
34 changes: 2 additions & 32 deletions src/MobilenetDetection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -211,38 +211,8 @@ bool MobilenetDetection::init(const std::string& tensor_path, const int n_classe
}

// Prepares one input frame for batch slot `bi`: the result is written to
// input_d at offset netRT->input_dim.tot()*bi.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The file
// header reports "2 additions & 32 deletions", which matches the 32-line
// #ifdef/#else/#endif block below (the removed OpenCV-based implementation)
// and the two calls after #endif (the added replacement).
void MobilenetDetection::preprocess(cv::Mat &frame, const int bi){
#ifdef OPENCV_CUDACONTRIB
//move original image on GPU
cv::cuda::GpuMat orig_img, frame_nomean;
orig_img = cv::cuda::GpuMat(frame);

//resize image, remove mean, divide by std
cv::cuda::resize (orig_img, orig_img, cv::Size(netRT->input_dim.w, netRT->input_dim.h));
// Two-step conversion: first subtract 127 (alpha=1, beta=-127), then scale by 1/128.
orig_img.convertTo(frame_nomean, CV_32FC3, 1, -127);
frame_nomean.convertTo(imagePreproc, CV_32FC3, 1 / 128.0, 0);

//copy image into tensors
cv::cuda::split(imagePreproc, bgr);

// One plane per channel, copied in split order (bgr[i], no channel reversal)
// with a blocking device-to-device cudaMemcpy.
// NOTE(review): copies rows*cols floats in one shot, so it assumes each GpuMat
// plane is contiguous (no row padding) — confirm.
for(int i=0; i < netRT->input_dim.c; i++){
int idx = i * imagePreproc.rows * imagePreproc.cols;
checkCuda( cudaMemcpy((void *)&input_d[idx + netRT->input_dim.tot()*bi], (void *)bgr[i].data, imagePreproc.rows * imagePreproc.cols* sizeof(float), cudaMemcpyDeviceToDevice) );
}
#else
//resize image, remove mean, divide by std
cv::Mat frame_nomean;
// Resizes in place: the caller's `frame` (passed by non-const reference) is modified.
resize(frame, frame, cv::Size(netRT->input_dim.w, netRT->input_dim.h));
frame.convertTo(frame_nomean, CV_32FC3, 1, -127);
frame_nomean.convertTo(imagePreproc, CV_32FC3, 1 / 128.0, 0);

//copy image into tensor and copy it into GPU
cv::split(imagePreproc, bgr);
// Planes are packed into the host buffer `input` in split order (bgr[i]),
// then the whole batch slot is uploaded asynchronously on netRT->stream.
for (int i = 0; i < netRT->input_dim.c; i++){
int idx = i * imagePreproc.rows * imagePreproc.cols;
memcpy((void *)&input[idx + netRT->input_dim.tot()*bi], (void *)bgr[i].data, imagePreproc.rows * imagePreproc.cols * sizeof(dnnType));
}
checkCuda(cudaMemcpyAsync(input_d+ netRT->input_dim.tot()*bi, input + netRT->input_dim.tot()*bi, netRT->input_dim.tot() * sizeof(dnnType), cudaMemcpyHostToDevice, netRT->stream));
#endif
// Replacement path: both helpers are defined outside this listing.
// NOTE(review): the last argument is `false` here; presumably it selects a
// channel reversal, which the removed code above did not perform for this
// network — confirm against the helper's declaration.
resizeAndSplit(frame, &frame_d, frame_size, input_d, netRT, bi, false);
// NOTE(review): 127.0f / 128.0f mirror the removed convertTo() constants, so
// this presumably computes (x - 127) / 128 per element — confirm.
normalize(input_d + netRT->input_dim.tot()*bi, netRT->input_dim.c, netRT->input_dim.h, netRT->input_dim.w, 127.0f, 128.0f);
}

void MobilenetDetection::postprocess(const int bi, const bool mAP){
Expand Down
34 changes: 2 additions & 32 deletions src/Yolo3Detection.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,38 +57,8 @@ bool Yolo3Detection::init(const std::string& tensor_path, const int n_classes, c
}

// Prepares one input frame for batch slot `bi`: the result is written to
// input_d at offset netRT->input_dim.tot()*bi.
//
// NOTE(review): this listing is a rendered diff without +/- markers. The file
// header reports "2 additions & 32 deletions", which matches the 32-line
// #ifdef/#else/#endif block below (the removed OpenCV-based implementation)
// and the two calls after #endif (the added replacement).
void Yolo3Detection::preprocess(cv::Mat &frame, const int bi){
#ifdef OPENCV_CUDACONTRIB
// Upload the frame, resize it on the GPU and convert to float scaled by 1/255.
cv::cuda::GpuMat orig_img, img_resized;
orig_img = cv::cuda::GpuMat(frame);
cv::cuda::resize(orig_img, img_resized, cv::Size(netRT->input_dim.w, netRT->input_dim.h));

img_resized.convertTo(imagePreproc, CV_32FC3, 1/255.0);

//split channels
cv::cuda::split(imagePreproc,bgr);//split source

//write channels
// Channel order is reversed (ch = c-1-i). Each plane is downloaded to the host
// Mat bgr_h and then copied back to the device with a blocking cudaMemcpy.
for(int i=0; i<netRT->input_dim.c; i++) {
int size = imagePreproc.rows * imagePreproc.cols;
int ch = netRT->input_dim.c-1 -i;
bgr[ch].download(bgr_h); //TODO: don't copy back on CPU
checkCuda( cudaMemcpy(input_d + i*size + netRT->input_dim.tot()*bi, (float*)bgr_h.data, size*sizeof(dnnType), cudaMemcpyHostToDevice));
}
#else
// Resizes in place: the caller's `frame` (passed by non-const reference) is modified.
cv::resize(frame, frame, cv::Size(netRT->input_dim.w, netRT->input_dim.h));
frame.convertTo(imagePreproc, CV_32FC3, 1/255.0);

//split channels
cv::split(imagePreproc,bgr);//split source

//write channels
// Planes are packed into the host buffer `input` in reversed channel order
// (ch = c-1-i), then the whole batch slot is uploaded asynchronously on
// netRT->stream.
for(int i=0; i<netRT->input_dim.c; i++) {
int idx = i*imagePreproc.rows*imagePreproc.cols;
int ch = netRT->input_dim.c-1 -i;
memcpy((void*)&input[idx + netRT->input_dim.tot()*bi], (void*)bgr[ch].data, imagePreproc.rows*imagePreproc.cols*sizeof(dnnType));
}
checkCuda(cudaMemcpyAsync(input_d + netRT->input_dim.tot()*bi, input + netRT->input_dim.tot()*bi, netRT->input_dim.tot()*sizeof(dnnType), cudaMemcpyHostToDevice, netRT->stream));
#endif
// Replacement path: both helpers are defined outside this listing.
// NOTE(review): the last argument is `true` here; presumably it requests the
// same channel reversal the removed code performed with `ch` — confirm against
// the helper's declaration.
resizeAndSplit(frame, &frame_d, frame_size, input_d, netRT, bi, true);
// NOTE(review): 0.0f / 255.0f mirror the removed 1/255.0 scaling, so this
// presumably computes (x - 0) / 255 per element — confirm.
normalize(input_d + netRT->input_dim.tot()*bi, netRT->input_dim.c, netRT->input_dim.h, netRT->input_dim.w, 0.0f, 255.0f);
}

void Yolo3Detection::postprocess(const int bi, const bool mAP){
Expand Down

0 comments on commit a0f54cd

Please sign in to comment.