Improve YOLO, MobileNet and ShelfNet preprocessing using the GPU (#56)

Signed-off-by: Micaela Verucchi <[email protected]>
ceccocats · Jan 27, 2022 · a0f54cd
1 parent c2825cc
commit a0f54cd
Show file tree

Hide file tree

Showing 4 changed files with 15 additions and 76 deletions.
diff --git a/include/tkDNN/DetectionNN.h b/include/tkDNN/DetectionNN.h
@@ -1,6 +1,8 @@
 #ifndef DETECTIONNN_H
 #define DETECTIONNN_H
 
+#include "kernels.h"
+
 #include <iostream>
 #include <signal.h>
 #include <stdlib.h>    
@@ -16,6 +18,7 @@
 #include <opencv2/imgproc/imgproc.hpp>
 
 #include "tkdnn.h"
+#include "utilsNN.h"
 
 //#define OPENCV_CUDACONTRIB //if OPENCV has been compiled with CUDA and contrib.
 
@@ -31,13 +34,15 @@ class DetectionNN {
 
     protected:
         tk::dnn::NetworkRT *netRT = nullptr;
+        uint8_t *frame_d = nullptr;
         dnnType *input_d;
 
         std::vector<cv::Size> originalSize;
 
         cv::Scalar colors[256];
 
         int nBatches = 1;
+        int frame_size = 0;
 
 #ifdef OPENCV_CUDACONTRIB
         cv::cuda::GpuMat bgr[3];

diff --git a/include/tkDNN/SegmentationNN.h b/include/tkDNN/SegmentationNN.h
@@ -1,6 +1,8 @@
 #ifndef SEGMENTATIONNN_H
 #define SEGMENTATIONNN_H
 
+#include "kernels.h"
+
 #include <iostream>
 #include <signal.h>
 #include <stdlib.h>    
@@ -16,14 +18,17 @@
 #include "tkdnn.h"
 #include "NetworkViz.h"
 #include "kernelsThrust.h"
+#include "utilsNN.h"
 
 namespace tk { namespace dnn {
 
 class SegmentationNN {
 
     protected:
         tk::dnn::NetworkRT *netRT = nullptr;
+        uint8_t *frame_d = nullptr;
         int nBatches = 1;
+        int frame_size = 0;
 
         std::vector<cv::Size> originalSize;
         cv::Mat bgr[3];
@@ -76,18 +81,7 @@ class SegmentationNN {
             cv::copyMakeBorder(frame, frame_cropped, top, bottom, left, right, cv::BORDER_CONSTANT, cv::Scalar(0,0,0) );
 
             tk::dnn::dataDim_t idim = netRT->input_dim;
-
-            resize(frame_cropped, frame_cropped, cv::Size(idim.w, idim.h));
-
-            cv::split(frame_cropped, bgr);
-            for (int i = 0; i < idim.c; i++){
-                int idx = i * frame_cropped.rows * frame_cropped.cols;
-                int ch = idim.c-1 -i;
-                memcpy((void *)&input[idx + idim.tot()*bi], (void *)bgr[ch].data, frame_cropped.rows * frame_cropped.cols * sizeof(dnnType));
-            }
-
-            checkCuda(cudaMemcpyAsync(input_d+ idim.tot()*bi, input + idim.tot()*bi, idim.tot() * sizeof(dnnType), cudaMemcpyHostToDevice, netRT->stream));
-
+            resizeAndSplit(frame_cropped, &frame_d, frame_size, input_d, netRT, bi, true);
             normalize(input_d + idim.tot()*bi, idim.c, idim.h, idim.w, mean_d, stddev_d);
         }        
 

diff --git a/src/MobilenetDetection.cpp b/src/MobilenetDetection.cpp
@@ -211,38 +211,8 @@ bool MobilenetDetection::init(const std::string& tensor_path, const int n_classe
 }
 
 void MobilenetDetection::preprocess(cv::Mat &frame, const int bi){
-#ifdef OPENCV_CUDACONTRIB
-        //move original image on GPU
-        cv::cuda::GpuMat orig_img, frame_nomean;
-        orig_img = cv::cuda::GpuMat(frame);
-
-        //resize image, remove mean, divide by std
-        cv::cuda::resize (orig_img, orig_img, cv::Size(netRT->input_dim.w, netRT->input_dim.h)); 
-        orig_img.convertTo(frame_nomean, CV_32FC3, 1, -127);
-        frame_nomean.convertTo(imagePreproc, CV_32FC3, 1 / 128.0, 0);
-
-        //copy image into tensors
-        cv::cuda::split(imagePreproc, bgr);
-
-        for(int i=0; i < netRT->input_dim.c; i++){
-            int idx = i * imagePreproc.rows * imagePreproc.cols;
-            checkCuda( cudaMemcpy((void *)&input_d[idx + netRT->input_dim.tot()*bi], (void *)bgr[i].data, imagePreproc.rows * imagePreproc.cols* sizeof(float), cudaMemcpyDeviceToDevice) );
-        }
-#else
-        //resize image, remove mean, divide by std
-        cv::Mat frame_nomean;
-        resize(frame, frame, cv::Size(netRT->input_dim.w, netRT->input_dim.h));
-        frame.convertTo(frame_nomean, CV_32FC3, 1, -127);
-        frame_nomean.convertTo(imagePreproc, CV_32FC3, 1 / 128.0, 0);
-
-        //copy image into tensor and copy it into GPU
-        cv::split(imagePreproc, bgr);
-        for (int i = 0; i < netRT->input_dim.c; i++){
-            int idx = i * imagePreproc.rows * imagePreproc.cols;
-            memcpy((void *)&input[idx + netRT->input_dim.tot()*bi], (void *)bgr[i].data, imagePreproc.rows * imagePreproc.cols * sizeof(dnnType));
-        }
-        checkCuda(cudaMemcpyAsync(input_d+ netRT->input_dim.tot()*bi, input + netRT->input_dim.tot()*bi, netRT->input_dim.tot() * sizeof(dnnType), cudaMemcpyHostToDevice, netRT->stream));
-#endif
+    resizeAndSplit(frame, &frame_d, frame_size, input_d, netRT, bi, false);
+    normalize(input_d +  netRT->input_dim.tot()*bi,  netRT->input_dim.c,  netRT->input_dim.h,  netRT->input_dim.w, 127.0f, 128.0f);
 }
 
 void MobilenetDetection::postprocess(const int bi, const bool mAP){

diff --git a/src/Yolo3Detection.cpp b/src/Yolo3Detection.cpp
@@ -57,38 +57,8 @@ bool Yolo3Detection::init(const std::string& tensor_path, const int n_classes, c
 } 
 
 void Yolo3Detection::preprocess(cv::Mat &frame, const int bi){
-#ifdef OPENCV_CUDACONTRIB
-    cv::cuda::GpuMat orig_img, img_resized;
-    orig_img = cv::cuda::GpuMat(frame);
-    cv::cuda::resize(orig_img, img_resized, cv::Size(netRT->input_dim.w, netRT->input_dim.h));
-
-    img_resized.convertTo(imagePreproc, CV_32FC3, 1/255.0); 
-
-    //split channels
-    cv::cuda::split(imagePreproc,bgr);//split source
-
-    //write channels
-    for(int i=0; i<netRT->input_dim.c; i++) {
-        int size = imagePreproc.rows * imagePreproc.cols;
-        int ch = netRT->input_dim.c-1 -i;
-        bgr[ch].download(bgr_h); //TODO: don't copy back on CPU
-        checkCuda( cudaMemcpy(input_d + i*size + netRT->input_dim.tot()*bi, (float*)bgr_h.data, size*sizeof(dnnType), cudaMemcpyHostToDevice));
-    }
-#else
-    cv::resize(frame, frame, cv::Size(netRT->input_dim.w, netRT->input_dim.h));
-    frame.convertTo(imagePreproc, CV_32FC3, 1/255.0); 
-
-    //split channels
-    cv::split(imagePreproc,bgr);//split source
-
-    //write channels
-    for(int i=0; i<netRT->input_dim.c; i++) {
-        int idx = i*imagePreproc.rows*imagePreproc.cols;
-        int ch = netRT->input_dim.c-1 -i;
-        memcpy((void*)&input[idx + netRT->input_dim.tot()*bi], (void*)bgr[ch].data, imagePreproc.rows*imagePreproc.cols*sizeof(dnnType));     
-    }
-    checkCuda(cudaMemcpyAsync(input_d + netRT->input_dim.tot()*bi, input + netRT->input_dim.tot()*bi, netRT->input_dim.tot()*sizeof(dnnType), cudaMemcpyHostToDevice, netRT->stream));
-#endif
+    resizeAndSplit(frame, &frame_d, frame_size, input_d, netRT, bi, true);
+    normalize(input_d +  netRT->input_dim.tot()*bi,  netRT->input_dim.c,  netRT->input_dim.h,  netRT->input_dim.w, 0.0f, 255.0f);
 }
 
 void Yolo3Detection::postprocess(const int bi, const bool mAP){