From ad8cea1aa7e641d271ccf0677676dbd138901556 Mon Sep 17 00:00:00 2001
From: Alexander Mock <amock@uos.de>
Date: Sun, 23 Oct 2022 22:16:47 +0200
Subject: [PATCH] found bug in math. Fixed it for SphereCorrectors. TODO: fix
 for all

---
 CMakeLists.txt                                |   1 +
 include/rmcl/correction/MICP.hpp              |  16 ++
 include/rmcl/correction/MICPRangeSensor.hpp   |   7 +
 .../rmcl/correction/SphereCorrectorEmbree.hpp |  25 +++
 .../rmcl/correction/SphereCorrectorOptix.hpp  |  32 +++-
 include/rmcl/util/ros_defines.h               |   1 +
 package.xml                                   |   3 +
 src/nodes/micp_localization.cpp               |  12 +-
 src/rmcl/correction/MICP.cpp                  |   1 +
 src/rmcl/correction/MICPRangeSensor.cpp       | 132 ++++++++++++-
 src/rmcl/correction/SphereCorrectorEmbree.cpp | 174 ++++++++++++++++--
 src/rmcl/correction/SphereCorrectorOptix.cpp  | 162 ++++++++++------
 .../optix/SphereCorrectProgramRW.cu           |  39 ++--
 src/rmcl/math/math.cpp                        |   4 +-
 src/rmcl/math/math.cu                         |  39 +---
 src/rmcl/math/math_batched.cu                 |   7 +-
 16 files changed, 518 insertions(+), 137 deletions(-)
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55bc713..38ded21 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -29,6 +29,7 @@ find_package(catkin REQUIRED COMPONENTS
     tf2_ros
     rmcl_msgs
     image_transport
+    visualization_msgs
 )
 
 include_directories(
diff --git a/include/rmcl/correction/MICP.hpp b/include/rmcl/correction/MICP.hpp
index eede0ff..833d927 100644
--- a/include/rmcl/correction/MICP.hpp
+++ b/include/rmcl/correction/MICP.hpp
@@ -157,6 +157,22 @@ class MICP
         return m_sensors;
     }
 
+    void useInThisThread()
+    {
+        #ifdef RMCL_OPTIX
+        // TODO hold a context ptr at MICP object
+        if(m_map_optix)
+        {
+            auto cuda_ctx = m_map_optix->scene()->stream()->context();
+            if(!cuda_ctx->isActive())
+            {
+                std::cout << "MICP - reactivating CUDA for thread" << std::endl;
+                cuda_ctx->use();
+            }
+        }
+        #endif // RMCL_OPTIX
+    }
+
 protected:
     bool checkTF(bool prints = false);
 
diff --git a/include/rmcl/correction/MICPRangeSensor.hpp b/include/rmcl/correction/MICPRangeSensor.hpp
index 75d7a4c..66e2f52 100644
--- a/include/rmcl/correction/MICPRangeSensor.hpp
+++ b/include/rmcl/correction/MICPRangeSensor.hpp
@@ -157,10 +157,13 @@ class MICPRangeSensor
     size_t      n_ranges_valid = 0;
 
     
+    
     // subscriber to data
     NodeHandlePtr nh;
+    NodeHandlePtr nh_p;
     SubscriberPtr data_sub;
     SubscriberPtr info_sub;
+    
 
     ImageTransportPtr it;
     ITSubscriberPtr img_sub;
@@ -173,6 +176,10 @@ class MICPRangeSensor
     float                       adaptive_max_dist_min = 0.15;
     float                       corr_weight = 1.0;
 
+    // DEBUGGING
+    bool            draw_correspondences = true;
+    PublisherPtr    pub_corr;
+
     // correction: TODO better
     #ifdef RMCL_EMBREE
     SphereCorrectorEmbreePtr     corr_sphere_embree;
diff --git a/include/rmcl/correction/SphereCorrectorEmbree.hpp b/include/rmcl/correction/SphereCorrectorEmbree.hpp
index 729e678..b2bf397 100644
--- a/include/rmcl/correction/SphereCorrectorEmbree.hpp
+++ b/include/rmcl/correction/SphereCorrectorEmbree.hpp
@@ -92,6 +92,20 @@ class SphereCorrectorEmbree
         const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms
     );
 
+    void findCorrespondences(
+        const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
+        rmagine::MemoryView<rmagine::Point> dataset_points,
+        rmagine::MemoryView<rmagine::Point> model_points,
+        rmagine::MemoryView<unsigned int> corr_valid
+    );
+
+    void findCorrespondences(
+        const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
+        rmagine::Memory<rmagine::Point>& dataset_points,
+        rmagine::Memory<rmagine::Point>& model_points,
+        rmagine::Memory<unsigned int>& corr_valid
+    );
+
     // TODO: camel case
     void compute_covs(
         const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
@@ -121,6 +135,17 @@ class SphereCorrectorEmbree
         const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms, 
         size_t Ntests = 100);
 
+    // TODO: add properly - rmagine
+    inline CorrectionParams params() const
+    {
+        return m_params;
+    }
+
+    inline rmagine::SphericalModel model() const
+    {
+        return m_model[0];
+    }
+
 protected:
     rmagine::Memory<float, rmagine::RAM> m_ranges;
 
diff --git a/include/rmcl/correction/SphereCorrectorOptix.hpp b/include/rmcl/correction/SphereCorrectorOptix.hpp
index 2a39200..8d92418 100644
--- a/include/rmcl/correction/SphereCorrectorOptix.hpp
+++ b/include/rmcl/correction/SphereCorrectorOptix.hpp
@@ -72,10 +72,26 @@ class SphereCorrectorOptix
         const rmagine::MemoryView<float, rmagine::VRAM_CUDA>& ranges
     );
 
+    
+
     CorrectionResults<rmagine::VRAM_CUDA> correct(
         const rmagine::MemoryView<rmagine::Transform, rmagine::VRAM_CUDA>& Tbms
     ) const;
 
+    void findCorrespondences(
+        const rmagine::MemoryView<rmagine::Transform, rmagine::VRAM_CUDA>& Tbms,
+        rmagine::MemoryView<rmagine::Point, rmagine::VRAM_CUDA> dataset_points,
+        rmagine::MemoryView<rmagine::Point, rmagine::VRAM_CUDA> model_points,
+        rmagine::MemoryView<unsigned int, rmagine::VRAM_CUDA> corr_valid
+    ) const;
+
+    void findCorrespondences(
+        const rmagine::MemoryView<rmagine::Transform, rmagine::VRAM_CUDA>& Tbms,
+        rmagine::Memory<rmagine::Point, rmagine::VRAM_CUDA>& dataset_points,
+        rmagine::Memory<rmagine::Point, rmagine::VRAM_CUDA>& model_points,
+        rmagine::Memory<unsigned int, rmagine::VRAM_CUDA>& corr_valid
+    ) const;
+
     void compute_covs(
         const rmagine::MemoryView<rmagine::Transform, rmagine::VRAM_CUDA>& Tbms,
         rmagine::MemoryView<rmagine::Vector, rmagine::VRAM_CUDA>& ms,
@@ -92,8 +108,6 @@ class SphereCorrectorOptix
     CorrectionPreResults<rmagine::VRAM_CUDA> compute_covs(
         const rmagine::MemoryView<rmagine::Transform, rmagine::VRAM_CUDA>& Tbms
     ) const;
-    
-
     struct Timings
     {   
         double sim = 0.0;
@@ -105,6 +119,20 @@ class SphereCorrectorOptix
         const rmagine::MemoryView<rmagine::Transform, rmagine::VRAM_CUDA>& Tbms, 
         size_t Ntests = 100);
 
+
+    // TODO: add properly - rmagine
+    inline CorrectionParams params() const
+    {
+        rmagine::Memory<CorrectionParams, rmagine::RAM> params_ = m_params;
+        return params_[0];
+    }
+
+    inline rmagine::SphericalModel model() const
+    {
+        rmagine::Memory<rmagine::SphericalModel, rmagine::RAM> model_ = m_model;
+        return model_[0];
+    }
+
 protected:
     rmagine::Memory<float, rmagine::VRAM_CUDA> m_ranges;
     rmagine::Memory<CorrectionParams, rmagine::VRAM_CUDA> m_params;
diff --git a/include/rmcl/util/ros_defines.h b/include/rmcl/util/ros_defines.h
index 5d946c6..a72273e 100644
--- a/include/rmcl/util/ros_defines.h
+++ b/include/rmcl/util/ros_defines.h
@@ -51,6 +51,7 @@ namespace rmcl
 
 using NodeHandlePtr = std::shared_ptr<ros::NodeHandle>;
 using SubscriberPtr = std::shared_ptr<ros::Subscriber>;
+using PublisherPtr = std::shared_ptr<ros::Publisher>;
 using ImageTransportPtr = std::shared_ptr<image_transport::ImageTransport>;
 using ITSubscriberPtr = std::shared_ptr<image_transport::Subscriber>;
 using TFBufferPtr = std::shared_ptr<tf2_ros::Buffer>;
diff --git a/package.xml b/package.xml
index cc0838a..df95b4f 100644
--- a/package.xml
+++ b/package.xml
@@ -17,16 +17,19 @@
     <build_depend>tf2_ros</build_depend>
     <build_depend>rmcl_msgs</build_depend>
     <build_depend>image_transport</build_depend>
+    <build_depend>visualization_msgs</build_depend>
     <build_export_depend>roscpp</build_export_depend>
     <build_export_depend>geometry_msgs</build_export_depend>
     <build_export_depend>sensor_msgs</build_export_depend>
     <build_export_depend>tf2_ros</build_export_depend>
     <build_export_depend>rmcl_msgs</build_export_depend>
     <build_export_depend>image_transport</build_export_depend>
+    <build_export_depend>visualization_msgs</build_export_depend>
     <exec_depend>roscpp</exec_depend>
     <exec_depend>geometry_msgs</exec_depend>
     <exec_depend>sensor_msgs</exec_depend>
     <exec_depend>tf2_ros</exec_depend>
     <exec_depend>rmcl_msgs</exec_depend>
     <exec_depend>image_transport</exec_depend>
+    <exec_depend>visualization_msgs</exec_depend>
 </package>
\ No newline at end of file
diff --git a/src/nodes/micp_localization.cpp b/src/nodes/micp_localization.cpp
index 14c6b52..6f1c4e0 100644
--- a/src/nodes/micp_localization.cpp
+++ b/src/nodes/micp_localization.cpp
@@ -45,6 +45,7 @@ std::mutex                      T_base_odom_mutex;
 Transform Tbo;
 
 bool invert_tf = false;
+bool correction_disabled = false;
 
 
 Transform initial_pose_offset;
@@ -239,7 +240,12 @@ void correctOnce()
     // pose == Tbm
     // Tom = Tbm * Tob
     // 
-    Tom = poses[0] * ~Tbo;
+
+    // apply actual correction as Tom
+    if(!correction_disabled)
+    {
+        Tom = poses[0] * ~Tbo;
+    }
 }
 
 // Storing Pose information globally
@@ -420,6 +426,7 @@ int main(int argc, char** argv)
     ros::Subscriber pose_sub = nh.subscribe<geometry_msgs::PoseStamped>("pose", 1, poseCB);
     ros::Subscriber pose_wc_sub = nh.subscribe<geometry_msgs::PoseWithCovarianceStamped>("pose_wc", 1, poseWcCB);
 
+
     // CORRECTION THREAD
     stop_correction_thread = false;
     correction_thread = std::thread([corr_rate_max, print_corr_rate]()
@@ -430,6 +437,9 @@ int main(int argc, char** argv)
         // minimum duration for one loop
         double el_min = 1.0 / corr_rate_max;
 
+        // reactivate cuda context if required
+        micp->useInThisThread();
+
         while(!stop_correction_thread)
         {
             sw();
diff --git a/src/rmcl/correction/MICP.cpp b/src/rmcl/correction/MICP.cpp
index 102f8ec..8ddb848 100644
--- a/src/rmcl/correction/MICP.cpp
+++ b/src/rmcl/correction/MICP.cpp
@@ -766,6 +766,7 @@ bool MICP::loadSensor(std::string sensor_name, XmlRpc::XmlRpcValue sensor_params
     
     // connect sensor to ROS
     sensor->nh = m_nh;
+    sensor->nh_p = m_nh_p;
     sensor->tf_buffer = m_tf_buffer; 
     sensor->connect();
 
diff --git a/src/rmcl/correction/MICPRangeSensor.cpp b/src/rmcl/correction/MICPRangeSensor.cpp
index c47b062..62f120b 100644
--- a/src/rmcl/correction/MICPRangeSensor.cpp
+++ b/src/rmcl/correction/MICPRangeSensor.cpp
@@ -15,6 +15,9 @@
 
 
 #include <rmagine/math/math.cuh>
+#include <rmagine/util/prints.h>
+
+#include <visualization_msgs/Marker.h>
 
 
 namespace rm = rmagine;
@@ -22,8 +25,78 @@ namespace rm = rmagine;
 namespace rmcl
 {
 
+visualization_msgs::Marker make_marker(
+    rm::MemoryView<rm::Point, rm::RAM> dataset_points,
+    rm::MemoryView<rm::Point, rm::RAM> model_points,
+    rm::MemoryView<unsigned int, rm::RAM> corr_valid,
+    rm::Transform Tbm)
+{
+    visualization_msgs::Marker marker;
+    marker.header.frame_id = "map";
+    marker.id = 0;
+    marker.type = visualization_msgs::Marker::LINE_LIST;
+    marker.action = visualization_msgs::Marker::ADD;
+    marker.pose.orientation.w = 1.0;
+    marker.scale.x = 0.01;
+    marker.scale.y = 0.01;
+    marker.scale.z = 0.01;
+    marker.color.a = 1.0;
+    marker.color.g = 1.0;
+
+    for(size_t j=0; j<dataset_points.size(); j++)
+    {
+        if(corr_valid[j] > 0)
+        {
+            // transform from base coords to map coords
+            rm::Point d = Tbm * dataset_points[j];
+            rm::Point m = Tbm * model_points[j];
+            
+            geometry_msgs::Point dros;
+            geometry_msgs::Point mros;
+
+            dros.x = d.x;
+            dros.y = d.y;
+            dros.z = d.z;
+
+            mros.x = m.x;
+            mros.y = m.y;
+            mros.z = m.z;
+
+            marker.points.push_back(dros);
+            marker.points.push_back(mros);
+        }
+    }
+
+    return marker;
+}
+
+#ifdef RMCL_CUDA
+visualization_msgs::Marker make_marker(
+    rm::MemoryView<rm::Point, rm::VRAM_CUDA> dataset_points,
+    rm::MemoryView<rm::Point, rm::VRAM_CUDA> model_points,
+    rm::MemoryView<unsigned int, rm::VRAM_CUDA> corr_valid,
+    rm::MemoryView<rm::Transform, rm::VRAM_CUDA> Tbm)
+{
+    rm::Memory<rm::Point, rm::RAM> dataset_points_ = dataset_points;
+    rm::Memory<rm::Point, rm::RAM> model_points_ = model_points;
+    rm::Memory<unsigned int, rm::RAM> corr_valid_ = corr_valid;
+    rm::Memory<rm::Transform, rm::RAM> Tbm_ = Tbm;
+
+    return make_marker(dataset_points_, model_points_, corr_valid_, Tbm_[0]);
+}
+#endif // RMCL_CUDA
+
 void MICPRangeSensor::connect()
 {
+    if(draw_correspondences)
+    {
+        std::stringstream draw_topic;
+        draw_topic << name << "/correspondences";
+        pub_corr = std::make_shared<ros::Publisher>(
+            nh_p->advertise<visualization_msgs::Marker>(draw_topic.str(), 1)
+        );
+    }
+
     if(type == 0) { // Spherical
         if(data_topic.msg == "rmcl_msgs/ScanStamped") {
             data_sub = std::make_shared<ros::Subscriber>(
@@ -144,7 +217,6 @@ void MICPRangeSensor::connect()
         } else {
             std::cout << "info topic message " << info_topic.msg << " not supported" << std::endl; 
         }
-        
     }
 }
 
@@ -155,13 +227,12 @@ void MICPRangeSensor::fetchTF()
     
     if(frame != base_frame)
     {
-        try 
+        try
         {
             T_sensor_base = tf_buffer->lookupTransform(base_frame, frame, ros::Time(0));
         } catch (tf2::TransformException &ex) {
             ROS_WARN("%s", ex.what());
             ROS_WARN_STREAM("Source: " << frame << ", Target: " << base_frame);
-
             return;
         }
     } else {
@@ -329,6 +400,31 @@ void MICPRangeSensor::computeCovs(
     if(backend == 0)
     {
         if(type == 0) {
+
+            if(draw_correspondences)
+            {
+                // draw correspondences of first pose
+                auto Tbms0 = Tbms(0, 0+1);
+
+                rm::Memory<rm::Point, rm::RAM> dataset_points;
+                rm::Memory<rm::Point, rm::RAM> model_points;
+                rm::Memory<unsigned int, rm::RAM> corr_valid;
+
+                corr_sphere_embree->findCorrespondences(Tbms0, 
+                    dataset_points, model_points, corr_valid);
+
+                auto marker = make_marker(
+                    dataset_points, model_points, 
+                    corr_valid, Tbms0[0]);
+                
+                marker.header.stamp = ros::Time::now();
+                if(pub_corr)
+                {
+                    pub_corr->publish(marker);
+                }
+            }
+
+
             corr_sphere_embree->compute_covs(Tbms, res);
         } else if(type == 1) {
             corr_pinhole_embree->compute_covs(Tbms, res);   
@@ -373,8 +469,8 @@ void MICPRangeSensor::computeCovs(
 
 #ifdef RMCL_CUDA
 void MICPRangeSensor::computeCovs(
-    const rmagine::MemoryView<rmagine::Transform, rmagine::VRAM_CUDA>& Tbms,
-    CorrectionPreResults<rmagine::VRAM_CUDA>& res)
+    const rm::MemoryView<rm::Transform, rm::VRAM_CUDA>& Tbms,
+    CorrectionPreResults<rm::VRAM_CUDA>& res)
 {
     // this is bad. maybe we must go away from having a completely generic sensor
     // std::cout << "Compute Covs" << std::endl;
@@ -408,11 +504,37 @@ void MICPRangeSensor::computeCovs(
     }
     #endif // RMCL_EMBREE
     
+    
     #ifdef RMCL_OPTIX
     if(backend == 1)
     {
         // compute
         if(type == 0) {
+            // std::cout << "SPHERE GPU" << std::endl;
+
+            if(draw_correspondences)
+            {
+                // draw correspondences of first pose
+                auto Tbms0 = Tbms(0, 0+1);
+
+                rm::Memory<rm::Point, rm::VRAM_CUDA> dataset_points;
+                rm::Memory<rm::Point, rm::VRAM_CUDA> model_points;
+                rm::Memory<unsigned int, rm::VRAM_CUDA> corr_valid;
+
+                corr_sphere_optix->findCorrespondences(Tbms0, 
+                    dataset_points, model_points, corr_valid);
+
+                auto marker = make_marker(
+                    dataset_points, model_points, 
+                    corr_valid, Tbms0);
+                
+                marker.header.stamp = ros::Time::now();
+                if(pub_corr)
+                {
+                    pub_corr->publish(marker);
+                }
+            }
+
             corr_sphere_optix->compute_covs(Tbms, res);
         } else if(type == 1) {
             corr_pinhole_optix->compute_covs(Tbms, res);   
diff --git a/src/rmcl/correction/SphereCorrectorEmbree.cpp b/src/rmcl/correction/SphereCorrectorEmbree.cpp
index 608d24f..6557d74 100644
--- a/src/rmcl/correction/SphereCorrectorEmbree.cpp
+++ b/src/rmcl/correction/SphereCorrectorEmbree.cpp
@@ -174,7 +174,139 @@ CorrectionResults<rmagine::RAM> SphereCorrectorEmbree::correct(
     return res;
 }
 
+void SphereCorrectorEmbree::findCorrespondences(
+    const rm::MemoryView<rm::Transform, rm::RAM>& Tbms,
+    rm::MemoryView<rm::Point> dataset_points,
+    rm::MemoryView<rm::Point> model_points,
+    rm::MemoryView<unsigned int> corr_valid)
+{
+    const float max_distance = m_params.max_distance;
+
+    auto scene = m_map->scene->handle();
+
+    const rm::Transform Tsb = m_Tsb[0];
+
+    #pragma omp parallel for default(shared)
+    for(size_t pid=0; pid < Tbms.size(); pid++)
+    {
+        const rmagine::Transform Tbm = Tbms[pid];
+        const rmagine::Transform Tsm = Tbm * Tsb;
+        const rmagine::Transform Tms = ~Tsm;
+
+        const unsigned int glob_shift = pid * m_model->size();
+
+        for(unsigned int vid = 0; vid < m_model->getHeight(); vid++)
+        {
+            for(unsigned int hid = 0; hid < m_model->getWidth(); hid++)
+            {
+                const unsigned int loc_id = m_model->getBufferId(vid, hid);
+                const unsigned int glob_id = glob_shift + loc_id;
+
+                const float range_real = m_ranges[loc_id];
+                
+                if(range_real < m_model->range.min 
+                    || range_real > m_model->range.max)
+                {
+                    corr_valid[glob_id] = 0;
+                    continue;
+                }
+                
+                const Vector ray_dir_s = m_model->getDirection(vid, hid);
+                const Vector ray_dir_m = Tsm.R * ray_dir_s;
+
+                RTCRayHit rayhit;
+                rayhit.ray.org_x = Tsm.t.x;
+                rayhit.ray.org_y = Tsm.t.y;
+                rayhit.ray.org_z = Tsm.t.z;
+                rayhit.ray.dir_x = ray_dir_m.x;
+                rayhit.ray.dir_y = ray_dir_m.y;
+                rayhit.ray.dir_z = ray_dir_m.z;
+                rayhit.ray.tnear = 0;
+                rayhit.ray.tfar = INFINITY;
+                rayhit.ray.mask = 0;
+                rayhit.ray.flags = 0;
+                rayhit.hit.geomID = RTC_INVALID_GEOMETRY_ID;
+                rayhit.hit.instID[0] = RTC_INVALID_GEOMETRY_ID;
+
+                rtcIntersect1(scene, &m_context, &rayhit);
+
+                bool sim_valid = rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID;
+                if(sim_valid)
+                {
+                    // map space
+                    Vector nint_m;
+                    nint_m.x = rayhit.hit.Ng_x;
+                    nint_m.y = rayhit.hit.Ng_y;
+                    nint_m.z = rayhit.hit.Ng_z;
+                    nint_m.normalizeInplace();
+
+                    // Do point to plane ICP here
+                    Vector preal_s, pint_s, nint_s;
+                    preal_s = ray_dir_s * range_real;
 
+                    // search point on surface that is more nearby
+                    pint_s = ray_dir_s * rayhit.ray.tfar;
+                    
+                    // transform normal from global to local
+                    nint_s = Tms.R * nint_m;
+
+                    // flip to base
+                    if(ray_dir_s.dot(nint_s) > 0.0)
+                    {
+                        nint_s = -nint_s;
+                    }
+
+                    // distance of real point to plane at simulated point
+                    float signed_plane_dist = (pint_s - preal_s).dot(nint_s);
+                    // project point to plane results in correspondence
+                    const Vector pmesh_s = preal_s + nint_s * signed_plane_dist;  
+
+                    const float distance = (pmesh_s - preal_s).l2norm();
+
+                    if(distance < max_distance)
+                    {
+                        // convert back to base (sensor shared coordinate system)
+                        const Vector preal_b = Tsb * preal_s;
+                        const Vector pmesh_b = Tsb * pmesh_s;
+
+                        dataset_points[glob_id] = preal_b;
+                        model_points[glob_id] = pmesh_b;
+                        corr_valid[glob_id] = 1;
+                    } else {
+                        corr_valid[glob_id] = 0;
+                    }
+                } else {
+                    corr_valid[glob_id] = 0;
+                }
+            }
+        }
+    }
+}
+
+void SphereCorrectorEmbree::findCorrespondences(
+    const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
+    rmagine::Memory<rmagine::Point>& dataset_points,
+    rmagine::Memory<rmagine::Point>& model_points,
+    rmagine::Memory<unsigned int>& corr_valid)
+{
+    size_t Nrays = Tbms.size() * m_model->size();
+    if(dataset_points.size() < Nrays)
+    {
+        dataset_points.resize(Nrays);
+    }
+
+    if(model_points.size() < Nrays)
+    {
+        model_points.resize(Nrays);
+    }
+
+    if(corr_valid.size() < Nrays)
+    {
+        corr_valid.resize(Nrays);
+    }
+
+    findCorrespondences(Tbms, dataset_points(0, Nrays), model_points(0, Nrays), corr_valid(0, Nrays));
+}
 
 void SphereCorrectorEmbree::compute_covs(
     const rmagine::MemoryView<rmagine::Transform, rmagine::RAM>& Tbms,
@@ -187,13 +319,15 @@ void SphereCorrectorEmbree::compute_covs(
 
     auto scene = m_map->scene->handle();
 
+    const rmagine::Transform Tsb = m_Tsb[0];
+
+
     #pragma omp parallel for default(shared)
     for(size_t pid=0; pid < Tbms.size(); pid++)
     {
         const rmagine::Transform Tbm = Tbms[pid];
-        const rmagine::Transform Tsb = m_Tsb[0];
         const rmagine::Transform Tsm = Tbm * Tsb;
-        const rmagine::Transform Tmb = ~Tbm;
+        const rmagine::Transform Tms = ~Tsm;
 
         const unsigned int glob_shift = pid * m_model->size();
 
@@ -225,9 +359,7 @@ void SphereCorrectorEmbree::compute_covs(
                     continue;
                 }
                 
-                
                 const Vector ray_dir_s = m_model->getDirection(vid, hid);
-                const Vector ray_dir_b = Tsb.R * ray_dir_s;
                 const Vector ray_dir_m = Tsm.R * ray_dir_s;
 
                 RTCRayHit rayhit;
@@ -249,31 +381,43 @@ void SphereCorrectorEmbree::compute_covs(
                 bool sim_valid = rayhit.hit.geomID != RTC_INVALID_GEOMETRY_ID;
                 if(sim_valid)
                 {
-                    // Do point to plane ICP here
-                    Vector preal_b, pint_b, nint_m, nint_b;
-                    preal_b = ray_dir_b * range_real;
-
-                    // search point on surface that is more nearby
-                    pint_b = ray_dir_b * rayhit.ray.tfar;
+                    // map space
+                    Vector nint_m;
                     nint_m.x = rayhit.hit.Ng_x;
                     nint_m.y = rayhit.hit.Ng_y;
                     nint_m.z = rayhit.hit.Ng_z;
                     nint_m.normalizeInplace();
+
+                    // Do point to plane ICP here
+                    Vector preal_s, pint_s, nint_s;
+                    preal_s = ray_dir_s * range_real;
+
+                    // search point on surface that is more nearby
+                    pint_s = ray_dir_s * rayhit.ray.tfar;
                     
                     // transform normal from global to local
-                    nint_b = Tmb.R * nint_m;
+                    nint_s = Tms.R * nint_m;
+
+                    // flip to base
+                    if(ray_dir_s.dot(nint_s) > 0.0)
+                    {
+                        nint_s = -nint_s;
+                    }
+
                     // distance of real point to plane at simulated point
-                    float signed_plane_dist = (preal_b - pint_b).dot(nint_b);
+                    float signed_plane_dist = (pint_s - preal_s).dot(nint_s);
                     // project point to plane results in correspondence
-                    const Vector pmesh_b = preal_b + nint_b * signed_plane_dist;  
+                    const Vector pmesh_s = preal_s + nint_s * signed_plane_dist;  
 
-                    const float distance = (pmesh_b - preal_b).l2norm();
+                    const float distance = (pmesh_s - preal_s).l2norm();
 
                     if(distance < max_distance)
                     {
+                        const Vector preal_b = Tsb * preal_s;
+                        const Vector pmesh_b = Tsb * pmesh_s;
                         Dmean_inner += preal_b;
                         Mmean_inner += pmesh_b;
-                        C_inner += preal_b.multT(pmesh_b);
+                        C_inner += pmesh_b.multT(preal_b);
                         Ncorr_inner++;
                     }
                 }
diff --git a/src/rmcl/correction/SphereCorrectorOptix.cpp b/src/rmcl/correction/SphereCorrectorOptix.cpp
index 438b8fe..ab382d8 100644
--- a/src/rmcl/correction/SphereCorrectorOptix.cpp
+++ b/src/rmcl/correction/SphereCorrectorOptix.cpp
@@ -93,6 +93,74 @@ CorrectionResults<rm::VRAM_CUDA> SphereCorrectorOptix::correct(
     return res;
 }
 
+void SphereCorrectorOptix::findCorrespondences(
+    const rm::MemoryView<rm::Transform, rm::VRAM_CUDA>& Tbms,
+    rm::MemoryView<rm::Point, rm::VRAM_CUDA> dataset_points,
+    rm::MemoryView<rm::Point, rm::VRAM_CUDA> model_points,
+    rm::MemoryView<unsigned int, rm::VRAM_CUDA> corr_valid
+) const
+{
+    // std::cout << "SphereCorrectorOptix - findCorrespondences" << std::endl;
+    if(!m_map)
+    {
+        throw std::runtime_error("NO MAP");
+    }
+
+    if(!m_map->scene())
+    {
+        throw std::runtime_error("EMPTY MAP");
+    }
+
+    if(!m_map->scene()->as())
+    {
+        throw std::runtime_error("MAP SCENE NOT COMMITTED");
+    }
+
+    if(!m_stream->context()->isActive())
+    {
+        std::cout << "[SphereCorrectorOptix::findCorrespondences() Need to activate map context" << std::endl;
+        m_stream->context()->use();
+    }
+
+    // size_t scanSize = m_width * m_height;
+    // size_t scanSize = m_model_ram->width * m_model_ram->height;
+    // size_t Nrays = Tbm.size() * scanSize;
+
+    // rm::Memory<unsigned int,    rm::VRAM_CUDA> corr_valid(Nrays);
+    // rm::Memory<rm::Vector, rm::VRAM_CUDA> model_points(Nrays);
+    // rm::Memory<rm::Vector, rm::VRAM_CUDA> dataset_points(Nrays);
+
+    rm::Memory<SphereCorrectionDataRW, rm::RAM> mem(1);
+    mem->model = m_model.raw();
+    mem->ranges = m_ranges.raw();
+    mem->Tsb = m_Tsb.raw();
+    mem->Tbm = Tbms.raw();
+    mem->Nposes = Tbms.size();
+    mem->params = m_params.raw();
+    mem->handle = m_map->scene()->as()->handle;
+    mem->corr_valid = corr_valid.raw();
+    mem->model_points = model_points.raw();
+    mem->dataset_points = dataset_points.raw();
+
+    rm::Memory<SphereCorrectionDataRW, rm::VRAM_CUDA> d_mem(1);
+    copy(mem, d_mem, m_stream);
+
+    rm::PipelinePtr program = make_pipeline_corr_rw(m_map->scene(), 0);
+
+    RM_OPTIX_CHECK( optixLaunch(
+        program->pipeline, 
+        m_stream->handle(), 
+        reinterpret_cast<CUdeviceptr>(d_mem.raw()), 
+        sizeof( SphereCorrectionDataRW ), 
+        program->sbt,
+        m_width, // width Xdim
+        m_height, // height Ydim
+        Tbms.size()// depth Zdim
+        ));
+
+    m_stream->synchronize();
+}
+
 void SphereCorrectorOptix::compute_covs(
     const rm::MemoryView<rm::Transform, rm::VRAM_CUDA>& Tbms,
     rm::MemoryView<rm::Vector, rm::VRAM_CUDA>& ms,
@@ -114,6 +182,41 @@ void SphereCorrectorOptix::compute_covs(
     }
 }
 
+void SphereCorrectorOptix::findCorrespondences(
+    const rm::MemoryView<rm::Transform, rm::VRAM_CUDA>& Tbms,
+    rm::Memory<rm::Point, rm::VRAM_CUDA>& dataset_points,
+    rm::Memory<rm::Point, rm::VRAM_CUDA>& model_points,
+    rm::Memory<unsigned int, rm::VRAM_CUDA>& corr_valid
+) const
+{
+    if(!m_stream->context()->isActive())
+    {
+        std::cout << "[SphereCorrectorOptix::findCorrespondences() Need to activate map context" << std::endl;
+        m_stream->context()->use();
+    }
+
+    size_t scanSize = m_width * m_height;
+    size_t Nrays = Tbms.size() * scanSize;
+
+    // keep the this if bigger than required
+    if(dataset_points.size() < Nrays)
+    {
+        dataset_points.resize(Nrays);
+    }
+
+    if(model_points.size() < Nrays)
+    {
+        model_points.resize(Nrays);
+    }
+
+    if(corr_valid.size() < Nrays)
+    {
+        corr_valid.resize(Nrays);
+    }
+
+    findCorrespondences(Tbms, dataset_points(0, Nrays), model_points(0, Nrays), corr_valid(0, Nrays) );
+}
+
 void SphereCorrectorOptix::compute_covs(
     const rmagine::MemoryView<rmagine::Transform, rmagine::VRAM_CUDA>& Tbms,
     CorrectionPreResults<rmagine::VRAM_CUDA>& res
@@ -260,73 +363,26 @@ SphereCorrectorOptix::Timings SphereCorrectorOptix::benchmark(
 /// PRIVATE
 void SphereCorrectorOptix::computeMeansCovsRW(
     const rm::MemoryView<rm::Transform, rm::VRAM_CUDA>& Tbm,
-    rm::MemoryView<rm::Vector, rm::VRAM_CUDA>& m1, // from, dataset
-    rm::MemoryView<rm::Vector, rm::VRAM_CUDA>& m2, // to, model
+    rm::MemoryView<rm::Vector, rm::VRAM_CUDA>& means_dataset, // from, dataset
+    rm::MemoryView<rm::Vector, rm::VRAM_CUDA>& means_model, // to, model
     rm::MemoryView<rm::Matrix3x3, rm::VRAM_CUDA>& Cs,
     rm::MemoryView<unsigned int, rm::VRAM_CUDA>& Ncorr
     ) const
 {
-    if(!m_map)
-    {
-        throw std::runtime_error("NO MAP");
-    }
-
-    if(!m_map->scene())
-    {
-        throw std::runtime_error("EMPTY MAP");
-    }
-
-    if(!m_map->scene()->as())
-    {
-        throw std::runtime_error("MAP SCENE NOT COMMITTED");
-    }
-
     size_t scanSize = m_width * m_height;
-    // size_t scanSize = m_model_ram->width * m_model_ram->height;
     size_t Nrays = Tbm.size() * scanSize;
 
     rm::Memory<unsigned int,    rm::VRAM_CUDA> corr_valid(Nrays);
     rm::Memory<rm::Vector, rm::VRAM_CUDA> model_points(Nrays);
     rm::Memory<rm::Vector, rm::VRAM_CUDA> dataset_points(Nrays);
 
-    rm::Memory<SphereCorrectionDataRW, rm::RAM> mem(1);
-    mem->model = m_model.raw();
-    mem->ranges = m_ranges.raw();
-    mem->Tsb = m_Tsb.raw();
-    mem->Tbm = Tbm.raw();
-    mem->Nposes = Tbm.size();
-    mem->params = m_params.raw();
-    mem->handle = m_map->scene()->as()->handle;
-    mem->corr_valid = corr_valid.raw();
-    mem->model_points = model_points.raw();
-    mem->dataset_points = dataset_points.raw();
-
-
-
-    rm::Memory<SphereCorrectionDataRW, rm::VRAM_CUDA> d_mem(1);
-    copy(mem, d_mem, m_stream);
-
-    rm::PipelinePtr program = make_pipeline_corr_rw(m_map->scene(), 0);
-
-    RM_OPTIX_CHECK( optixLaunch(
-        program->pipeline, 
-        m_stream->handle(), 
-        reinterpret_cast<CUdeviceptr>(d_mem.raw()), 
-        sizeof( SphereCorrectionDataRW ), 
-        program->sbt,
-        m_width, // width Xdim
-        m_height, // height Ydim
-        Tbm.size()// depth Zdim
-        ));
-
-    m_stream->synchronize();
-
+    findCorrespondences(Tbm, dataset_points, model_points, corr_valid);
 
     rm::sumBatched(corr_valid, Ncorr);
-    meanBatched(dataset_points, corr_valid, Ncorr, m1);
-    meanBatched(model_points, corr_valid, Ncorr, m2);
+    meanBatched(dataset_points, corr_valid, Ncorr, means_dataset);
+    meanBatched(model_points, corr_valid, Ncorr, means_model);
 
-    covFancyBatched(model_points, dataset_points,
+    covFancyBatched(dataset_points, model_points,
             corr_valid, Ncorr, Cs);
 }
 
diff --git a/src/rmcl/correction/optix/SphereCorrectProgramRW.cu b/src/rmcl/correction/optix/SphereCorrectProgramRW.cu
index 6cd3790..a0c6538 100644
--- a/src/rmcl/correction/optix/SphereCorrectProgramRW.cu
+++ b/src/rmcl/correction/optix/SphereCorrectProgramRW.cu
@@ -32,12 +32,9 @@ extern "C" __global__ void __raygen__rg()
     const rm::Transform Tsb = mem.Tsb[0];
     const rm::Transform Tbm = mem.Tbm[pid];
     const rm::Transform Tsm = Tbm * Tsb;
-    const rm::Quaternion Rmb = Tbm.R.inv();
-
+    const rm::Transform Tms = ~Tsm;
     
     const rm::Vector ray_dir_s = mem.model->getDirection(vid, hid);
-
-    const rm::Vector ray_dir_b = Tsb.R * ray_dir_s;
     const rm::Vector ray_dir_m = Tsm.R * ray_dir_s;
 
     unsigned int p0, p1, p2, p3;
@@ -60,38 +57,42 @@ extern "C" __global__ void __raygen__rg()
 
     if(real_range > range_max || sim_range > range_max || real_range < range_min)
     {
-        mem.corr_valid[glob_id] = 0;
-        mem.model_points[glob_id] = {0.0f, 0.0f, 0.0f};
         mem.dataset_points[glob_id] = {0.0f, 0.0f, 0.0f};
+        mem.model_points[glob_id] = {0.0f, 0.0f, 0.0f};
+        mem.corr_valid[glob_id] = 0;
         return;
     }
         
-    const rmagine::Vector preal_b = ray_dir_b * real_range;
-    const rmagine::Vector psim_b = ray_dir_b * sim_range;
+    const rm::Vector preal_s = ray_dir_s * real_range;
+    const rm::Vector psim_s = ray_dir_s * sim_range;
     
-    const rmagine::Vector nsim_m = {
+    rmagine::Vector nsim_m = {
         __uint_as_float(p1),
         __uint_as_float(p2),
         __uint_as_float(p3)
     };
 
-    rmagine::Vector nsim_b = Rmb * nsim_m;
+    nsim_m.normalizeInplace();
+
+    rm::Vector nsim_s = Tms.R * nsim_m;
 
-    if(ray_dir_b.dot(nsim_b) > 0.0)
+    if(ray_dir_s.dot(nsim_s) > 0.0)
     {
-        nsim_b = -nsim_b;
+        nsim_s = -nsim_s;
     }
 
-    const float signed_plane_dist = (preal_b - psim_b).dot(nsim_b);
-    const rmagine::Vector pnearest_b = preal_b + nsim_b * signed_plane_dist;
-    const float dist_sqrt = (pnearest_b - preal_b).l2normSquared();
+    const float signed_plane_dist = (psim_s - preal_s).dot(nsim_s);
+    const rm::Vector pnearest_s = preal_s + nsim_s * signed_plane_dist;
+    const float dist_sqrt = (pnearest_s - preal_s).l2normSquared();
 
     if(dist_sqrt < dist_thresh * dist_thresh)
     {
+        mem.dataset_points[glob_id] = Tsb * preal_s;
+        mem.model_points[glob_id] = Tsb * pnearest_s;
         mem.corr_valid[glob_id] = 1;
-        mem.model_points[glob_id] = pnearest_b;
-        mem.dataset_points[glob_id] = preal_b;
     } else {
+        mem.dataset_points[glob_id] = {0.0f, 0.0f, 0.0f};
+        mem.model_points[glob_id] = {0.0f, 0.0f, 0.0f};
         mem.corr_valid[glob_id] = 0;
     }
 }
@@ -121,8 +122,8 @@ extern "C" __global__ void __closesthit__ch()
     }
 
     const float3 normal = make_float3(
-        mesh_data->face_normals[face_id].x, 
-        mesh_data->face_normals[face_id].y, 
+        mesh_data->face_normals[face_id].x,
+        mesh_data->face_normals[face_id].y,
         mesh_data->face_normals[face_id].z);
     float3 normal_world = optixTransformNormalFromObjectToWorldSpace(normal);
 
diff --git a/src/rmcl/math/math.cpp b/src/rmcl/math/math.cpp
index de48879..e102c6c 100644
--- a/src/rmcl/math/math.cpp
+++ b/src/rmcl/math/math.cpp
@@ -45,7 +45,7 @@ void Correction::correction_from_covs(
             Transform T;
             T.R.set(U * S * V.transpose());
             T.R.normalizeInplace();
-            T.t = ds[pid] - T.R * ms[pid];
+            T.t = ms[pid] - T.R * ds[pid];
 
             Tdelta[pid] = T;
         } else {
@@ -82,7 +82,7 @@ void Correction::correction_from_covs(
             R.set(U * S * V.transpose());
             R.normalizeInplace();
             Rdelta[pid] = R;
-            tdelta[pid] = ds[pid] - R * ms[pid];
+            tdelta[pid] = ms[pid] - R * ds[pid];
         } else {
             Rdelta[pid].setIdentity();
             tdelta[pid] = {0.0, 0.0, 0.0};
diff --git a/src/rmcl/math/math.cu b/src/rmcl/math/math.cu
index 1d932c8..20cd4a1 100644
--- a/src/rmcl/math/math.cu
+++ b/src/rmcl/math/math.cu
@@ -51,39 +51,6 @@ void CorrectionCuda::correction_from_covs(
     rm::Memory<rm::Vector, rm::VRAM_CUDA> Ss(Cs.size());
 
     m_svd->calcUSV(Cs, Us, Ss, Vs);
-    
-    // debug
-    // {
-    //     // Memory<Matrix3x3, RAM> Cs_ = Cs;
-    //     // Memory<Matrix3x3, RAM> Us_(Cs.size());
-    //     // Memory<Matrix3x3, RAM> Vs_(Cs.size());
-
-    //     // static SVD svd;
-    //     // svd.calcUV(Cs_, Us_, Vs_);
-
-    //     // Us = Us_;
-    //     // Vs = Vs_;
-    //     Memory<Matrix3x3, RAM> Cs_       = Cs;
-    //     Memory<Matrix3x3, RAM> Us_       = Us;
-    //     Memory<Vector, RAM> ss_          = ss;
-    //     Memory<Matrix3x3, RAM> Vs_       = Vs;
-    //     Memory<unsigned int, RAM> Ncorr_ = Ncorr;
-
-    //     std::cout << "C:" << std::endl;
-    //     std::cout << Cs_[0] << std::endl;
-
-    //     std::cout << "U:" << std::endl;
-    //     std::cout << Us_[0] << std::endl;
-    //     std::cout << "s:" << std::endl;
-    //     std::cout << ss_[0] << std::endl;
-    //     std::cout << "V:" << std::endl;
-    //     std::cout << Vs_[0] << std::endl;
-    //     std::cout << "Ncorr: " << std::endl;
-    //     std::cout << Ncorr_[0] << std::endl;
-        
-
-    // }
-
     compute_transform(Us, Vs, ds, ms, Tdelta);
 }
 
@@ -102,7 +69,7 @@ void CorrectionCuda::correction_from_covs(
     rm::transposeInplace(Vs);
 
     rm::multNxN(Us, Vs, Rdelta);
-    rm::subNxN(ds, rm::multNxN(Rdelta, ms), tdelta);
+    rm::subNxN(ms, rm::multNxN(Rdelta, ds), tdelta);
 }
 
 void CorrectionCuda::correction_from_covs(
@@ -120,8 +87,6 @@ Memory<Transform, VRAM_CUDA> CorrectionCuda::correction_from_covs(
     return Tdelta;
 }
 
-
-
 __global__
 void compute_transform_kernel(
     const rm::Matrix3x3* Us,
@@ -151,7 +116,7 @@ void compute_transform_kernel(
         // computation
         T.R.set(U * S * V.transpose());
         T.R.normalizeInplace();
-        T.t = d - T.R * m;
+        T.t = m - T.R * d;
 
         // write
         dT[pid] = T;
diff --git a/src/rmcl/math/math_batched.cu b/src/rmcl/math/math_batched.cu
index 53ca879..dfc5968 100644
--- a/src/rmcl/math/math_batched.cu
+++ b/src/rmcl/math/math_batched.cu
@@ -206,9 +206,10 @@ __global__ void covFancyBatched_kernel(
         {
             if(mask[globId + blockSize * i] > 0)
             {
-                const rm::Vector a = data1[globId + blockSize * i];
-                const rm::Vector b = data2[globId + blockSize * i];
-                sdata[tid] += b.multT(a);
+                // dataset: d, model: m
+                const rm::Vector d = data1[globId + blockSize * i];
+                const rm::Vector m = data2[globId + blockSize * i];
+                sdata[tid] += m.multT(d);
             }
         }
     }