diff --git a/slam/algorithms/nice_slam.py b/slam/algorithms/nice_slam.py
index 672dcce..2a0b674 100644
--- a/slam/algorithms/nice_slam.py
+++ b/slam/algorithms/nice_slam.py
@@ -43,6 +43,8 @@ class NiceSLAMConfig(AlgorithmConfig):
     mapping_lr_factor: float = 1.0
     mapping_lr_first_factor: float = 5.0
 
+    mapping_color_refine: bool = True
+
 
 class NiceSLAM(Algorithm):
 
@@ -67,6 +69,47 @@ def __init__(self, config: NiceSLAMConfig, camera: Camera,
 
         self.cur_mesh = None
 
+    def do_mapping(self, cur_frame):
+        if not self.is_initialized():  # first call: use the initial budget
+            mapping_n_iters = self.config.mapping_first_n_iters
+        else:
+            mapping_n_iters = self.config.mapping_n_iters
+
+        # final frame only (if enabled): set up the color-refinement pass
+        if cur_frame.is_final_frame and self.config.mapping_color_refine:
+            outer_joint_iters = 5  # repeat the joint optimization 5 times
+            self.config.mapping_window_size *= 2  # in place, not restored
+            self.config.mapping_middle_iter_ratio = 0.0
+            self.config.mapping_fine_iter_ratio = 0.0
+            self.model.config.mapping_fix_color = True
+            self.model.config.mapping_frustum_feature_selection = False
+        else:
+            outer_joint_iters = 1  # regular frame: a single pass
+
+        for _ in range(outer_joint_iters):
+            # pick the frames to optimize; selection runs without gradients
+            with torch.no_grad():
+                optimize_frames = self.select_optimize_frames(
+                    cur_frame,
+                    keyframe_selection_method=self.config.
+                    keyframe_selection_method)
+            # optimize keyframe poses and model params (non-coarse pass)
+            self.optimize_update(mapping_n_iters,
+                                 optimize_frames,
+                                 is_mapping=True,
+                                 coarse=False)
+
+        # coarse mapping pass over randomly selected frames
+        optimize_frames = self.select_optimize_frames(
+            cur_frame, keyframe_selection_method='random')
+        self.optimize_update(mapping_n_iters,
+                             optimize_frames,
+                             is_mapping=True,
+                             coarse=True)
+
+        if not self.is_initialized():  # mark the first mapping as done
+            self.set_initialized()
+
     def optimizer_config_update(self, max_iters, coarse=False):
         if len(self.keyframe_graph) > 4 and not coarse:
             self.bundle_adjust = True
diff --git a/slam/algorithms/voxfusion.py b/slam/algorithms/voxfusion.py
index acf739f..675bfc5 100644
--- a/slam/algorithms/voxfusion.py
+++ b/slam/algorithms/voxfusion.py
@@ -179,7 +179,7 @@ def get_mesh(self):
     @torch.no_grad()
     def extract_mesh(self, res=8, clean_mesh=False, require_color=False):
         # get map states
-        voxels, _, features, leaf_num = self.model.octree.get_all()
+        voxels, _, features = self.model.svo.get_centres_and_children()
         index = features.eq(-1).any(-1)
         voxels = voxels[~index, :]
         features = features[~index, :]
diff --git a/slam/common/frame.py b/slam/common/frame.py
index 2e8b261..35b5bd2 100644
--- a/slam/common/frame.py
+++ b/slam/common/frame.py
@@ -24,6 +24,7 @@ def __init__(self,
         self.gt_pose = gt_pose
         self.separate_LR = separate_LR
         self.rot_rep = rot_rep
+        self.is_final_frame = False
 
         if init_pose is not None:
             pose = torch.tensor(init_pose,
diff --git a/slam/configs/input_config.py b/slam/configs/input_config.py
index 82bbc10..d8aea8c 100644
--- a/slam/configs/input_config.py
+++ b/slam/configs/input_config.py
@@ -159,12 +159,12 @@
                               use_relative_pose=True,
                               save_debug_result=False,
                               init_pose_offset=10),
-        mapper=MapperConfig(keyframe_every=10, ),
+        mapper=MapperConfig(keyframe_every=50, ),
         algorithm=VoxFusionConfig(
-            # keyframe_selection_algorithm='random',
+            keyframe_selection_method='random',
             tracking_n_iters=30,
             mapping_n_iters=15,  # 30
-            mapping_first_n_iters=100,
+            mapping_first_n_iters=30,
             mapping_window_size=5,
             mapping_sample=1024,
             tracking_sample=1024,
diff --git a/slam/models/conv_onet.py b/slam/models/conv_onet.py
index da3f76d..8e5826b 100644
--- a/slam/models/conv_onet.py
+++ b/slam/models/conv_onet.py
@@ -169,12 +169,12 @@ def get_param_groups(self) -> Dict[str, List[Parameter]]:
             decoders_para_list += list(self.decoder.fine_decoder.parameters())
         if not self.config.mapping_fix_color:
             decoders_para_list += list(self.decoder.color_decoder.parameters())
-        param_groups['decoder'] = decoders_para_list
+        if len(decoders_para_list) > 0:
+            param_groups['decoder'] = decoders_para_list
         # grid_params
         for key, grid in self.grid_c.items():
             grid = grid.to(self.device)
-            if (self.config.mapping_frustum_feature_selection
-                    and not self.config.coarse):
+            if self.config.mapping_frustum_feature_selection:
                 mask = self.grid_opti_mask[key]
                 grid.set_mask(mask)
             param_groups[key] = list(grid.parameters())
diff --git a/slam/models/sparse_voxel.py b/slam/models/sparse_voxel.py
index b95024e..3696c5f 100644
--- a/slam/models/sparse_voxel.py
+++ b/slam/models/sparse_voxel.py
@@ -29,7 +29,7 @@ def find_so_files(directory):
 
 
 search_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)),
-                                '../../third_party/sparse_octforest/build/')
+                                '../../third_party/sparse_octree/build/')
 so_files = find_so_files(search_directory)
 for so_file in so_files:
     torch.classes.load_library(so_file)
@@ -304,7 +304,8 @@ def sdf2weights(self, sdf, z_vals, valid_mask):
                           1e-8), z_min
 
     def get_octree(self):
-        self.octree = torch.classes.forest.Octree(self.config.voxels_each_dim)
+        self.svo = torch.classes.svo.Octree()
+        self.svo.init(256, self.config.embed_dim, self.config.voxel_size)
         self.embeddings = torch.nn.Parameter(torch.zeros(
             (self.config.num_embeddings, self.config.embed_dim),
             dtype=torch.float32,
@@ -327,18 +328,13 @@ def insert_points(self, points):
         voxels = torch.div(points,
                            self.config.voxel_size,
                            rounding_mode='floor')
-        voxels = torch.unique(voxels.cpu().int(), sorted=False, dim=0)
-        # here, voxels.cpu().int() and (voxels.cpu().int()[:, None]).view(-1,3)
-        # has the same shape: [N_points, 3]
-        # i think we can remove repeated voxel ids to reduce insert time and
-        # torch loading time for svo.
-        self.octree.insert(voxels)
+        self.svo.insert(voxels.cpu().int())
         self.update_map_states()
 
     @torch.enable_grad()
     def update_map_states(self):
         """This function is modified from voxfusion."""
-        voxels, children, features, leaf_num = self.octree.get_all()
+        voxels, children, features = self.svo.get_centres_and_children()
         centres = (voxels[:, :3] + voxels[:, -1:] / 2) * self.config.voxel_size
         children = torch.cat([children, voxels[:, -1:]], -1)
 
diff --git a/slam/pipeline/tracker.py b/slam/pipeline/tracker.py
index 52946b3..e29d231 100644
--- a/slam/pipeline/tracker.py
+++ b/slam/pipeline/tracker.py
@@ -290,7 +290,10 @@ def check_mapframe(self, check_frame, map_buffer):
         else:
             map_every = self.config.map_every
         # send to mapper
-        if check_frame.fid % map_every == 0:
+        if check_frame.fid % map_every == 0 or check_frame.fid == len(
+                self.dataset) - 1:
+            check_frame.is_final_frame = (
+                check_frame.fid == len(self.dataset) - 1)
             map_buffer.put(check_frame, block=True)
             return True
         return False
diff --git a/third_party/install.sh b/third_party/install.sh
index 2c39954..25a59b1 100755
--- a/third_party/install.sh
+++ b/third_party/install.sh
@@ -1,6 +1,6 @@
 #!/bin/bash
 
-cd ./sparse_octforest/
+cd ./sparse_octree/
 python setup.py install
 
 cd ../sparse_voxels/
diff --git a/third_party/sparse_octforest/include/cuda_utils.h b/third_party/sparse_octforest/include/cuda_utils.h
deleted file mode 100644
index cb8c988..0000000
--- a/third_party/sparse_octforest/include/cuda_utils.h
+++ /dev/null
@@ -1,41 +0,0 @@
-#ifndef _CUDA_UTILS_H
-#define _CUDA_UTILS_H
-
-#include <torch/extension.h>
-#include <cuda_runtime.h>
-
-#define TOTAL_THREADS 512
-#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor")
-#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous")
-#define CHECK_IS_INT(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Int, #x " must be an int tensor")
-#define CHECK_IS_FLOAT(x) TORCH_CHECK(x.scalar_type() == at::ScalarType::Float, #x " must be an float tensor")
-#define CUDA_CHECK_ERRORS()                                           \
-  {                                                                   \
-    cudaError_t err = cudaGetLastError();                             \
-    if (cudaSuccess != err)                                           \
-    {                                                                 \
-      fprintf(stderr, "CUDA kernel failed : %s\n%s at L:%d in %s\n",  \
-              cudaGetErrorString(err), __PRETTY_FUNCTION__, __LINE__, \
-              __FILE__);                                              \
-      exit(-1);                                                       \
-    }                                                                 \
-  }
-
-inline int imax(int a, int b)
-{
-    return a > b ? a : b;
-}
-
-inline int imin(int a, int b)
-{
-    return a < b ? a : b;
-}
-
-inline int opt_n_threads(int work_size)
-{
-  const int pow_2 = std::log(static_cast<double>(work_size)) / std::log(2.0);
-
-  return imax(imin(1 << pow_2, TOTAL_THREADS), 1);
-}
-
-#endif
diff --git a/third_party/sparse_octforest/include/octree.h b/third_party/sparse_octforest/include/octree.h
deleted file mode 100644
index d8605e5..0000000
--- a/third_party/sparse_octforest/include/octree.h
+++ /dev/null
@@ -1,87 +0,0 @@
-#ifndef _OCTREE_H
-#define _OCTREE_H
-
-#include <memory>
-#include <vector>
-#include <torch/script.h>
-#include <torch/custom_class.h>
-
-
-//                _____
-//              /____ /|
-//             |     | |
-//     (0,0,0) |_____|/
-//              side
-// voxel coordinated defined on (0,0,0)
-// side indicate the voxel length at current level
-// point inside voxel would cast to int at (0,0,0)
-
-class Octant : public torch::CustomClassHolder
-{
-public:
-    inline Octant(int octree_index, int octant_index)
-    {
-        octree_index_ = octree_index;
-        octant_index_ = octant_index;
-        code_ = 0;
-        side_ = 0;
-        depth_ = -1;
-        is_leaf_ = false;
-        point_cnt_ = 0; // record how many points inside voxel
-        child_ptr_ = std::vector<std::shared_ptr<Octant>>(8, nullptr);
-        // std::cout << "[Debug][Octant] create octant in octree: " << octree_index_ << ", with index: " << octant_index_ <<std::endl;
-    }
-    ~Octant() {}
-    std::shared_ptr<Octant>& child(const int offset)
-    {
-        return child_ptr_[offset];
-    }
-
-    int octree_index_;
-    int octant_index_;
-    uint64_t code_;
-    unsigned int side_;
-    int point_cnt_;
-    int depth_;
-    bool is_leaf_;
-    std::vector<std::shared_ptr<Octant>> child_ptr_;
-};
-
-
-class Octree : public torch::CustomClassHolder
-{
-public:
-    ~Octree();
-    Octree(int64_t max_voxels);
-    void insert(torch::Tensor points);
-    double try_insert(torch::Tensor pts);
-
-    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, int64_t> get_all();
-    bool has_voxels(torch::Tensor points);
-    torch::Tensor get_features(torch::Tensor points);
-
-
-    int size_;
-    // temporal solution for serialization
-    std::vector<torch::Tensor> all_pts_;
-    Octree(int64_t grid_dim, std::vector<torch::Tensor> all_pts);
-
-private:
-    std::pair<int64_t, int64_t> count_nodes_internal();
-    std::pair<int64_t, int64_t> count_recursive_internal(std::shared_ptr<Octant> n);
-    std::shared_ptr<Octant> find_octant(std::vector<int> coord);
-    std::shared_ptr<Octant> find_octant(int x, int y, int z);
-
-    int octree_idx_;
-    int octant_cnt_;
-    static int octree_cnt_;
-    std::shared_ptr<Octant> root_;
-    std::set<uint64_t> all_codes;
-
-    int max_level_;
-};
-
-torch::Tensor get_features_cuda(torch::Tensor points,
-                                torch::Tensor children);
-
-#endif
diff --git a/third_party/sparse_octforest/setup.py b/third_party/sparse_octforest/setup.py
deleted file mode 100644
index aac2591..0000000
--- a/third_party/sparse_octforest/setup.py
+++ /dev/null
@@ -1,28 +0,0 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-import glob
-import os
-
-from setuptools import setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
-
-current_dir = os.getcwd()
-_ext_sources = glob.glob('src/*.cpp') + glob.glob('src/*.cu')
-include_path = os.path.join(current_dir, 'include')
-
-setup(name='forest',
-      ext_modules=[
-          CUDAExtension(
-              name='forest',
-              sources=_ext_sources,
-              include_dirs=[include_path],
-              extra_compile_args={
-                  'cxx': ['-O2', '-I./include'],
-                  'nvcc': ['-O2', '-I./include'],
-              },
-          )
-      ],
-      cmdclass={'build_ext': BuildExtension})
diff --git a/third_party/sparse_octforest/src/bindings.cpp b/third_party/sparse_octforest/src/bindings.cpp
deleted file mode 100644
index 4179995..0000000
--- a/third_party/sparse_octforest/src/bindings.cpp
+++ /dev/null
@@ -1,24 +0,0 @@
-#include "octree.h"
-
-TORCH_LIBRARY(forest, m)
-{
-    // m.def("encode", &encode_torch);
-    m.class_<Octree>("Octree")
-        .def(torch::init<int64_t>())
-        .def("insert", &Octree::insert)
-        .def("try_insert", &Octree::try_insert)
-        .def("has_voxels", &Octree::has_voxels)
-        .def("get_features", &Octree::get_features)
-        .def("get_all", &Octree::get_all)
-        .def_pickle(
-        // __getstate__
-        [](const c10::intrusive_ptr<Octree>& self) -> std::tuple<int64_t, std::vector<torch::Tensor>> {
-            return std::make_tuple(self->size_, self->all_pts_);
-        },
-        // __setstate__
-        [](std::tuple<int64_t, std::vector<torch::Tensor>> state) {
-            return c10::make_intrusive<Octree>(std::get<0>(state), std::get<1>(state));
-        })
-        ;
-    m.def("get_features_cuda", &get_features_cuda);
-}
diff --git a/third_party/sparse_octforest/src/octree.cpp b/third_party/sparse_octforest/src/octree.cpp
deleted file mode 100644
index 7d5b981..0000000
--- a/third_party/sparse_octforest/src/octree.cpp
+++ /dev/null
@@ -1,414 +0,0 @@
-#include "octree.h"
-#include "utils.h"
-#include "cuda_utils.h"
-#include <queue>
-#include <iostream>
-#include <bitset>
-
-const int incr_x[8] = {0, 0, 0, 0, 1, 1, 1, 1};
-const int incr_y[8] = {0, 0, 1, 1, 0, 0, 1, 1};
-const int incr_z[8] = {0, 1, 0, 1, 0, 1, 0, 1};
-
-int Octree::octree_cnt_ = 0;
-
-Octree::~Octree()
-{
-}
-
-Octree::Octree(int64_t max_voxels)
-{
-    if (max_voxels < 1)
-    {
-        std::cout << "[Error][Octree] max voxels should greater than 0 !!!" << std::endl;
-        return;
-    }
-
-    octree_idx_ = Octree::octree_cnt_++;
-    octant_cnt_ = 0;
-
-    size_ = max_voxels;
-    max_level_ = log2(size_); // root level is 0
-    if (max_level_ >= MAX_BITS)
-    {
-        std::cout << "[Error][Octree] max level should less than " << MAX_BITS-1 << " !!!" << std::endl;
-        return;
-    }
-
-    root_ = std::make_shared<Octant>(octree_idx_, octant_cnt_++);
-    root_->side_ = size_;
-    // std::cout << "[Debug][Octree] create new octree: " << octree_idx_ << std::endl;
-}
-
-void Octree::insert(torch::Tensor pts)
-{
-    bool create_new_node = false;
-
-    if (root_ == nullptr)
-    {
-        std::cout << "[Error][Octree] Octree not initialized !!!" << std::endl;
-        return;
-    }
-
-    auto points = pts.accessor<int, 2>(); // (P, 3)
-    if (points.size(1) != 3)
-    {
-        std::cout << "[Error][Octree] Point dimensions mismatch: inputs are " << points.size(1) << " expect 3 !!!" << std::endl;
-        return;
-    }
-
-    for (int i = 0; i < points.size(0); ++i)
-    {
-        for (int j = 0; j < 8; ++j)
-        {
-            // compute morton code
-            int x = points[i][0] + incr_x[j];
-            int y = points[i][1] + incr_y[j];
-            int z = points[i][2] + incr_z[j];
-            uint64_t code = encode(x, y, z);
-            // std::cout << "[Debug][Octree] xyz: (" << x << ", " << y << ", " << z << "), morton code is: " << std::bitset<sizeof(uint64_t) * 8>(code) << std::endl;
-            all_codes.insert(code);
-
-            const unsigned int shift = MAX_BITS - max_level_ - 1;
-            // std::cout << "[Debug][Octree] shift: " << shift << std::endl;
-
-            auto n = root_;
-            unsigned edge = size_ / 2;
-            for (int d = 1; d <= max_level_; edge /= 2, ++d)
-            {
-                const int childid = ((x & edge) > 0) + 2 * ((y & edge) > 0) + 4 * ((z & edge) > 0);
-                auto& tmp = n->child(childid);
-                if (tmp == nullptr)
-                {
-                    const uint64_t t_code = code & MASK[d + shift];
-                    // std::cout << "[Debug][Octree] morton code is: " << std::bitset<sizeof(uint64_t) * 8>(t_code) << " - " << octant_cnt_ << std::endl;
-                    tmp = std::make_shared<Octant>(octree_idx_, octant_cnt_++);
-                    tmp->code_ = t_code;
-                    tmp->side_ = edge;
-                    tmp->is_leaf_ = (d == max_level_);
-                    // std::cout << "[Debug][Octree] create octant at level: " << d << ", with side: " << tmp->side_ << ", is leaf: " << tmp->is_leaf_ <<std::endl;
-                    create_new_node = true;
-                }
-                if (j == 0)
-                {
-                    ++(tmp->point_cnt_);
-                    // std::cout << "[Debug][Octree] child " << childid << " with points " << n->child(childid)->point_cnt_ << std::endl;
-                }
-
-                n = tmp;
-            }
-        }
-    }
-
-    // temporal solution for serialization
-    if (create_new_node)
-        all_pts_.push_back(pts);
-}
-
-double Octree::try_insert(torch::Tensor pts)
-{
-    if (root_ == nullptr)
-    {
-        std::cout << "Octree not initialized!" << std::endl;
-    }
-
-    auto points = pts.accessor<int, 2>();
-    if (points.size(1) != 3)
-    {
-        std::cout << "Point dimensions mismatch: inputs are " << points.size(1) << " expect 3" << std::endl;
-        return -1.0;
-    }
-
-    std::set<uint64_t> tmp_codes;
-
-    for (int i = 0; i < points.size(0); ++i)
-    {
-        for (int j = 0; j < 8; ++j)
-        {
-            int x = points[i][0] + incr_x[j];
-            int y = points[i][1] + incr_y[j];
-            int z = points[i][2] + incr_z[j];
-            uint64_t key = encode(x, y, z);
-
-            tmp_codes.insert(key);
-        }
-    }
-
-    std::set<int> result;
-    std::set_intersection(all_codes.begin(), all_codes.end(),
-                          tmp_codes.begin(), tmp_codes.end(),
-                          std::inserter(result, result.end()));
-
-    double overlap_ratio = 1.0 * result.size() / tmp_codes.size();
-    return overlap_ratio;
-}
-
-
-
-std::shared_ptr<Octant> Octree::find_octant(int x, int y, int z)
-{
-    auto n = root_;
-    unsigned edge = size_ / 2;
-    for (int d = 1; d <= max_level_; edge /= 2, ++d)
-    {
-        const int childid = ((x & edge) > 0) + 2 * ((y & edge) > 0) + 4 * ((z & edge) > 0);
-        auto tmp = n->child(childid);
-        if (tmp == nullptr)
-        {
-            std::cout << "[Error][Octree] voxel not found at " << x << ", " << y << ", " << z << " !!!" << std::endl;
-            return nullptr;
-        }
-        n = tmp;
-    }
-    return n;
-}
-
-std::shared_ptr<Octant> Octree::find_octant(std::vector<int> coord)
-{
-    int x = coord[0];
-    int y = coord[1];
-    int z = coord[2];
-
-    return find_octant(x, y, z);
-}
-
-std::pair<int64_t, int64_t> Octree::count_nodes_internal()
-{
-    return count_recursive_internal(root_);
-}
-
-std::pair<int64_t, int64_t> Octree::count_recursive_internal(std::shared_ptr<Octant> n)
-{
-    if (n == nullptr)
-        return std::make_pair<int64_t, int64_t>(0, 0);
-
-    if (n->is_leaf_)
-        return std::make_pair<int64_t, int64_t>(1, 1);
-
-    auto sum = std::make_pair<int64_t, int64_t>(1, 0);
-
-    for (int i = 0; i < 8; i++)
-    {
-        auto temp = count_recursive_internal(n->child(i));
-        sum.first += temp.first;
-        sum.second += temp.second;
-    }
-
-    return sum;
-}
-
-std::tuple<torch::Tensor, torch::Tensor, torch::Tensor, int64_t> Octree::get_all()
-{
-    auto node_count = count_nodes_internal();
-    auto total_count = node_count.first;
-    auto leaf_count = node_count.second;
-    // std::cout << "[Debug][Octree] count: " << node_count.first << " nodes with " << node_count.second << " leaves" <<std::endl;
-    auto all_voxels = torch::zeros({total_count, 4}, dtype(torch::kInt32));
-    auto all_children = -torch::ones({total_count, 8}, dtype(torch::kInt32));
-    auto all_features = -torch::ones({total_count, 8}, dtype(torch::kInt32));
-
-    std::queue<std::shared_ptr<Octant>> all_nodes;
-    all_nodes.push(root_);
-
-    while (!all_nodes.empty())
-    {
-        auto node_ptr = all_nodes.front();
-        all_nodes.pop();
-
-        auto xyz = decode(node_ptr->code_);
-        std::vector<int> coords = {static_cast<int>(xyz[0]),
-                                    static_cast<int>(xyz[1]),
-                                    static_cast<int>(xyz[2]),
-                                    static_cast<int>(node_ptr->side_)};
-        auto voxel = torch::from_blob(coords.data(), {4}, dtype(torch::kInt32));
-        all_voxels[node_ptr->octant_index_] = voxel;
-
-        if (node_ptr->is_leaf_)
-        {
-            for (int i = 0; i < 8; ++i)
-            {
-                std::vector<int> vcoords = coords;
-                vcoords[0] += incr_x[i];
-                vcoords[1] += incr_y[i];
-                vcoords[2] += incr_z[i];
-                auto octant = find_octant(vcoords);
-                all_features[node_ptr->octant_index_][i] = octant->octant_index_;
-            }
-        }
-
-        for (int i = 0; i < 8; i++)
-        {
-            auto& child_ptr = node_ptr->child(i);
-            if (child_ptr != nullptr)
-            {
-                all_children[node_ptr->octant_index_][i] = child_ptr->octant_index_;
-                if (child_ptr->point_cnt_ > 0)
-                {
-                    // std::cout << "[Debug][Octree] child " << i << " with points " << child_ptr->point_cnt_ << std::endl;
-                    all_nodes.push(child_ptr);
-                }
-            }
-        }
-    }
-    // return std::make_tuple(all_voxels, all_children, all_features);
-    return std::make_tuple(all_voxels, all_children, all_features, leaf_count); // return the number of leaf
-}
-
-// temporal solution for serialization
-Octree::Octree(int64_t max_voxels, std::vector<torch::Tensor> all_pts)
-{
-    if (max_voxels < 1)
-    {
-        std::cout << "[Error][Octree] max voxels should greater than 0 !!!" << std::endl;
-        return;
-    }
-
-    octree_idx_ = Octree::octree_cnt_++;
-    octant_cnt_ = 0;
-
-    size_ = max_voxels;
-    max_level_ = log2(size_); // root level is 0
-    if (max_level_ >= MAX_BITS)
-    {
-        std::cout << "[Error][Octree] max level should less than " << MAX_BITS-1 << " !!!" << std::endl;
-        return;
-    }
-
-    root_ = std::make_shared<Octant>(octree_idx_, octant_cnt_++);
-    root_->side_ = size_;
-    std::cout << "[Debug][Octree] create new octree: " << octree_idx_ << std::endl;
-    std::cout << "[Debug][Octree] serialization info: max_voxels:" << max_voxels << " vector<pts> size:" << all_pts.size() << std::endl;
-
-    // for (auto &pt : all_pts_)
-    for (auto &pt : all_pts)
-    {
-        insert(pt);
-        std::cout << "[Debug][Octree] serialization insert points." << std::endl;
-    }
-}
-
-
-torch::Tensor Octree::get_features(torch::Tensor pts)
-{
-    auto points = pts.accessor<int, 2>(); // (P, 3)
-    int total_points = points.size(0);
-    auto all_features = -torch::ones({total_points, 8}, dtype(torch::kInt32));
-
-    if (root_ == nullptr)
-    {
-        std::cout << "[Error][Octree] Octree not initialized !!!" << std::endl;
-        return all_features;
-    }
-
-    if (points.size(1) != 3)
-    {
-        std::cout << "[Error][Octree] Point dimensions mismatch: inputs are " << points.size(1) << " expect 3 !!!" << std::endl;
-        return all_features;
-    }
-
-    for (int i = 0; i < total_points; ++i)
-    {
-        int x = points[i][0];
-        int y = points[i][1];
-        int z = points[i][2];
-
-        auto n = root_;
-        unsigned edge = size_ / 2;
-        for (int d = 1; d <= max_level_; edge /= 2, ++d)
-        {
-            const int childid = ((x & edge) > 0) + 2 * ((y & edge) > 0) + 4 * ((z & edge) > 0);
-            auto tmp = n->child(childid);
-            if (tmp == nullptr)
-            {
-                std::cout << "[Error][Octree] voxel not found at " << x << ", " << y << ", " << z << " !!!" << std::endl;
-                break;
-            }
-            n = tmp;
-        }
-
-        if (n != nullptr && n->is_leaf_ && n->point_cnt_ > 0) {
-            for (int j = 0; j < 8; ++j)
-            {
-                int tx = x + incr_x[j];
-                int ty = y + incr_y[j];
-                int tz = z + incr_z[j];
-                auto octant = find_octant(tx, ty, tz);
-                all_features[i][j] = octant->octant_index_;
-            }
-        }
-    }
-    return all_features;
-}
-
-
-bool Octree::has_voxels(torch::Tensor pts)
-{
-    if (root_ == nullptr)
-    {
-        std::cout << "[Error][Octree] Octree not initialized !!!" << std::endl;
-        return false;
-    }
-
-    auto points = pts.accessor<int, 2>(); // (P, 3)
-    if (points.size(1) != 3)
-    {
-        std::cout << "[Error][Octree] Point dimensions mismatch: inputs are " << points.size(1) << " expect 3 !!!" << std::endl;
-        return false;
-    }
-
-    int total_points = points.size(0);
-    for (int i = 0; i < total_points; ++i)
-    {
-        int x = points[i][0];
-        int y = points[i][1];
-        int z = points[i][2];
-
-        auto n = root_;
-        unsigned edge = size_ / 2;
-        for (int d = 1; d <= max_level_; edge /= 2, ++d)
-        {
-            const int childid = ((x & edge) > 0) + 2 * ((y & edge) > 0) + 4 * ((z & edge) > 0);
-            auto tmp = n->child(childid);
-            if (tmp == nullptr)
-            {
-                std::cout << "[Error][Octree] voxel not found at " << x << ", " << y << ", " << z << " !!!" << std::endl;
-                break;
-            }
-            n = tmp;
-        }
-
-        if (n != nullptr && n->is_leaf_ && n->point_cnt_ > 0) {
-            return true;
-        }
-    }
-    return false;
-}
-
-void get_features_kernel_wrapper(const int *points,
-    const int *children,
-    int *features,
-    const int num_batch,
-    const int num_sample);
-
-torch::Tensor get_features_cuda(torch::Tensor points,
-                                torch::Tensor children)
-{
-
-    CHECK_CONTIGUOUS(points);
-    CHECK_CONTIGUOUS(children);
-    CHECK_IS_INT(points);
-    CHECK_IS_INT(children);
-    CHECK_CUDA(points);
-    CHECK_CUDA(children);
-
-    torch::Tensor features = torch::full(
-                        {points.size(0), points.size(1), 1},
-                        -1,
-                        torch::device(points.device()).dtype(torch::ScalarType::Int)
-                        );
-    get_features_kernel_wrapper(points.data_ptr<int>(),
-                                children.data_ptr<int>(),
-                                features.data_ptr<int>(),
-                                points.size(0),
-                                points.size(1));
-    return features;
-}
diff --git a/third_party/sparse_octforest/src/octree_cuda.cu b/third_party/sparse_octforest/src/octree_cuda.cu
deleted file mode 100644
index f84a8f8..0000000
--- a/third_party/sparse_octforest/src/octree_cuda.cu
+++ /dev/null
@@ -1,49 +0,0 @@
-#include "cuda_utils.h"
-#include <ATen/ATen.h>
-#include <ATen/Context.h>
-#include <ATen/cuda/CUDAContext.h>
-
-__global__ void get_features_kernel(
-    const int *__restrict__ points, //(b, n, 3)
-    const int *__restrict__ children,
-    int *__restrict__ features,
-    const int num_batch,
-    const int num_sample)
-{
-    int n = num_sample;
-    int b = blockIdx.x;
-    int p = threadIdx.x;
-
-    int cnt = 0;
-
-    points += b * n * 3 + p * 3;
-    features += b * n + p;
-    int x = points[0];
-    int y = points[1];
-    int z = points[2];
-
-    int size = children[8];
-
-    unsigned edge = size / 2;
-    int d = 0;
-    while(children[d * 9 + 8] != 1) {
-        int childid = ((x & edge) > 0) + 2 * ((y & edge) > 0) + 4 * ((z & edge) > 0);
-        if (children[d * 9 + childid] == -1) {
-            return;
-        }
-        d = children[d * 9 + childid];
-        edge /= 2;
-    }
-    features[0] = d;
-}
-
-void get_features_kernel_wrapper(const int *points,
-    const int *children,
-    int *features,
-    const int num_batch,
-    const int num_sample)
-{
-    cudaStream_t stream = at::cuda::getCurrentCUDAStream();
-    get_features_kernel<<<num_batch, num_sample, 0, stream>>>(points, children, features, num_batch, num_sample);
-    CUDA_CHECK_ERRORS();
-}
diff --git a/third_party/sparse_octree/include/octree.h b/third_party/sparse_octree/include/octree.h
new file mode 100644
index 0000000..28d8331
--- /dev/null
+++ b/third_party/sparse_octree/include/octree.h
@@ -0,0 +1,137 @@
+#pragma once
+
+#include <cstdint>
+#include <memory>
+#include <set>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <torch/script.h>
+#include <torch/custom_class.h>
+
+enum OcType
+{
+    NONLEAF = -1,
+    SURFACE = 0,
+    FEATURE = 1
+};
+
+class Octant : public torch::CustomClassHolder // one node of the octree
+{
+public:
+    inline Octant()
+    {
+        code_ = 0;
+        side_ = 0;
+        index_ = next_index_++; // take the next value of the shared counter
+        depth_ = -1;
+        is_leaf_ = false;
+        children_mask_ = 0;
+        type_ = NONLEAF;
+        for (unsigned int i = 0; i < 8; i++)
+        {
+            child_ptr_[i] = nullptr;
+            // a new node starts with all eight child slots empty
+        }
+    }
+    ~Octant() {}
+
+    // Children are stored as raw Octant pointers (an earlier variant of
+    // these accessors used std::shared_ptr<Octant>). The empty destructor
+    // above does not delete them.
+    // NOTE(review): confirm where child nodes are freed.
+
+    // child(x, y, z): reference to the slot at index x + 2 * y + 4 * z,
+    // so the caller can read the child or assign a new one in place.
+    // child(offset): same, with the slot index given directly.
+    // Neither overload range-checks its arguments.
+    Octant *&child(const int x, const int y, const int z)
+    {
+        return child_ptr_[x + y * 2 + z * 4];
+    };
+
+    Octant *&child(const int offset)
+    {
+        return child_ptr_[offset];
+    }
+
+    uint64_t code_;
+    bool is_leaf_;
+    unsigned int side_;
+    unsigned char children_mask_;
+    // index_ is assigned in the constructor from next_index_; depth_ is -1
+    // and type_ is NONLEAF (an OcType value) until changed after construction.
+    int index_;
+    int depth_;
+    int type_;
+    // child_ptr_[i] is nullptr on construction, i.e. slot i has no child yet.
+    Octant *child_ptr_[8];
+    static int next_index_;
+};
+
+class Octree : public torch::CustomClassHolder // octree used from Python via torch.classes
+{
+public:
+    Octree();
+    // temporary solution; NOTE(review): presumably rebuilds the tree from all_pts - confirm
+    Octree(int64_t grid_dim, int64_t feat_dim, double voxel_size, std::vector<torch::Tensor> all_pts);
+    ~Octree();
+    void init(int64_t grid_dim, int64_t feat_dim, double voxel_size);
+
+    // allocate voxels (the Python caller passes a CPU int tensor of voxel coordinates)
+    void insert(torch::Tensor vox);
+    double try_insert(torch::Tensor pts);
+
+    // find a particular octant by its coordinate
+    Octant *find_octant(std::vector<float> coord);
+
+    // test intersections; NOTE(review): meaning of `pose` is not visible here - confirm
+    bool has_voxel(torch::Tensor pose);
+
+    // query features for the given points
+    torch::Tensor get_features(torch::Tensor pts);
+
+    // get all voxels (get_voxel_recursive takes a subtree root)
+    torch::Tensor get_voxels();
+    std::vector<float> get_voxel_recursive(Octant *n);
+
+    // get leaf voxels (get_leaf_voxel_recursive takes a subtree root)
+    torch::Tensor get_leaf_voxels();
+    std::vector<float> get_leaf_voxel_recursive(Octant *n);
+
+    // count nodes (count_recursive takes a subtree root)
+    int64_t count_nodes();
+    int64_t count_recursive(Octant *n);
+
+    // count leaf nodes
+    int64_t count_leaf_nodes();
+    // NOTE(review): presumably the recursive helper behind count_leaf_nodes - confirm
+    int64_t leaves_count_recursive(Octant *n);
+
+    // get voxel centres and children; the Python caller unpacks it as (voxels, children, features)
+    std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> get_centres_and_children();
+
+public:
+    int size_;
+    int feat_dim_;
+    int max_level_;
+
+    // temporary solution; NOTE(review): presumably kept so the tree can be rebuilt - confirm
+    double voxel_size_;
+    std::vector<torch::Tensor> all_pts;
+
+private:
+    std::set<uint64_t> all_keys; // NOTE(review): <set> is not included directly by this header
+
+
+    // root of the tree, held as a raw pointer
+    Octant *root_;
+    // NOTE(review): ownership/cleanup of root_ is not visible in this header
+
+    // internal count functions; NOTE(review): the pair is presumably (nodes, leaves) - confirm
+    std::pair<int64_t, int64_t> count_nodes_internal();
+    std::pair<int64_t, int64_t> count_recursive_internal(Octant *n);
+
+
+};
diff --git a/third_party/sparse_octree/include/test.h b/third_party/sparse_octree/include/test.h
new file mode 100644
index 0000000..cd30748
--- /dev/null
+++ b/third_party/sparse_octree/include/test.h
@@ -0,0 +1,86 @@
+#pragma once
+#include <iostream>
+
+#define MAX_BITS 21
+// #define SCALE_MASK ((uint64_t)0x1FF)
+#define SCALE_MASK ((uint64_t)0x1)
+
+/*
+ * Mask generated with:
+   MASK[0] = 0x7000000000000000,
+   for(int i = 1; i < 21; ++i) {
+   MASK[i] = MASK[i-1] | (MASK[0] >> (i*3));
+   std::bitset<64> b(MASK[i]);
+   std::cout << std::hex << b.to_ullong() << std::endl;
+   }
+ *
+*/
+constexpr uint64_t MASK[] = { // MASK[i] keeps the top 3 * (i + 1) bits below bit 63
+    0x7000000000000000,
+    0x7e00000000000000,
+    0x7fc0000000000000,
+    0x7ff8000000000000,
+    0x7fff000000000000,
+    0x7fffe00000000000,
+    0x7ffffc0000000000,
+    0x7fffff8000000000,
+    0x7ffffff000000000,
+    0x7ffffffe00000000,
+    0x7fffffffc0000000,
+    0x7ffffffff8000000,
+    0x7fffffffff000000,
+    0x7fffffffffe00000,
+    0x7ffffffffffc0000,
+    0x7fffffffffff8000,
+    0x7ffffffffffff000,
+    0x7ffffffffffffe00,
+    0x7fffffffffffffc0,
+    0x7ffffffffffffff8,
+    0x7fffffffffffffff}; // MASK[MAX_BITS - 1]: every bit except bit 63
+
+inline int64_t expand(int64_t value)
+{
+    // Spread the low 21 bits of `value` onto bits 0, 3, 6, ...; done in uint64_t because a signed shift past bit 63 is undefined before C++20.
+    uint64_t x = static_cast<uint64_t>(value) & 0x1fffff;
+    x = (x | x << 32) & 0x1f00000000ffff;
+    x = (x | x << 16) & 0x1f0000ff0000ff;
+    x = (x | x << 8) & 0x100f00f00f00f00f;
+    x = (x | x << 4) & 0x10c30c30c30c30c3;
+    return static_cast<int64_t>((x | x << 2) & 0x1249249249249249);
+}
+
+inline uint64_t compact(uint64_t value)
+{
+    // Gather every third bit of `value` (bits 0, 3, 6, ...) back into the low 21 bits.
+    uint64_t packed = value & 0x1249249249249249;
+    packed = (packed | (packed >> 2)) & 0x10c30c30c30c30c3;
+    packed = (packed | (packed >> 4)) & 0x100f00f00f00f00f;
+    packed = (packed | (packed >> 8)) & 0x1f0000ff0000ff;
+    packed = (packed | (packed >> 16)) & 0x1f00000000ffff;
+    return (packed | (packed >> 32)) & 0x1fffff;
+}
+
+inline int64_t compute_morton(int64_t x, int64_t y, int64_t z)
+{
+    // Interleave the coordinates bit by bit: x takes bits 0, 3, 6, ...,
+    // y takes bits 1, 4, 7, ... and z takes bits 2, 5, 8, ...
+    const int64_t x_bits = expand(x);
+    const int64_t y_bits = expand(y) << 1;
+    const int64_t z_bits = expand(z) << 2;
+    // The three bit sets are disjoint, so OR merges them without collisions.
+    const int64_t morton = x_bits | y_bits | z_bits;
+    return morton;
+}
+
+inline torch::Tensor encode_torch(torch::Tensor coords)
+{
+    // Morton-encode each (x, y, z) row into an (N, 1) int64 tensor; assumes `coords` is a contiguous CPU int64 (N, 3) tensor — TODO confirm.
+    torch::Tensor outs = torch::zeros({coords.size(0), 1}, dtype(torch::kInt64));
+    const int64_t *src = coords.data_ptr<int64_t>();
+    for (int64_t i = 0; i < coords.size(0); ++i)
+    {
+        // z is the third element of row i; the previous code re-read the x slot here.
+        outs.data_ptr<int64_t>()[i] = (compute_morton(src[i * 3], src[i * 3 + 1], src[i * 3 + 2]) & MASK[MAX_BITS - 1]);
+    }
+    return outs;
+}
diff --git a/third_party/sparse_octforest/include/utils.h b/third_party/sparse_octree/include/utils.h
similarity index 57%
rename from third_party/sparse_octforest/include/utils.h
rename to third_party/sparse_octree/include/utils.h
index beaf20e..dc3de51 100644
--- a/third_party/sparse_octforest/include/utils.h
+++ b/third_party/sparse_octree/include/utils.h
@@ -1,17 +1,48 @@
-#ifndef _UTILS_H
-#define _UTILS_H
-
+#pragma once
 #include <iostream>
 #include <eigen3/Eigen/Dense>
 
 #define MAX_BITS 21
-typedef Eigen::Matrix<uint64_t, 1, 3> Vector3lu;
+// #define SCALE_MASK ((uint64_t)0x1FF)
+#define SCALE_MASK ((uint64_t)0x1)
+
+template <class T>
+struct Vector3 // minimal 3-component value type
+{
+    Vector3() : x(0), y(0), z(0) {}
+    Vector3(T x_, T y_, T z_) : x(x_), y(y_), z(z_) {}
+    // Component-wise sum / difference; const-qualified so const operands and temporaries work.
+    Vector3<T> operator+(const Vector3<T> &b) const
+    {
+        return Vector3<T>(x + b.x, y + b.y, z + b.z);
+    }
 
+    Vector3<T> operator-(const Vector3<T> &b) const
+    {
+        return Vector3<T>(x - b.x, y - b.y, z - b.z);
+    }
+    // Components are public; the default constructor sets all three to 0.
+    T x, y, z;
+};
+
+typedef Vector3<int> Vector3i;
+typedef Vector3<float> Vector3f;
+
+/*
+ * Mask generated with:
+   MASK[0] = 0x7000000000000000,
+   for(int i = 1; i < 21; ++i) {
+   MASK[i] = MASK[i-1] | (MASK[0] >> (i*3));
+   std::bitset<64> b(MASK[i]);
+   std::cout << std::hex << b.to_ullong() << std::endl;
+   }
+ *
+*/
 constexpr uint64_t MASK[] = {
-    0x7000000000000000, // 0111 0000 0000 0000 0000 ...
-    0x7e00000000000000, // 0111 1110 0000 0000 0000 ...
-    0x7fc0000000000000, // 0111 1111 1100 0000 0000 ...
-    0x7ff8000000000000, // 0111 1111 1111 1000 0000 ...
+    0x7000000000000000,
+    0x7e00000000000000,
+    0x7fc0000000000000,
+    0x7ff8000000000000,
     0x7fff000000000000,
     0x7fffe00000000000,
     0x7ffffc0000000000,
@@ -28,8 +59,7 @@ constexpr uint64_t MASK[] = {
     0x7ffffffffffffe00,
     0x7fffffffffffffc0,
     0x7ffffffffffffff8,
-    0x7fffffffffffffff
-    };
+    0x7fffffffffffffff};
 
 inline uint64_t expand(unsigned long long value)
 {
@@ -44,9 +74,9 @@ inline uint64_t expand(unsigned long long value)
 
 inline uint64_t compact(uint64_t value)
 {
-    uint64_t x = value & 0x1249249249249249; // 0001 0010 0100 1001 0010 ...
-    x = (x | x >> 2) & 0x10c30c30c30c30c3; // 0001 0000 1100 0011 0000 ...
-    x = (x | x >> 4) & 0x100f00f00f00f00f; // 0001 0000 0000 1111 0000
+    uint64_t x = value & 0x1249249249249249;
+    x = (x | x >> 2) & 0x10c30c30c30c30c3;
+    x = (x | x >> 4) & 0x100f00f00f00f00f;
     x = (x | x >> 8) & 0x1f0000ff0000ff;
     x = (x | x >> 16) & 0x1f00000000ffff;
     x = (x | x >> 32) & 0x1fffff;
@@ -65,7 +95,7 @@ inline uint64_t compute_morton(uint64_t x, uint64_t y, uint64_t z)
     return code;
 }
 
-inline Vector3lu decode(const uint64_t code)
+inline Eigen::Vector3i decode(const uint64_t code)
 {
     return {
         compact(code >> 0ull),
@@ -77,5 +107,3 @@ inline uint64_t encode(const int x, const int y, const int z)
 {
     return (compute_morton(x, y, z) & MASK[MAX_BITS - 1]);
 }
-
-#endif
diff --git a/third_party/sparse_octree/setup.py b/third_party/sparse_octree/setup.py
new file mode 100644
index 0000000..ffd7b3b
--- /dev/null
+++ b/third_party/sparse_octree/setup.py
@@ -0,0 +1,22 @@
+# Copyright (c) Facebook, Inc. and its affiliates.
+#
+# This source code is licensed under the MIT license found in the
+# LICENSE file in the root directory of this source tree.
+
+import glob
+
+from setuptools import setup
+from torch.utils.cpp_extension import BuildExtension, CppExtension
+
+# Every C++ file under src/ (relative to the working directory) goes into one extension.
+_ext_sources = glob.glob('src/*.cpp')
+_svo_extension = CppExtension(
+    name='svo',
+    sources=_ext_sources,
+    include_dirs=['./include'],
+    extra_compile_args={'cxx': ['-O2', '-I./include']},
+)
+# Build the 'svo' package with ninja explicitly disabled for BuildExtension.
+setup(name='svo',
+      ext_modules=[_svo_extension],
+      cmdclass={'build_ext': BuildExtension.with_options(use_ninja=False)})
diff --git a/third_party/sparse_octree/src/bindings.cpp b/third_party/sparse_octree/src/bindings.cpp
new file mode 100644
index 0000000..589908d
--- /dev/null
+++ b/third_party/sparse_octree/src/bindings.cpp
@@ -0,0 +1,32 @@
+#include "octree.h"
+#include "test.h"
+
+TORCH_LIBRARY(svo, m)
+{
+    m.def("encode", &encode_torch);
+    // Octant is registered with a default constructor only; none of its methods are exposed.
+    m.class_<Octant>("Octant")
+        .def(torch::init<>());
+    // Octree: default-construct, then call init(); the other methods warn "Octree not initialized!" otherwise.
+    m.class_<Octree>("Octree")
+        .def(torch::init<>())
+        .def("init", &Octree::init)
+        .def("insert", &Octree::insert)
+        .def("try_insert", &Octree::try_insert)
+        .def("get_voxels", &Octree::get_voxels)
+        .def("get_leaf_voxels", &Octree::get_leaf_voxels)
+        .def("get_features", &Octree::get_features)
+        .def("count_nodes", &Octree::count_nodes)
+        .def("count_leaf_nodes", &Octree::count_leaf_nodes)
+        .def("has_voxel", &Octree::has_voxel)
+        .def("get_centres_and_children", &Octree::get_centres_and_children)
+        .def_pickle(
+        // __getstate__: the saved state is (size_, feat_dim_, voxel_size_, all_pts)
+        [](const c10::intrusive_ptr<Octree>& self) -> std::tuple<int64_t, int64_t, double, std::vector<torch::Tensor>> {
+            return std::make_tuple(self->size_, self->feat_dim_, self->voxel_size_, self->all_pts);
+        },
+        // __setstate__: rebuild through the 4-argument constructor, which re-inserts every saved batch
+        [](std::tuple<int64_t, int64_t, double, std::vector<torch::Tensor>> state) {
+            return c10::make_intrusive<Octree>(std::get<0>(state), std::get<1>(state), std::get<2>(state), std::get<3>(state));
+        });
+}
diff --git a/third_party/sparse_octree/src/octree.cpp b/third_party/sparse_octree/src/octree.cpp
new file mode 100644
index 0000000..d08a7b4
--- /dev/null
+++ b/third_party/sparse_octree/src/octree.cpp
@@ -0,0 +1,393 @@
+#include "octree.h"
+#include "utils.h"
+#include <queue>
+#include <iostream>
+
+// #define MAX_HIT_VOXELS 10
+// #define MAX_NUM_VOXELS 10000
+
+int Octant::next_index_ = 0;
+// Offsets of the 8 corners of a unit cube: entry j is (incr_x[j], incr_y[j], incr_z[j]),
+// with z varying fastest; j == 0 is the zero offset (the point itself).
+int incr_x[8] = {0, 0, 0, 0, 1, 1, 1, 1};
+int incr_y[8] = {0, 0, 1, 1, 0, 0, 1, 1};
+int incr_z[8] = {0, 1, 0, 1, 0, 1, 0, 1};
+
+Octree::Octree() // Allocates no root octant; init() must be called before the tree is used.
+{
+}
+
+Octree::Octree(int64_t grid_dim, int64_t feat_dim, double voxel_size, std::vector<torch::Tensor> all_pts)
+{
+    // Rebuild a tree from saved batches: reset the static Octant counter, then re-run init().
+    Octant::next_index_ = 0;
+    init(grid_dim, feat_dim, voxel_size);
+    // `all_pts` here is the parameter; insert() appends to the member of the same name.
+    for (torch::Tensor &batch : all_pts)
+        insert(batch);
+}
+
+Octree::~Octree() // NOTE(review): octants created with `new` in init()/insert() are not deleted here — confirm ownership.
+{
+}
+
+void Octree::init(int64_t grid_dim, int64_t feat_dim, double voxel_size)
+{ // Store the grid parameters and allocate the root octant.
+    size_ = grid_dim;
+    feat_dim_ = feat_dim;
+    voxel_size_ = voxel_size;
+    max_level_ = log2(size_);
+    // The root is a raw heap allocation whose side equals the whole grid.
+    root_ = new Octant();
+    root_->side_ = size_;
+    // The root is always treated as an interior node.
+    root_->is_leaf_ = false;
+
+    // NOTE(review): grid_dim is presumably a power of two (log2 is truncated to int) — TODO confirm.
+    // NOTE(review): a second init() call replaces root_ without freeing the old tree and leaves
+    // all_keys / all_pts untouched — confirm init() is only ever called once per tree.
+}
+
+void Octree::insert(torch::Tensor pts)
+{
+    // Allocate octants for every point in `pts` and for its 8 unit-cube corner offsets.
+    // `pts` is read via accessor<int, 2>, i.e. as a 2-D int32 tensor; it must have 3 columns.
+    bool create_new_node = false;
+
+    if (root_ == nullptr)
+    {
+        std::cout << "Octree not initialized!" << std::endl;
+        return; // the descent below would dereference the missing root
+    }
+
+    auto points = pts.accessor<int, 2>();
+    if (points.size(1) != 3)
+    {
+        std::cout << "Point dimensions mismatch: inputs are " << points.size(1) << " expect 3" << std::endl;
+        return;
+    }
+
+    for (int i = 0; i < points.size(0); ++i)
+    {
+        for (int j = 0; j < 8; ++j)
+        {
+            int x = points[i][0] + incr_x[j];
+            int y = points[i][1] + incr_y[j];
+            int z = points[i][2] + incr_z[j];
+            uint64_t key = encode(x, y, z);
+            // Remember the key so try_insert() can measure overlap against it later.
+            all_keys.insert(key);
+
+            const unsigned int shift = MAX_BITS - max_level_ - 1;
+            // Walk from the root to depth max_level_, creating any missing octant on the way.
+            auto n = root_;
+            unsigned edge = size_ / 2;
+            for (int d = 1; d <= max_level_; edge /= 2, ++d)
+            {
+                const int childid = ((x & edge) > 0) + 2 * ((y & edge) > 0) + 4 * ((z & edge) > 0);
+                auto tmp = n->child(childid);
+                if (!tmp)
+                {
+                    const uint64_t code = key & MASK[d + shift];
+                    const bool is_leaf = (d == max_level_);
+                    // Leaves made for the point itself (j == 0) are SURFACE, corner leaves are FEATURE.
+                    tmp = new Octant();
+                    tmp->code_ = code;
+                    tmp->side_ = edge;
+                    tmp->is_leaf_ = is_leaf;
+                    tmp->type_ = is_leaf ? (j == 0 ? SURFACE : FEATURE) : NONLEAF;
+
+                    n->children_mask_ = n->children_mask_ | (1 << childid);
+                    n->child(childid) = tmp;
+                    create_new_node = true;
+                }
+                else
+                {
+                    if (tmp->type_ == FEATURE && j == 0)
+                        tmp->type_ = SURFACE;
+                }
+                n = tmp;
+            }
+        }
+    }
+    if (create_new_node)
+        all_pts.push_back(pts);
+}
+
+double Octree::try_insert(torch::Tensor pts)
+{
+    // Return the fraction of the keys `pts` would touch that are already in all_keys (no insertion is done).
+    if (root_ == nullptr)
+    {
+        std::cout << "Octree not initialized!" << std::endl;
+    }
+
+    auto points = pts.accessor<int, 2>();
+    if (points.size(1) != 3)
+    {
+        std::cout << "Point dimensions mismatch: inputs are " << points.size(1) << " expect 3" << std::endl;
+        return -1.0;
+    }
+
+    std::set<uint64_t> tmp_keys;
+    // Same key generation as insert(): each point plus its 8 corner offsets.
+    for (int i = 0; i < points.size(0); ++i)
+    {
+        for (int j = 0; j < 8; ++j)
+        {
+            int x = points[i][0] + incr_x[j];
+            int y = points[i][1] + incr_y[j];
+            int z = points[i][2] + incr_z[j];
+            uint64_t key = encode(x, y, z);
+
+            tmp_keys.insert(key);
+        }
+    }
+    // Keys are 64-bit; storing the intersection as int truncated them and merged distinct keys.
+    std::set<uint64_t> result;
+    std::set_intersection(all_keys.begin(), all_keys.end(),
+                          tmp_keys.begin(), tmp_keys.end(),
+                          std::inserter(result, result.end()));
+    // NOTE(review): a tensor with zero rows makes this 0 / 0 (NaN) — confirm callers never pass one.
+    return 1.0 * result.size() / tmp_keys.size();
+}
+
+Octant *Octree::find_octant(std::vector<float> coord)
+{
+    int x = int(coord[0]);
+    int y = int(coord[1]);
+    int z = int(coord[2]);
+    // Descend from the root one level per iteration; returns nullptr as soon as a child is missing,
+    // otherwise the octant reached after max_level_ steps. root_ is not null-checked here.
+
+    auto n = root_;
+    unsigned edge = size_ / 2;
+    for (int d = 1; d <= max_level_; edge /= 2, ++d)
+    {
+        const int childid = ((x & edge) > 0) + 2 * ((y & edge) > 0) + 4 * ((z & edge) > 0);
+        auto tmp = n->child(childid);
+        if (!tmp)
+            return nullptr;
+
+        n = tmp;
+    }
+    return n;
+}
+
+bool Octree::has_voxel(torch::Tensor pts)
+{
+    // True when an octant exists at depth max_level_ for the (x, y, z) in `pts` (1-D int32, 3 elements).
+    if (root_ == nullptr)
+    {
+        std::cout << "Octree not initialized!" << std::endl;
+        return false; // no tree yet; the descent below would dereference the missing root
+    }
+
+    auto points = pts.accessor<int, 1>();
+    if (points.size(0) != 3)
+    {
+        return false;
+    }
+
+    int x = int(points[0]);
+    int y = int(points[1]);
+    int z = int(points[2]);
+
+    auto n = root_;
+    unsigned edge = size_ / 2;
+    for (int d = 1; d <= max_level_; edge /= 2, ++d)
+    {
+        const int childid = ((x & edge) > 0) + 2 * ((y & edge) > 0) + 4 * ((z & edge) > 0);
+        auto tmp = n->child(childid);
+        if (!tmp)
+            return false;
+
+        n = tmp;
+    }
+
+    // Every level on the path had a child, so the octant is allocated.
+    return n != nullptr;
+}
+
+torch::Tensor Octree::get_features(torch::Tensor pts)
+{ // Not implemented: raise instead of flowing off the end of a non-void function (undefined behaviour).
+    TORCH_CHECK(false, "Octree::get_features is not implemented");
+}
+torch::Tensor Octree::get_leaf_voxels()
+{
+    // Flat x, y, z triples, one per SURFACE leaf, returned as an (N, 3) tensor.
+    std::vector<float> flat_xyz = get_leaf_voxel_recursive(root_);
+    int num_rows = flat_xyz.size() / 3;
+    // from_blob only borrows the vector's buffer, so clone before the vector goes away.
+    return torch::from_blob(flat_xyz.data(), {num_rows, 3}).clone();
+}
+
+std::vector<float> Octree::get_leaf_voxel_recursive(Octant *n)
+{
+    if (!n)
+        return std::vector<float>();
+    // A SURFACE leaf contributes its decoded (x, y, z); the recursion stops there.
+    if (n->is_leaf_ && n->type_ == SURFACE)
+    {
+        auto xyz = decode(n->code_);
+        return {float(xyz[0]), float(xyz[1]), float(xyz[2])}; // explicit casts: int -> float narrowing is ill-formed in a braced list
+    }
+    // Any other node contributes the concatenation of its children's triples.
+    std::vector<float> coords;
+    for (int i = 0; i < 8; i++)
+    {
+        auto temp = get_leaf_voxel_recursive(n->child(i));
+        coords.insert(coords.end(), temp.begin(), temp.end());
+    }
+
+    return coords;
+}
+
+torch::Tensor Octree::get_voxels()
+{
+    // One (x, y, z, side) float row per octant, root included; clone() detaches it from the local buffer.
+    std::vector<float> flat_rows = get_voxel_recursive(root_);
+    int num_rows = flat_rows.size() / 4;
+    const auto float_opts = torch::TensorOptions().dtype(torch::kFloat32);
+    return torch::from_blob(flat_rows.data(), {num_rows, 4}, float_opts).clone();
+}
+
+std::vector<float> Octree::get_voxel_recursive(Octant *n)
+{
+    if (!n)
+        return std::vector<float>();
+    // Emit this node's (x, y, z, side) first, then its children's rows; casts avoid int -> float narrowing in the braced list.
+    auto xyz = decode(n->code_);
+    std::vector<float> coords = {float(xyz[0]), float(xyz[1]), float(xyz[2]), float(n->side_)};
+    for (int i = 0; i < 8; i++)
+    {
+        auto temp = get_voxel_recursive(n->child(i));
+        coords.insert(coords.end(), temp.begin(), temp.end());
+    }
+
+    return coords;
+}
+
+std::pair<int64_t, int64_t> Octree::count_nodes_internal() // {all octants, octants with is_leaf_ set} for the whole tree
+{
+    return count_recursive_internal(root_);
+}
+
+// Returns {number of octants, number of is_leaf_ octants} in the subtree rooted at n.
+std::pair<int64_t, int64_t> Octree::count_recursive_internal(Octant *n)
+{
+    using Counts = std::pair<int64_t, int64_t>;
+    if (n == nullptr)
+        return Counts(0, 0);
+    if (n->is_leaf_)
+        return Counts(1, 1);
+
+    // Count this interior node itself, then fold in each of the eight child slots.
+    Counts totals(1, 0);
+    for (int slot = 0; slot < 8; ++slot)
+    {
+        const Counts sub = count_recursive_internal(n->child(slot));
+        totals.first += sub.first;
+        totals.second += sub.second;
+    }
+
+    return totals;
+}
+
+std::tuple<torch::Tensor, torch::Tensor, torch::Tensor> Octree::get_centres_and_children()
+{
+    auto node_count = count_nodes_internal();
+    auto total_count = node_count.first;
+    // Returns (voxels, children, features); every table has one row per octant, addressed by index_.
+    // NOTE(review): index_ is presumably assigned from the static Octant::next_index_ — confirm it stays below total_count.
+    auto all_voxels = torch::zeros({total_count, 4}, dtype(torch::kFloat32));
+    auto all_children = -torch::ones({total_count, 8}, dtype(torch::kFloat32));
+    auto all_features = -torch::ones({total_count, 8}, dtype(torch::kInt32));
+    // Breadth-first walk from the root; FEATURE children are never queued.
+    std::queue<Octant *> all_nodes;
+    all_nodes.push(root_);
+
+    while (!all_nodes.empty())
+    {
+        auto node_ptr = all_nodes.front();
+        all_nodes.pop();
+        // Row index_ of all_voxels holds (x, y, z, side); casts avoid int -> float narrowing in the braced list.
+        auto xyz = decode(node_ptr->code_);
+        std::vector<float> coords = {float(xyz[0]), float(xyz[1]), float(xyz[2]), float(node_ptr->side_)};
+        auto voxel = torch::from_blob(coords.data(), {4}, dtype(torch::kFloat32));
+        all_voxels[node_ptr->index_] = voxel;
+        // For SURFACE nodes, record the index_ of the octant found at each of the 8 corner offsets (-1 if absent).
+        if (node_ptr->type_ == SURFACE)
+        {
+            for (int i = 0; i < 8; ++i)
+            {
+                std::vector<float> vcoords = coords;
+                vcoords[0] += incr_x[i];
+                vcoords[1] += incr_y[i];
+                vcoords[2] += incr_z[i];
+                auto corner = find_octant(vcoords);
+                if (corner)
+                    all_features.data_ptr<int>()[node_ptr->index_ * 8 + i] = corner->index_;
+            }
+        }
+        // Record and enqueue every existing non-FEATURE child; missing slots keep -1.
+        for (int i = 0; i < 8; i++)
+        {
+            auto child_ptr = node_ptr->child(i);
+            if (child_ptr && child_ptr->type_ != FEATURE)
+            {
+                all_nodes.push(child_ptr);
+                all_children[node_ptr->index_][i] = float(child_ptr->index_);
+            }
+        }
+    }
+
+    return std::make_tuple(all_voxels, all_children, all_features);
+}
+
+int64_t Octree::count_nodes() // total number of octants reachable from the root, root included
+{
+    return count_recursive(root_);
+}
+
+// Counts every octant in the subtree rooted at n (n itself included; 0 for nullptr).
+int64_t Octree::count_recursive(Octant *n)
+{
+    if (n == nullptr)
+        return 0;
+
+    int64_t total = 1;
+    int slot = 0;
+    while (slot < 8)
+    {
+        total += count_recursive(n->child(slot));
+        ++slot;
+    }
+    return total;
+}
+
+int64_t Octree::count_leaf_nodes() // number of SURFACE octants reachable from the root
+{
+    return leaves_count_recursive(root_);
+}
+
+// Counts the SURFACE octants in the subtree rooted at n.
+int64_t Octree::leaves_count_recursive(Octant *n)
+{
+    // An empty child slot contributes nothing.
+    if (n == nullptr)
+        return 0;
+
+    // A SURFACE octant counts once; its children (if any) are not visited.
+    if (n->type_ == SURFACE)
+        return 1;
+
+    // Any other octant only forwards the counts of its eight child slots.
+    int64_t surface_total = 0;
+    for (int slot = 0; slot < 8; ++slot)
+    {
+        surface_total += leaves_count_recursive(n->child(slot));
+    }
+
+    return surface_total;
+}