update)
+{
+ GPU_1D_KERNEL_LOOP(n, index.size(0)) {
+ const int p = patches.size(2);
+ const int ix = index[n];
+
+ float d = patches[ix][2][0][0];
+ d = d + update[n];
+ d = (d > 20) ? 1.0 : d;
+ d = max(d, 1e-4);
+
+    for (int i=0; i<p; i++) {
+      for (int j=0; j<p; j++) {
+        patches[ix][2][i][j] = d;
+      }
+    }
+  }
+}
+
+
+__global__ void reprojection_residuals_and_hessian(
+    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> poses,
+    const torch::PackedTensorAccessor32<float,4,torch::RestrictPtrTraits> patches,
+    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> intrinsics,
+    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> target,
+    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> weight,
+    const torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> lmbda,
+    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> ii,
+    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> jj,
+    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> kk,
+    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> ku,
+    torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> B,
+    torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> E,
+    torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> C,
+    torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> v,
+    torch::PackedTensorAccessor32<float,1,torch::RestrictPtrTraits> u, const int t0)
+{
+
+ __shared__ float fx, fy, cx, cy;
+ if (threadIdx.x == 0) {
+ fx = intrinsics[0][0];
+ fy = intrinsics[0][1];
+ cx = intrinsics[0][2];
+ cy = intrinsics[0][3];
+ }
+
+ __syncthreads();
+
+ GPU_1D_KERNEL_LOOP(n, ii.size(0)) {
+ int k = ku[n];
+ int ix = ii[n];
+ int jx = jj[n];
+ int kx = kk[n];
+
+ float ti[3] = { poses[ix][0], poses[ix][1], poses[ix][2] };
+ float tj[3] = { poses[jx][0], poses[jx][1], poses[jx][2] };
+ float qi[4] = { poses[ix][3], poses[ix][4], poses[ix][5], poses[ix][6] };
+ float qj[4] = { poses[jx][3], poses[jx][4], poses[jx][5], poses[jx][6] };
+
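+    // Back-project the patch center: Xi is the point in frame i in normalized
+    // homogeneous coordinates (x, y, 1, inverse depth); it is then mapped into
+    // frame j through the relative pose (tij, qij).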
+ float Xi[4], Xj[4];
+ Xi[0] = (patches[kx][0][1][1] - cx) / fx;
+ Xi[1] = (patches[kx][1][1][1] - cy) / fy;
+ Xi[2] = 1.0;
+ Xi[3] = patches[kx][2][1][1];
+
+ float tij[3], qij[4];
+ relSE3(ti, qi, tj, qj, tij, qij);
+ actSE3(tij, qij, Xi, Xj);
+
+ const float X = Xj[0];
+ const float Y = Xj[1];
+ const float Z = Xj[2];
+ const float W = Xj[3];
+
+ const float d = (Z >= 0.2) ? 1.0 / Z : 0.0;
+ const float d2 = d * d;
+
+ const float x1 = fx * (X / Z) + cx;
+ const float y1 = fy * (Y / Z) + cy;
+
+ const float rx = target[n][0] - x1;
+ const float ry = target[n][1] - y1;
+
+ const bool in_bounds = (sqrt(rx*rx + ry*ry) < 128) && (Z > 0.2) &&
+ (x1 > -64) && (y1 > -64) && (x1 < 2*cx + 64) && (y1 < 2*cy + 64);
+
+ const float mask = in_bounds ? 1.0 : 0.0;
+
+ ix = ix - t0;
+ jx = jx - t0;
+
+ {
+ const float r = target[n][0] - x1;
+ const float w = mask * weight[n][0];
+
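+      // Jacobians of the x-residual: Jz w.r.t. the patch inverse depth, Jj w.r.t.
+      // the 6-dof pose of frame j (translation xyz, then rotation xyz); adjSE3
+      // maps Jj to the corresponding Jacobian Ji for frame i.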
+ float Jz = fx * (tij[0] * d - tij[2] * (X * d2));
+ float Ji[6], Jj[6] = {fx*W*d, 0, fx*-X*W*d2, fx*-X*Y*d2, fx*(1+X*X*d2), fx*-Y*d};
+
+ adjSE3(tij, qij, Jj, Ji);
+
+ for (int i=0; i<6; i++) {
+ for (int j=0; j<6; j++) {
+ if (ix >= 0)
+ atomicAdd(&B[6*ix+i][6*ix+j], w * Ji[i] * Ji[j]);
+ if (jx >= 0)
+ atomicAdd(&B[6*jx+i][6*jx+j], w * Jj[i] * Jj[j]);
+ if (ix >= 0 && jx >= 0) {
+ atomicAdd(&B[6*ix+i][6*jx+j], -w * Ji[i] * Jj[j]);
+ atomicAdd(&B[6*jx+i][6*ix+j], -w * Jj[i] * Ji[j]);
+ }
+ }
+ }
+
+ for (int i=0; i<6; i++) {
+ if (ix >= 0)
+ atomicAdd(&E[6*ix+i][k], -w * Jz * Ji[i]);
+ if (jx >= 0)
+ atomicAdd(&E[6*jx+i][k], w * Jz * Jj[i]);
+ }
+
+ for (int i=0; i<6; i++) {
+ if (ix >= 0)
+ atomicAdd(&v[6*ix+i], -w * r * Ji[i]);
+ if (jx >= 0)
+ atomicAdd(&v[6*jx+i], w * r * Jj[i]);
+ }
+
+ atomicAdd(&C[k], w * Jz * Jz);
+ atomicAdd(&u[k], w * r * Jz);
+ }
+
+ {
+ const float r = target[n][1] - y1;
+ const float w = mask * weight[n][1];
+
+ float Jz = fy * (tij[1] * d - tij[2] * (Y * d2));
+ float Ji[6], Jj[6] = {0, fy*W*d, fy*-Y*W*d2, fy*(-1-Y*Y*d2), fy*(X*Y*d2), fy*X*d};
+
+ adjSE3(tij, qij, Jj, Ji);
+
+ for (int i=0; i<6; i++) {
+ for (int j=0; j<6; j++) {
+ if (ix >= 0)
+ atomicAdd(&B[6*ix+i][6*ix+j], w * Ji[i] * Ji[j]);
+ if (jx >= 0)
+ atomicAdd(&B[6*jx+i][6*jx+j], w * Jj[i] * Jj[j]);
+ if (ix >= 0 && jx >= 0) {
+ atomicAdd(&B[6*ix+i][6*jx+j], -w * Ji[i] * Jj[j]);
+ atomicAdd(&B[6*jx+i][6*ix+j], -w * Jj[i] * Ji[j]);
+ }
+ }
+ }
+
+ for (int i=0; i<6; i++) {
+ if (ix >= 0)
+ atomicAdd(&E[6*ix+i][k], -w * Jz * Ji[i]);
+ if (jx >= 0)
+ atomicAdd(&E[6*jx+i][k], w * Jz * Jj[i]);
+ }
+
+ for (int i=0; i<6; i++) {
+ if (ix >= 0)
+ atomicAdd(&v[6*ix+i], -w * r * Ji[i]);
+ if (jx >= 0)
+ atomicAdd(&v[6*jx+i], w * r * Jj[i]);
+ }
+
+ atomicAdd(&C[k], w * Jz * Jz);
+ atomicAdd(&u[k], w * r * Jz);
+ }
+ }
+}
+
+
+__global__ void reproject(
+    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> poses,
+    const torch::PackedTensorAccessor32<float,4,torch::RestrictPtrTraits> patches,
+    const torch::PackedTensorAccessor32<float,2,torch::RestrictPtrTraits> intrinsics,
+    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> ii,
+    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> jj,
+    const torch::PackedTensorAccessor32<long,1,torch::RestrictPtrTraits> kk,
+    torch::PackedTensorAccessor32<float,4,torch::RestrictPtrTraits> coords) {
+
+ __shared__ float fx, fy, cx, cy;
+ if (threadIdx.x == 0) {
+ fx = intrinsics[0][0];
+ fy = intrinsics[0][1];
+ cx = intrinsics[0][2];
+ cy = intrinsics[0][3];
+ }
+
+ __syncthreads();
+
+ GPU_1D_KERNEL_LOOP(n, ii.size(0)) {
+ int ix = ii[n];
+ int jx = jj[n];
+ int kx = kk[n];
+
+ float ti[3] = { poses[ix][0], poses[ix][1], poses[ix][2] };
+ float tj[3] = { poses[jx][0], poses[jx][1], poses[jx][2] };
+ float qi[4] = { poses[ix][3], poses[ix][4], poses[ix][5], poses[ix][6] };
+ float qj[4] = { poses[jx][3], poses[jx][4], poses[jx][5], poses[jx][6] };
+
+ float tij[3], qij[4];
+ relSE3(ti, qi, tj, qj, tij, qij);
+
+    float Xi[4], Xj[4];
+    for (int i=0; i<patches.size(2); i++) {
+      for (int j=0; j<patches.size(3); j++) {
+        Xi[0] = (patches[kx][0][i][j] - cx) / fx;
+        Xi[1] = (patches[kx][1][i][j] - cy) / fy;
+        Xi[2] = 1.0;
+        Xi[3] = patches[kx][2][i][j];
+
+        actSE3(tij, qij, Xi, Xj);
+
+        coords[n][0][i][j] = fx * (Xj[0] / Xj[2]) + cx;
+        coords[n][1][i][j] = fy * (Xj[1] / Xj[2]) + cy;
+      }
+    }
+  }
+}
+
+
+std::vector<torch::Tensor> cuda_ba(
+ torch::Tensor poses,
+ torch::Tensor patches,
+ torch::Tensor intrinsics,
+ torch::Tensor target,
+ torch::Tensor weight,
+ torch::Tensor lmbda,
+ torch::Tensor ii,
+ torch::Tensor jj,
+ torch::Tensor kk,
+ const int t0, const int t1, const int iterations)
+{
+
+ auto ktuple = torch::_unique(kk, true, true);
+ torch::Tensor kx = std::get<0>(ktuple);
+ torch::Tensor ku = std::get<1>(ktuple);
+
+ const int N = t1 - t0; // number of poses
+ const int M = kx.size(0); // number of patches
+ const int P = patches.size(3); // patch size
+
+ auto opts = torch::TensorOptions()
+ .dtype(torch::kFloat32).device(torch::kCUDA);
+
+ poses = poses.view({-1, 7});
+ patches = patches.view({-1,3,P,P});
+ intrinsics = intrinsics.view({-1, 4});
+
+ target = target.view({-1, 2});
+ weight = weight.view({-1, 2});
+
+ const int num = ii.size(0);
+ torch::Tensor B = torch::empty({6*N, 6*N}, opts);
+ torch::Tensor E = torch::empty({6*N, 1*M}, opts);
+ torch::Tensor C = torch::empty({M}, opts);
+
+ torch::Tensor v = torch::empty({6*N}, opts);
+ torch::Tensor u = torch::empty({1*M}, opts);
+
+ for (int itr=0; itr < iterations; itr++) {
+
+ B.zero_();
+ E.zero_();
+ C.zero_();
+ v.zero_();
+ u.zero_();
+
+ v = v.view({6*N});
+ u = u.view({1*M});
+
+    reprojection_residuals_and_hessian<<<NUM_BLOCKS(ii.size(0)), NUM_THREADS>>>(
+      poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
+      patches.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
+      intrinsics.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
+      target.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
+      weight.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
+      lmbda.packed_accessor32<float,1,torch::RestrictPtrTraits>(),
+      ii.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
+      jj.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
+      kk.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
+      ku.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
+      B.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
+      E.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
+      C.packed_accessor32<float,1,torch::RestrictPtrTraits>(),
+      v.packed_accessor32<float,1,torch::RestrictPtrTraits>(),
+      u.packed_accessor32<float,1,torch::RestrictPtrTraits>(), t0);
+
+ v = v.view({6*N, 1});
+ u = u.view({1*M, 1});
+
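+    // The depth block C is diagonal, so its damped inverse Q is elementwise.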
+ torch::Tensor Q = 1.0 / (C + lmbda).view({1, M});
+
+ if (t1 - t0 == 0) {
+
+ torch::Tensor Qt = torch::transpose(Q, 0, 1);
+ torch::Tensor dZ = Qt * u;
+
+ dZ = dZ.view({M});
+
+      patch_retr_kernel<<<NUM_BLOCKS(M), NUM_THREADS>>>(
+        kx.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
+        patches.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
+        dZ.packed_accessor32<float,1,torch::RestrictPtrTraits>());
+
+ }
+
+ else {
+
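+      // Schur complement: eliminate the per-patch depth variables and solve the
+      // reduced 6N x 6N pose system S dX = y, then back-substitute for dZ.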
+ torch::Tensor EQ = E * Q;
+ torch::Tensor Et = torch::transpose(E, 0, 1);
+ torch::Tensor Qt = torch::transpose(Q, 0, 1);
+
+ torch::Tensor S = B - torch::matmul(EQ, Et);
+ torch::Tensor y = v - torch::matmul(EQ, u);
+
+ torch::Tensor I = torch::eye(6*N, opts);
+ S += I * (1e-4 * S + 1.0);
+
+
+ torch::Tensor U = torch::linalg::cholesky(S);
+ torch::Tensor dX = torch::cholesky_solve(y, U);
+ torch::Tensor dZ = Qt * (u - torch::matmul(Et, dX));
+
+ dX = dX.view({N, 6});
+ dZ = dZ.view({M});
+
+      pose_retr_kernel<<<NUM_BLOCKS(N), NUM_THREADS>>>(t0, t1,
+        poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
+        dX.packed_accessor32<float,2,torch::RestrictPtrTraits>());
+
+      patch_retr_kernel<<<NUM_BLOCKS(M), NUM_THREADS>>>(
+        kx.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
+        patches.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
+        dZ.packed_accessor32<float,1,torch::RestrictPtrTraits>());
+ }
+ }
+
+ return {};
+}
+
+
+torch::Tensor cuda_reproject(
+ torch::Tensor poses,
+ torch::Tensor patches,
+ torch::Tensor intrinsics,
+ torch::Tensor ii,
+ torch::Tensor jj,
+ torch::Tensor kk)
+{
+
+ const int N = ii.size(0);
+ const int P = patches.size(3); // patch size
+
+ poses = poses.view({-1, 7});
+ patches = patches.view({-1,3,P,P});
+ intrinsics = intrinsics.view({-1, 4});
+
+ auto opts = torch::TensorOptions()
+ .dtype(torch::kFloat32).device(torch::kCUDA);
+
+ torch::Tensor coords = torch::empty({N, 2, P, P}, opts);
+
+  reproject<<<NUM_BLOCKS(N), NUM_THREADS>>>(
+    poses.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
+    patches.packed_accessor32<float,4,torch::RestrictPtrTraits>(),
+    intrinsics.packed_accessor32<float,2,torch::RestrictPtrTraits>(),
+    ii.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
+    jj.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
+    kk.packed_accessor32<long,1,torch::RestrictPtrTraits>(),
+    coords.packed_accessor32<float,4,torch::RestrictPtrTraits>());
+
+ return coords.view({1, N, 2, P, P});
+
+}
diff --git a/third_party/dpvo_ext/fastba/ba_dpvo.cpp b/third_party/dpvo_ext/fastba/ba_dpvo.cpp
new file mode 100644
index 0000000..3f77eeb
--- /dev/null
+++ b/third_party/dpvo_ext/fastba/ba_dpvo.cpp
@@ -0,0 +1,157 @@
+#include <torch/extension.h>
+#include <vector>
+#include <iostream>
+#include <algorithm>
+#include <unordered_map>
+
+
+std::vector<torch::Tensor> cuda_ba(
+ torch::Tensor poses,
+ torch::Tensor patches,
+ torch::Tensor intrinsics,
+ torch::Tensor target,
+ torch::Tensor weight,
+ torch::Tensor lmbda,
+ torch::Tensor ii,
+ torch::Tensor jj,
+ torch::Tensor kk,
+ int t0, int t1, int iterations);
+
+
+torch::Tensor cuda_reproject(
+ torch::Tensor poses,
+ torch::Tensor patches,
+ torch::Tensor intrinsics,
+ torch::Tensor ii,
+ torch::Tensor jj,
+ torch::Tensor kk);
+
+std::vector<torch::Tensor> bundle_adjust(
+ torch::Tensor poses,
+ torch::Tensor patches,
+ torch::Tensor intrinsics,
+ torch::Tensor target,
+ torch::Tensor weight,
+ torch::Tensor lmbda,
+ torch::Tensor ii,
+ torch::Tensor jj,
+ torch::Tensor kk,
+ int t0, int t1, int iterations) {
+ return cuda_ba(poses, patches, intrinsics, target, weight, lmbda, ii, jj, kk, t0, t1, iterations);
+}
+
+
+torch::Tensor reproject(
+ torch::Tensor poses,
+ torch::Tensor patches,
+ torch::Tensor intrinsics,
+ torch::Tensor ii,
+ torch::Tensor jj,
+ torch::Tensor kk) {
+ return cuda_reproject(poses, patches, intrinsics, ii, jj, kk);
+}
+
+// std::vector<torch::Tensor> neighbors(torch::Tensor ii, torch::Tensor jj)
+// {
+// ii = ii.to(torch::kCPU);
+// jj = jj.to(torch::kCPU);
+//   auto ii_data = ii.accessor<long,1>();
+//   auto jj_data = jj.accessor<long,1>();
+
+//   std::unordered_map<long, std::vector<long>> graph;
+//   std::unordered_map<long, std::vector<long>> index;
+// for (int i=0; i < ii.size(0); i++) {
+// const long ix = ii_data[i];
+// const long jx = jj_data[i];
+// if (graph.find(ix) == graph.end()) {
+//       graph[ix] = std::vector<long>();
+//       index[ix] = std::vector<long>();
+// }
+// graph[ix].push_back(jx);
+// index[ix].push_back( i);
+// }
+
+// auto opts = torch::TensorOptions().dtype(torch::kInt64);
+// torch::Tensor ix = torch::empty({ii.size(0)}, opts);
+// torch::Tensor jx = torch::empty({jj.size(0)}, opts);
+
+//   auto ix_data = ix.accessor<long,1>();
+//   auto jx_data = jx.accessor<long,1>();
+
+//   for (std::pair<long, std::vector<long>> element : graph) {
+//     std::vector<long>& v = graph[element.first];
+//     std::vector<long>& idx = index[element.first];
+
+// std::stable_sort(idx.begin(), idx.end(),
+// [&v](size_t i, size_t j) {return v[i] < v[j];});
+
+// ix_data[idx.front()] = -1;
+// jx_data[idx.back()] = -1;
+
+// for (int i=0; i < idx.size(); i++) {
+// ix_data[idx[i]] = (i > 0) ? idx[i-1] : -1;
+// jx_data[idx[i]] = (i < idx.size() - 1) ? idx[i+1] : -1;
+// }
+// }
+
+// ix = ix.to(torch::kCUDA);
+// jx = jx.to(torch::kCUDA);
+
+// return {ix, jx};
+// }
+
+
+std::vector<torch::Tensor> neighbors(torch::Tensor ii, torch::Tensor jj)
+{
+
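+  // For each edge (ii[n], jj[n]), return the indices of the previous and next
+  // edges that share the same source frame ii[n], ordered by jj (-1 if none).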
+ auto tup = torch::_unique(ii, true, true);
+ torch::Tensor uniq = std::get<0>(tup).to(torch::kCPU);
+ torch::Tensor perm = std::get<1>(tup).to(torch::kCPU);
+
+ jj = jj.to(torch::kCPU);
+  auto jj_accessor = jj.accessor<long,1>();
+
+  auto perm_accessor = perm.accessor<long,1>();
+  std::vector<std::vector<long>> index(uniq.size(0));
+ for (int i=0; i < ii.size(0); i++) {
+ index[perm_accessor[i]].push_back(i);
+ }
+
+ auto opts = torch::TensorOptions().dtype(torch::kInt64);
+ torch::Tensor ix = torch::empty({ii.size(0)}, opts);
+ torch::Tensor jx = torch::empty({ii.size(0)}, opts);
+
+  auto ix_accessor = ix.accessor<long,1>();
+  auto jx_accessor = jx.accessor<long,1>();
+
+  for (int i=0; i<uniq.size(0); i++) {
+    std::vector<long>& idx = index[i];
+ std::stable_sort(idx.begin(), idx.end(),
+ [&jj_accessor](size_t i, size_t j) {return jj_accessor[i] < jj_accessor[j];});
+
+ for (int i=0; i < idx.size(); i++) {
+ ix_accessor[idx[i]] = (i > 0) ? idx[i-1] : -1;
+ jx_accessor[idx[i]] = (i < idx.size() - 1) ? idx[i+1] : -1;
+ }
+ }
+
+  // for (int i=0; i<ii.size(0); i++) {
+  //   if (ix_accessor[i] >= 0) std::cout << jj_accessor[ix_accessor[i]] << " ";
+ // if (jx_accessor[i] >= 0) std::cout << jj_accessor[jx_accessor[i]] << " ";
+ // std::cout << std::endl;
+ // }
+
+ ix = ix.to(torch::kCUDA);
+ jx = jx.to(torch::kCUDA);
+
+ return {ix, jx};
+}
+
+
+PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
+ m.def("forward", &bundle_adjust, "bundle_adjust_dpvo forward operator");
+  m.def("neighbors", &neighbors, "temporal neighbor indices");
+  m.def("reproject", &reproject, "patch reprojection operator");
+
+}
diff --git a/third_party/dpvo_ext/fastba/ba_dpvo.py b/third_party/dpvo_ext/fastba/ba_dpvo.py
new file mode 100644
index 0000000..6672773
--- /dev/null
+++ b/third_party/dpvo_ext/fastba/ba_dpvo.py
@@ -0,0 +1,20 @@
+import cuda_ba
+
+neighbors = cuda_ba.neighbors
+reproject = cuda_ba.reproject
+
+
+def bundle_adjust_dpvo(poses,
+ patches,
+ intrinsics,
+ target,
+ weight,
+ lmbda,
+ ii,
+ jj,
+ kk,
+ t0,
+ t1,
+ iterations=2):
+ return cuda_ba.forward(poses.data, patches, intrinsics, target, weight,
+ lmbda, ii, jj, kk, t0, t1, iterations)
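+
+
+# Minimal usage sketch (illustrative only; the tensor shapes below are
+# assumptions for a single batch, not an API contract):
+#
+#   poses:      (1, N, 7)  [tx, ty, tz, qx, qy, qz, qw] per frame
+#   patches:    (1, M, 3, p, p) with channel 2 holding inverse depth
+#   intrinsics: (1, 4)     [fx, fy, cx, cy]
+#   target, weight: (1, E, 2);  ii, jj, kk: (E,) edge index tensors
+#
+#   bundle_adjust_dpvo(poses, patches, intrinsics, target, weight,
+#                      lmbda, ii, jj, kk, t0, t1, iterations=2)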
diff --git a/third_party/dpvo_ext/lietorch/__init__.py b/third_party/dpvo_ext/lietorch/__init__.py
new file mode 100644
index 0000000..dee9cba
--- /dev/null
+++ b/third_party/dpvo_ext/lietorch/__init__.py
@@ -0,0 +1,2 @@
+__all__ = ['groups']
+from .groups import SE3, SO3, LieGroupParameter, RxSO3, Sim3, cat, stack
diff --git a/third_party/dpvo_ext/lietorch/broadcasting.py b/third_party/dpvo_ext/lietorch/broadcasting.py
new file mode 100644
index 0000000..1685201
--- /dev/null
+++ b/third_party/dpvo_ext/lietorch/broadcasting.py
@@ -0,0 +1,33 @@
+import numpy as np
+import torch
+
+
+def check_broadcastable(x, y):
+ assert len(x.shape) == len(y.shape)
+ for (n, m) in zip(x.shape[:-1], y.shape[:-1]):
+ assert n == m or n == 1 or m == 1
+
+
+def broadcast_inputs(x, y):
+ """Automatic broadcasting of missing dimensions."""
+ if y is None:
+ xs, xd = x.shape[:-1], x.shape[-1]
+ return (x.view(-1, xd).contiguous(), ), x.shape[:-1]
+
+ check_broadcastable(x, y)
+
+ xs, xd = x.shape[:-1], x.shape[-1]
+ ys, yd = y.shape[:-1], y.shape[-1]
+ out_shape = [max(n, m) for (n, m) in zip(xs, ys)]
+
+    if x.shape[:-1] == y.shape[:-1]:
+ x1 = x.view(-1, xd)
+ y1 = y.view(-1, yd)
+
+ else:
+ x_expand = [m if n == 1 else 1 for (n, m) in zip(xs, ys)]
+ y_expand = [n if m == 1 else 1 for (n, m) in zip(xs, ys)]
+ x1 = x.repeat(x_expand + [1]).reshape(-1, xd).contiguous()
+ y1 = y.repeat(y_expand + [1]).reshape(-1, yd).contiguous()
+
+ return (x1, y1), tuple(out_shape)
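+
+
+# Illustrative example (assumed shapes, not part of the API): broadcasting one
+# group element (embedded dimension 7) against a batch of four.
+#
+#   x = torch.randn(1, 7)
+#   y = torch.randn(4, 7)
+#   (x1, y1), out_shape = broadcast_inputs(x, y)
+#   # x1.shape == (4, 7), y1.shape == (4, 7), out_shape == (4,)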
diff --git a/third_party/dpvo_ext/lietorch/gradcheck.py b/third_party/dpvo_ext/lietorch/gradcheck.py
new file mode 100644
index 0000000..610a7ca
--- /dev/null
+++ b/third_party/dpvo_ext/lietorch/gradcheck.py
@@ -0,0 +1,674 @@
+import torch
+
+TORCH_MAJOR = int(torch.__version__.split('.')[0])
+TORCH_MINOR = int(torch.__version__.split('.')[1])
+
+from torch.types import _TensorOrTensors
+
+if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
+ from torch._six import container_abcs, istuple
+else:
+ import collections.abc as container_abcs
+
+import warnings
+from itertools import product
+from typing import Callable, Iterable, List, Optional, Union
+
+import torch.testing
+from torch.overrides import is_tensor_like
+
+
+def zero_gradients(x):
+ if isinstance(x, torch.Tensor):
+ if x.grad is not None:
+ x.grad.detach_()
+ x.grad.zero_()
+ elif isinstance(x, container_abcs.Iterable):
+ for elem in x:
+ zero_gradients(elem)
+
+
+def make_jacobian(input, num_out):
+ if is_tensor_like(input):
+ if not input.is_floating_point() and not input.is_complex():
+ return None
+ if not input.requires_grad:
+ return None
+ return input.new_zeros((input.nelement(), num_out),
+ dtype=input.dtype,
+ layout=torch.strided)
+ elif isinstance(input,
+ container_abcs.Iterable) and not isinstance(input, str):
+ jacobians = list(
+ filter(lambda x: x is not None,
+ (make_jacobian(elem, num_out) for elem in input)))
+ if not jacobians:
+ return None
+ return type(input)(jacobians) # type: ignore
+ else:
+ return None
+
+
+def iter_tensors(x: Union[torch.Tensor, Iterable[torch.Tensor]],
+ only_requiring_grad: bool = False) -> Iterable[torch.Tensor]:
+ if is_tensor_like(x):
+ # mypy doesn't narrow type of `x` to torch.Tensor
+ if x.requires_grad or not only_requiring_grad: # type: ignore
+ yield x # type: ignore
+ elif isinstance(x, container_abcs.Iterable) and not isinstance(x, str):
+ for elem in x:
+ for result in iter_tensors(elem, only_requiring_grad):
+ yield result
+
+
+def get_numerical_jacobian(fn, input, target=None, eps=1e-3, grad_out=1.0):
+ """
+ input: input to `fn`
+ target: the Tensors wrt whom Jacobians are calculated (default=`input`)
+ grad_out: grad output value used to calculate gradients.
+
+ Note that `target` may not even be part of `input` to `fn`, so please be
+ **very careful** in this to not clone `target`.
+ """
+ if target is None:
+ target = input
+ output_size = fn(input).numel()
+ jacobian = make_jacobian(target, output_size)
+
+ # It's much easier to iterate over flattened lists of tensors.
+ # These are reference to the same objects in jacobian, so any changes
+ # will be reflected in it as well.
+ x_tensors = iter_tensors(target, True)
+ j_tensors = iter_tensors(jacobian)
+
+ def update_jacobians(x, idx, d, d_idx, is_mkldnn=False):
+
+ # compute_jacobian only works for pure real
+ # or pure imaginary delta
+ def compute_gradient(delta):
+ # we currently assume that the norm of delta equals eps
+ assert (delta == eps or delta == (eps * 1j))
+
+ def fn_out():
+ if not is_mkldnn:
+ # x is a view into input and so this works
+ return fn(input).clone()
+ else:
+ # convert the dense tensor back to have mkldnn layout
+ return fn([x.to_mkldnn()])
+
+ orig = x[idx].item()
+ x[idx] = orig - delta
+ outa = fn_out()
+ x[idx] = orig + delta
+ outb = fn_out()
+ x[idx] = orig
+ r = (outb - outa) / (2 * eps)
+ return r.detach().reshape(-1)
+
+ # for details on the algorithm used here, refer:
+ # Section 3.5.3 https://arxiv.org/pdf/1701.00392.pdf
+ # s = fn(z) where z = x for real valued input
+ # and z = x + yj for complex valued input
+ ds_dx = compute_gradient(eps)
+ if x.is_complex(): # C -> C, C -> R
+ ds_dy = compute_gradient(eps * 1j)
+ # conjugate wirtinger derivative
+ conj_w_d = 0.5 * (ds_dx + ds_dy * 1j)
+ # wirtinger derivative
+ w_d = 0.5 * (ds_dx - ds_dy * 1j)
+ d[d_idx] = grad_out.conjugate() * conj_w_d + grad_out * w_d.conj()
+ elif ds_dx.is_complex(): # R -> C
+ # w_d = conj_w_d = 0.5 * ds_dx
+ # dL_dz_conj = 0.5 * [grad_out.conj() * ds_dx + grad_out * ds_dx.conj()]
+ # = 0.5 * [grad_out.conj() * ds_dx + (grad_out.conj() * ds_dx).conj()]
+ # = 0.5 * 2 * real(grad_out.conj() * ds_dx)
+ # = real(grad_out.conj() * ds_dx)
+ d[d_idx] = torch.real(grad_out.conjugate() * ds_dx)
+ else: # R -> R
+ d[d_idx] = ds_dx * grad_out
+
+ # TODO: compare structure
+ for x_tensor, d_tensor in zip(x_tensors, j_tensors):
+ if x_tensor.is_sparse:
+
+ def get_stride(size):
+ dim = len(size)
+ tmp = 1
+ stride = [0] * dim
+ for i in reversed(range(dim)):
+ stride[i] = tmp
+ tmp *= size[i]
+ return stride
+
+ x_nnz = x_tensor._nnz()
+ x_size = list(x_tensor.size())
+ x_indices = x_tensor._indices().t()
+ x_values = x_tensor._values()
+ x_stride = get_stride(x_size)
+
+ # Use .data here to get around the version check
+ x_values = x_values.data
+
+ for i in range(x_nnz):
+ x_value = x_values[i]
+ for x_idx in product(*[range(m) for m in x_values.size()[1:]]):
+ indices = x_indices[i].tolist() + list(x_idx)
+ d_idx = sum(indices[k] * x_stride[k]
+ for k in range(len(x_size)))
+ update_jacobians(x_value, x_idx, d_tensor, d_idx)
+ elif x_tensor.layout == torch._mkldnn: # type: ignore
+ # Use .data here to get around the version check
+ x_tensor = x_tensor.data
+ if len(input) != 1:
+ raise ValueError(
+ 'gradcheck currently only supports functions with 1 input, but got: ',
+ len(input))
+ for d_idx, x_idx in enumerate(
+ product(*[range(m) for m in x_tensor.size()])):
+ # this is really inefficient, but without indexing implemented, there's
+ # not really a better way than converting back and forth
+ x_tensor_dense = x_tensor.to_dense()
+ update_jacobians(x_tensor_dense,
+ x_idx,
+ d_tensor,
+ d_idx,
+ is_mkldnn=True)
+ else:
+ # Use .data here to get around the version check
+ x_tensor = x_tensor.data
+ for d_idx, x_idx in enumerate(
+ product(*[range(m) for m in x_tensor.size()])):
+ update_jacobians(x_tensor, x_idx, d_tensor, d_idx)
+
+ return jacobian
+
+
+def get_analytical_jacobian(input, output, nondet_tol=0.0, grad_out=1.0):
+ # it is easier to call to_dense() on the sparse output than
+ # to modify analytical jacobian
+ if output.is_sparse:
+ raise ValueError(
+ 'Sparse output is not supported at gradcheck yet. '
+ 'Please call to_dense() on the output of fn for gradcheck.')
+ if output.layout == torch._mkldnn: # type: ignore
+ raise ValueError(
+ 'MKLDNN output is not supported at gradcheck yet. '
+ 'Please call to_dense() on the output of fn for gradcheck.')
+ diff_input_list = list(iter_tensors(input, True))
+ jacobian = make_jacobian(input, output.numel())
+ jacobian_reentrant = make_jacobian(input, output.numel())
+ grad_output = torch.zeros_like(
+ output, memory_format=torch.legacy_contiguous_format)
+ flat_grad_output = grad_output.view(-1)
+ reentrant = True
+ correct_grad_sizes = True
+ correct_grad_types = True
+
+ for i in range(flat_grad_output.numel()):
+ flat_grad_output.zero_()
+ flat_grad_output[i] = grad_out
+ for jacobian_c in (jacobian, jacobian_reentrant):
+ grads_input = torch.autograd.grad(output,
+ diff_input_list,
+ grad_output,
+ retain_graph=True,
+ allow_unused=True)
+ for jacobian_x, d_x, x in zip(jacobian_c, grads_input,
+ diff_input_list):
+ if d_x is not None and d_x.size() != x.size():
+ correct_grad_sizes = False
+ elif d_x is not None and d_x.dtype != x.dtype:
+ correct_grad_types = False
+ elif jacobian_x.numel() != 0:
+ if d_x is None:
+ jacobian_x[:, i].zero_()
+ else:
+ d_x_dense = d_x.to_dense(
+ ) if not d_x.layout == torch.strided else d_x
+ assert jacobian_x[:, i].numel() == d_x_dense.numel()
+ jacobian_x[:, i] = d_x_dense.contiguous().view(-1)
+
+ for jacobian_x, jacobian_reentrant_x in zip(jacobian, jacobian_reentrant):
+ if jacobian_x.numel() != 0 and (
+ jacobian_x - jacobian_reentrant_x).abs().max() > nondet_tol:
+ reentrant = False
+
+ return jacobian, reentrant, correct_grad_sizes, correct_grad_types
+
+
+def _as_tuple(x):
+ if TORCH_MAJOR == 1 and TORCH_MINOR < 8:
+ b_tuple = istuple(x)
+ else:
+ b_tuple = isinstance(x, tuple)
+
+ if b_tuple:
+ return x
+ elif isinstance(x, list):
+ return tuple(x)
+ else:
+ return x,
+
+
+def _differentiable_outputs(x):
+ return tuple(o for o in _as_tuple(x) if o.requires_grad)
+
+
+# Note [VarArg of Tensors]
+# ~~~~~~~~~~~~~~~~~~~~~~~~
+# 'func' accepts a vararg of tensors, which isn't expressible in the type system at the moment.
+# If https://mypy.readthedocs.io/en/latest/additional_features.html?highlight=callable#extended-callable-types is accepted,
+# the '...' first argument of Callable can be replaced with VarArg(Tensor).
+# For now, we permit any input.
+
+
+def gradcheck(
+ func: Callable[
+ ..., Union[_TensorOrTensors]], # See Note [VarArg of Tensors]
+ inputs: _TensorOrTensors,
+ eps: float = 1e-6,
+ atol: float = 1e-5,
+ rtol: float = 1e-3,
+ raise_exception: bool = True,
+ check_sparse_nnz: bool = False,
+ nondet_tol: float = 0.0,
+ check_undefined_grad: bool = True,
+ check_grad_dtypes: bool = False) -> bool:
+ r"""Check gradients computed via small finite differences against analytical
+ gradients w.r.t. tensors in :attr:`inputs` that are of floating point or complex type
+ and with ``requires_grad=True``.
+
+ The check between numerical and analytical gradients uses :func:`~torch.allclose`.
+
+ For complex functions, no notion of Jacobian exists. Gradcheck verifies if the numerical and
+ analytical values of Wirtinger and Conjugate Wirtinger derivative are consistent. The gradient
+ computation is done under the assumption that the overall function has a real valued output.
+ For functions with complex output, gradcheck compares the numerical and analytical gradients
+ for two values of :attr:`grad_output`: 1 and 1j. For more details, check out
+ :ref:`complex_autograd-doc`.
+
+ .. note::
+ The default values are designed for :attr:`input` of double precision.
+ This check will likely fail if :attr:`input` is of less precision, e.g.,
+ ``FloatTensor``.
+
+ .. warning::
+ If any checked tensor in :attr:`input` has overlapping memory, i.e.,
+ different indices pointing to the same memory address (e.g., from
+ :func:`torch.expand`), this check will likely fail because the numerical
+ gradients computed by point perturbation at such indices will change
+ values at all other indices that share the same memory address.
+
+ Args:
+ func (function): a Python function that takes Tensor inputs and returns
+ a Tensor or a tuple of Tensors
+ inputs (tuple of Tensor or Tensor): inputs to the function
+ eps (float, optional): perturbation for finite differences
+ atol (float, optional): absolute tolerance
+ rtol (float, optional): relative tolerance
+ raise_exception (bool, optional): indicating whether to raise an exception if
+ the check fails. The exception gives more information about the
+ exact nature of the failure. This is helpful when debugging gradchecks.
+ check_sparse_nnz (bool, optional): if True, gradcheck allows for SparseTensor input,
+ and for any SparseTensor at input, gradcheck will perform check at nnz positions only.
+ nondet_tol (float, optional): tolerance for non-determinism. When running
+ identical inputs through the differentiation, the results must either match
+ exactly (default, 0.0) or be within this tolerance.
+        check_undefined_grad (bool, optional): if True, check if undefined output grads
+ are supported and treated as zeros, for ``Tensor`` outputs.
+
+ Returns:
+ True if all differences satisfy allclose condition
+ """
+ def fail_test(msg):
+ if raise_exception:
+ raise RuntimeError(msg)
+ return False
+
+ tupled_inputs = _as_tuple(inputs)
+ if not check_sparse_nnz and any(
+ t.is_sparse for t in tupled_inputs if isinstance(t, torch.Tensor)):
+ return fail_test(
+ 'gradcheck expects all tensor inputs are dense when check_sparse_nnz is set to False.'
+ )
+
+ # Make sure that gradients are saved for at least one input
+ any_input_requiring_grad = False
+ for idx, inp in enumerate(tupled_inputs):
+ if is_tensor_like(inp) and inp.requires_grad:
+ if not (inp.dtype == torch.float64
+ or inp.dtype == torch.complex128):
+ warnings.warn(
+ f'Input #{idx} requires gradient and '
+ 'is not a double precision floating point or complex. '
+ 'This check will likely fail if all the inputs are '
+ 'not of double precision floating point or complex. ')
+ content = inp._values() if inp.is_sparse else inp
+ # TODO: To cover more problematic cases, replace stride = 0 check with
+ # "any overlap in memory" once we have a proper function to check it.
+ if content.layout is not torch._mkldnn: # type: ignore
+ if not all(
+ st > 0 or sz <= 1
+ for st, sz in zip(content.stride(), content.size())):
+ raise RuntimeError(
+ 'The {}th input has a dimension with stride 0. gradcheck only '
+ 'supports inputs that are non-overlapping to be able to '
+ 'compute the numerical gradients correctly. You should call '
+ '.contiguous on the input before passing it to gradcheck.'
+ )
+ any_input_requiring_grad = True
+ inp.retain_grad()
+ if not any_input_requiring_grad:
+ raise ValueError(
+ 'gradcheck expects at least one input tensor to require gradient, '
+ 'but none of the them have requires_grad=True.')
+
+ func_out = func(*tupled_inputs)
+ output = _differentiable_outputs(func_out)
+
+ if not output:
+ for i, o in enumerate(func_out):
+
+ def fn(input):
+ return _as_tuple(func(*input))[i]
+
+ numerical = get_numerical_jacobian(fn, tupled_inputs, eps=eps)
+ for n in numerical:
+ if torch.ne(n, 0).sum() > 0:
+ return fail_test(
+ 'Numerical gradient for function expected to be zero')
+ return True
+
+ for i, o in enumerate(output):
+ if not o.requires_grad:
+ continue
+
+ def fn(input):
+ return _as_tuple(func(*input))[i]
+
+ analytical, reentrant, correct_grad_sizes, correct_grad_types = get_analytical_jacobian(
+ tupled_inputs, o, nondet_tol=nondet_tol)
+ numerical = get_numerical_jacobian(fn, tupled_inputs, eps=eps)
+
+ return analytical, numerical
+
+ out_is_complex = o.is_complex()
+
+ if out_is_complex:
+ # analytical vjp with grad_out = 1.0j
+ analytical_with_imag_grad_out, reentrant_with_imag_grad_out, \
+ correct_grad_sizes_with_imag_grad_out, correct_grad_types_with_imag_grad_out \
+ = get_analytical_jacobian(tupled_inputs, o, nondet_tol=nondet_tol, grad_out=1j)
+ numerical_with_imag_grad_out = get_numerical_jacobian(
+ fn, tupled_inputs, eps=eps, grad_out=1j)
+
+ if not correct_grad_types and check_grad_dtypes:
+ return fail_test('Gradient has dtype mismatch')
+
+ if out_is_complex and not correct_grad_types_with_imag_grad_out and check_grad_dtypes:
+ return fail_test(
+ 'Gradient (calculated using complex valued grad output) has dtype mismatch'
+ )
+
+ if not correct_grad_sizes:
+ return fail_test('Analytical gradient has incorrect size')
+
+ if out_is_complex and not correct_grad_sizes_with_imag_grad_out:
+ return fail_test(
+ 'Analytical gradient (calculated using complex valued grad output) has incorrect size'
+ )
+
+ def checkIfNumericalAnalyticAreClose(a, n, j, error_str=''):
+ if not torch.allclose(a, n, rtol, atol):
+ return fail_test(
+ error_str +
+ 'Jacobian mismatch for output %d with respect to input %d,\n'
+ 'numerical:%s\nanalytical:%s\n' % (i, j, n, a))
+
+ inp_tensors = iter_tensors(tupled_inputs, True)
+
+ for j, (a, n, inp) in enumerate(zip(analytical, numerical,
+ inp_tensors)):
+ if a.numel() != 0 or n.numel() != 0:
+ if o.is_complex():
+ # C -> C, R -> C
+ a_with_imag_grad_out = analytical_with_imag_grad_out[j]
+ n_with_imag_grad_out = numerical_with_imag_grad_out[j]
+ checkIfNumericalAnalyticAreClose(
+ a_with_imag_grad_out, n_with_imag_grad_out, j,
+ 'Gradients failed to compare equal for grad output = 1j. '
+ )
+ if inp.is_complex():
+ # C -> R, C -> C
+ checkIfNumericalAnalyticAreClose(
+ a, n, j,
+ 'Gradients failed to compare equal for grad output = 1. '
+ )
+ else:
+ # R -> R, R -> C
+ checkIfNumericalAnalyticAreClose(a, n, j)
+
+ def not_reentrant_error(error_str=''):
+ error_msg = 'Backward' + error_str + ' is not reentrant, i.e., running backward with same \
+ input and grad_output multiple times gives different values, \
+ although analytical gradient matches numerical gradient. \
+ The tolerance for nondeterminism was {}.'.format(
+ nondet_tol)
+ return fail_test(error_msg)
+
+ if not reentrant:
+ return not_reentrant_error()
+
+ if out_is_complex and not reentrant_with_imag_grad_out:
+ return not_reentrant_error(
+ ' (calculated using complex valued grad output)')
+
+ # check if the backward multiplies by grad_output
+ output = _differentiable_outputs(func(*tupled_inputs))
+ if any([o.requires_grad for o in output]):
+ diff_input_list: List[torch.Tensor] = list(
+ iter_tensors(tupled_inputs, True))
+ if not diff_input_list:
+ raise RuntimeError('no Tensors requiring grad found in input')
+ grads_input = torch.autograd.grad(
+ output,
+ diff_input_list, [
+ torch.zeros_like(o,
+ memory_format=torch.legacy_contiguous_format)
+ for o in output
+ ],
+ allow_unused=True)
+ for gi, di in zip(grads_input, diff_input_list):
+ if gi is None:
+ continue
+ if isinstance(gi, torch.Tensor) and gi.layout != torch.strided:
+ if gi.layout != di.layout:
+ return fail_test('grad is incorrect layout (' +
+ str(gi.layout) + ' is not ' +
+ str(di.layout) + ')')
+ if gi.layout == torch.sparse_coo:
+ if gi.sparse_dim() != di.sparse_dim():
+ return fail_test(
+ 'grad is sparse tensor, but has incorrect sparse_dim'
+ )
+ if gi.dense_dim() != di.dense_dim():
+ return fail_test(
+ 'grad is sparse tensor, but has incorrect dense_dim'
+ )
+ gi = gi.to_dense()
+ di = di.to_dense()
+ if not gi.eq(0).all():
+ return fail_test('backward not multiplied by grad_output')
+ if gi.dtype != di.dtype or gi.device != di.device or gi.is_sparse != di.is_sparse:
+ return fail_test('grad is incorrect type')
+ if gi.size() != di.size():
+ return fail_test('grad is incorrect size')
+
+ if check_undefined_grad:
+
+ def warn_bc_breaking():
+ warnings.warn((
+ 'Backwards compatibility: New undefined gradient support checking '
+ 'feature is enabled by default, but it may break existing callers '
+ 'of this function. If this is true for you, you can call this '
+ 'function with "check_undefined_grad=False" to disable the feature'
+ ))
+
+ def check_undefined_grad_support(output_to_check):
+ grads_output = [
+ torch.zeros_like(
+ o, memory_format=torch.legacy_contiguous_format)
+ for o in output_to_check
+ ]
+ try:
+ grads_input = torch.autograd.grad(output_to_check,
+ diff_input_list,
+ grads_output,
+ allow_unused=True)
+ except RuntimeError:
+ warn_bc_breaking()
+ return fail_test((
+ 'Expected backward function to handle undefined output grads. '
+ 'Please look at "Notes about undefined output gradients" in '
+ '"tools/autograd/derivatives.yaml"'))
+
+ for gi, i in zip(grads_input, diff_input_list):
+ if (gi is not None) and (not gi.eq(0).all()):
+ warn_bc_breaking()
+ return fail_test((
+ 'Expected all input grads to be undefined or zero when all output grads are undefined '
+ 'or zero. Please look at "Notes about undefined output gradients" in '
+ '"tools/autograd/derivatives.yaml"'))
+ return True
+
+ # All backward functions must work properly if all output grads are undefined
+ outputs_to_check = [[
+ torch._C._functions.UndefinedGrad()(o)
+ for o in _differentiable_outputs(func(*tupled_inputs))
+ # This check filters out Tensor-likes that aren't instances of Tensor.
+ if isinstance(o, torch.Tensor)
+ ]]
+
+ # If there are multiple output grads, we should be able to undef one at a time without error
+ if len(outputs_to_check[0]) > 1:
+ for undef_grad_idx in range(len(output)):
+ output_to_check = _differentiable_outputs(
+ func(*tupled_inputs))
+ outputs_to_check.append([
+ torch._C._functions.UndefinedGrad()(o)
+ if idx == undef_grad_idx else o
+ for idx, o in enumerate(output_to_check)
+ ])
+
+ for output_to_check in outputs_to_check:
+ if not check_undefined_grad_support(output_to_check):
+ return False
+
+ return True
+
+
+def gradgradcheck(
+ func: Callable[..., _TensorOrTensors], # See Note [VarArg of Tensors]
+ inputs: _TensorOrTensors,
+ grad_outputs: Optional[_TensorOrTensors] = None,
+ eps: float = 1e-6,
+ atol: float = 1e-5,
+ rtol: float = 1e-3,
+ gen_non_contig_grad_outputs: bool = False,
+ raise_exception: bool = True,
+ nondet_tol: float = 0.0,
+ check_undefined_grad: bool = True,
+ check_grad_dtypes: bool = False) -> bool:
+ r"""Check gradients of gradients computed via small finite differences
+ against analytical gradients w.r.t. tensors in :attr:`inputs` and
+ :attr:`grad_outputs` that are of floating point or complex type and with
+ ``requires_grad=True``.
+
+ This function checks that backpropagating through the gradients computed
+ to the given :attr:`grad_outputs` are correct.
+
+ The check between numerical and analytical gradients uses :func:`~torch.allclose`.
+
+ .. note::
+ The default values are designed for :attr:`input` and
+ :attr:`grad_outputs` of double precision. This check will likely fail if
+ they are of less precision, e.g., ``FloatTensor``.
+
+ .. warning::
+ If any checked tensor in :attr:`input` and :attr:`grad_outputs` has
+ overlapping memory, i.e., different indices pointing to the same memory
+ address (e.g., from :func:`torch.expand`), this check will likely fail
+ because the numerical gradients computed by point perturbation at such
+ indices will change values at all other indices that share the same
+ memory address.
+
+ Args:
+ func (function): a Python function that takes Tensor inputs and returns
+ a Tensor or a tuple of Tensors
+ inputs (tuple of Tensor or Tensor): inputs to the function
+ grad_outputs (tuple of Tensor or Tensor, optional): The gradients with
+ respect to the function's outputs.
+ eps (float, optional): perturbation for finite differences
+ atol (float, optional): absolute tolerance
+ rtol (float, optional): relative tolerance
+ gen_non_contig_grad_outputs (bool, optional): if :attr:`grad_outputs` is
+ ``None`` and :attr:`gen_non_contig_grad_outputs` is ``True``, the
+ randomly generated gradient outputs are made to be noncontiguous
+ raise_exception (bool, optional): indicating whether to raise an exception if
+ the check fails. The exception gives more information about the
+ exact nature of the failure. This is helpful when debugging gradchecks.
+ nondet_tol (float, optional): tolerance for non-determinism. When running
+ identical inputs through the differentiation, the results must either match
+ exactly (default, 0.0) or be within this tolerance. Note that a small amount
+ of nondeterminism in the gradient will lead to larger inaccuracies in
+ the second derivative.
+        check_undefined_grad (bool, optional): if True, check if undefined output grads
+ are supported and treated as zeros
+
+ Returns:
+ True if all differences satisfy allclose condition
+ """
+ tupled_inputs = _as_tuple(inputs)
+
+ if grad_outputs is None:
+ # If grad_outputs is not specified, create random Tensors of the same
+ # shape, type, and device as the outputs
+ def randn_like(x):
+ y = torch.testing.randn_like(
+ x if (x.is_floating_point() or x.is_complex()) else x.double(),
+ memory_format=torch.legacy_contiguous_format)
+ if gen_non_contig_grad_outputs:
+ y = torch.testing.make_non_contiguous(y)
+ return y.requires_grad_()
+
+ outputs = _as_tuple(func(*tupled_inputs))
+ tupled_grad_outputs = tuple(randn_like(x) for x in outputs)
+ else:
+ tupled_grad_outputs = _as_tuple(grad_outputs)
+
+ num_outputs = len(tupled_grad_outputs)
+
+ def new_func(*args):
+ input_args = args[:-num_outputs]
+ grad_outputs = args[-num_outputs:]
+ outputs = _differentiable_outputs(func(*input_args))
+ input_args = tuple(x for x in input_args
+ if isinstance(x, torch.Tensor) and x.requires_grad)
+ grad_inputs = torch.autograd.grad(outputs,
+ input_args,
+ grad_outputs,
+ create_graph=True)
+ return grad_inputs
+
+ return gradcheck(new_func,
+ tupled_inputs + tupled_grad_outputs,
+ eps,
+ atol,
+ rtol,
+ raise_exception,
+ nondet_tol=nondet_tol,
+ check_undefined_grad=check_undefined_grad,
+ check_grad_dtypes=check_grad_dtypes)
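+
+
+# Illustrative usage (assumed function and shapes): this modified gradcheck
+# returns the analytical and numerical Jacobians for inspection instead of a
+# plain pass/fail flag.
+#
+#   x = torch.randn(4, 3, dtype=torch.double, requires_grad=True)
+#   analytical, numerical = gradcheck(lambda t: (t * t).sum(-1), (x,), eps=1e-6)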
diff --git a/third_party/dpvo_ext/lietorch/group_ops.py b/third_party/dpvo_ext/lietorch/group_ops.py
new file mode 100644
index 0000000..0f19318
--- /dev/null
+++ b/third_party/dpvo_ext/lietorch/group_ops.py
@@ -0,0 +1,107 @@
+import lietorch_backends
+import torch
+import torch.nn.functional as F
+
+
+class GroupOp(torch.autograd.Function):
+ """group operation base class."""
+ @classmethod
+ def forward(cls, ctx, group_id, *inputs):
+ ctx.group_id = group_id
+ ctx.save_for_backward(*inputs)
+ out = cls.forward_op(ctx.group_id, *inputs)
+ return out
+
+ @classmethod
+ def backward(cls, ctx, grad):
+ error_str = 'Backward operation not implemented for {}'.format(cls)
+ assert cls.backward_op is not None, error_str
+
+ inputs = ctx.saved_tensors
+ grad = grad.contiguous()
+ grad_inputs = cls.backward_op(ctx.group_id, grad, *inputs)
+ return (None, ) + tuple(grad_inputs)
+
+
+class Exp(GroupOp):
+ """exponential map."""
+ forward_op, backward_op = lietorch_backends.expm, lietorch_backends.expm_backward
+
+
+class Log(GroupOp):
+ """logarithm map."""
+ forward_op, backward_op = lietorch_backends.logm, lietorch_backends.logm_backward
+
+
+class Inv(GroupOp):
+ """group inverse."""
+ forward_op, backward_op = lietorch_backends.inv, lietorch_backends.inv_backward
+
+
+class Mul(GroupOp):
+ """group multiplication."""
+ forward_op, backward_op = lietorch_backends.mul, lietorch_backends.mul_backward
+
+
+class Adj(GroupOp):
+ """adjoint operator."""
+ forward_op, backward_op = lietorch_backends.adj, lietorch_backends.adj_backward
+
+
+class AdjT(GroupOp):
+ """adjoint operator."""
+ forward_op, backward_op = lietorch_backends.adjT, lietorch_backends.adjT_backward
+
+
+class Act3(GroupOp):
+ """action on point."""
+ forward_op, backward_op = lietorch_backends.act, lietorch_backends.act_backward
+
+
+class Act4(GroupOp):
+ """action on point."""
+ forward_op, backward_op = lietorch_backends.act4, lietorch_backends.act4_backward
+
+
+class Jinv(GroupOp):
+ """adjoint operator."""
+ forward_op, backward_op = lietorch_backends.Jinv, None
+
+
+class ToMatrix(GroupOp):
+ """convert to matrix representation."""
+ forward_op, backward_op = lietorch_backends.as_matrix, None
+
+
+### conversion operations to/from Euclidean embeddings ###
+
+
+class FromVec(torch.autograd.Function):
+ """convert vector into group object."""
+ @classmethod
+ def forward(cls, ctx, group_id, *inputs):
+ ctx.group_id = group_id
+ ctx.save_for_backward(*inputs)
+ return inputs[0]
+
+ @classmethod
+ def backward(cls, ctx, grad):
+ inputs = ctx.saved_tensors
+ J = lietorch_backends.projector(ctx.group_id, *inputs)
+ return None, torch.matmul(grad.unsqueeze(-2),
+ torch.linalg.pinv(J)).squeeze(-2)
+
+
+class ToVec(torch.autograd.Function):
+ """convert group object to vector."""
+ @classmethod
+ def forward(cls, ctx, group_id, *inputs):
+ ctx.group_id = group_id
+ ctx.save_for_backward(*inputs)
+ return inputs[0]
+
+ @classmethod
+ def backward(cls, ctx, grad):
+ inputs = ctx.saved_tensors
+ J = lietorch_backends.projector(ctx.group_id, *inputs)
+ return None, torch.matmul(grad.unsqueeze(-2), J).squeeze(-2)
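+
+
+# Ops are normally invoked through LieGroup.apply_op, but can be called directly
+# via autograd, e.g. (illustrative; group_id 3 corresponds to SE3 in groups.py):
+#
+#   phi = torch.randn(10, 6, device='cuda', requires_grad=True)
+#   X_data = Exp.apply(3, phi)   # 7-dim embedding [t, q] per element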
diff --git a/third_party/dpvo_ext/lietorch/groups.py b/third_party/dpvo_ext/lietorch/groups.py
new file mode 100644
index 0000000..bca42c0
--- /dev/null
+++ b/third_party/dpvo_ext/lietorch/groups.py
@@ -0,0 +1,328 @@
+import numpy as np
+import torch
+
+from .broadcasting import broadcast_inputs
+# group operations implemented in cuda
+from .group_ops import (Act3, Act4, Adj, AdjT, Exp, FromVec, Inv, Jinv, Log,
+ Mul, ToMatrix, ToVec)
+
+
+class LieGroupParameter(torch.Tensor):
+ """Wrapper class for LieGroup."""
+
+ from torch._C import _disabled_torch_function_impl
+ __torch_function__ = _disabled_torch_function_impl
+
+ def __new__(cls, group, requires_grad=True):
+ data = torch.zeros(group.tangent_shape,
+ device=group.data.device,
+ dtype=group.data.dtype,
+ requires_grad=True)
+
+ return torch.Tensor._make_subclass(cls, data, requires_grad)
+
+ def __init__(self, group):
+ self.group = group
+
+ def retr(self):
+ return self.group.retr(self)
+
+ def log(self):
+ return self.retr().log()
+
+ def inv(self):
+ return self.retr().inv()
+
+ def adj(self, a):
+ return self.retr().adj(a)
+
+ def __mul__(self, other):
+ if isinstance(other, LieGroupParameter):
+ return self.retr() * other.retr()
+ else:
+ return self.retr() * other
+
+ def add_(self, update, alpha):
+ self.group = self.group.exp(alpha * update) * self.group
+
+ def __getitem__(self, index):
+ return self.retr().__getitem__(index)
+
+
+class LieGroup:
+ """Base class for Lie Group."""
+ def __init__(self, data):
+ self.data = data
+
+ def __repr__(self):
+ return '{}: size={}, device={}, dtype={}'.format(
+ self.group_name, self.shape, self.device, self.dtype)
+
+ @property
+ def shape(self):
+ return self.data.shape[:-1]
+
+ @property
+ def device(self):
+ return self.data.device
+
+ @property
+ def dtype(self):
+ return self.data.dtype
+
+ def vec(self):
+ return self.apply_op(ToVec, self.data)
+
+ @property
+ def tangent_shape(self):
+ return self.data.shape[:-1] + (self.manifold_dim, )
+
+ @classmethod
+ def Identity(cls, *batch_shape, **kwargs):
+ """Construct identity element with batch shape."""
+
+ if isinstance(batch_shape[0], tuple):
+ batch_shape = batch_shape[0]
+
+ elif isinstance(batch_shape[0], list):
+ batch_shape = tuple(batch_shape[0])
+
+ numel = np.prod(batch_shape)
+ data = cls.id_elem.reshape(1, -1)
+
+ if 'device' in kwargs:
+ data = data.to(kwargs['device'])
+
+ if 'dtype' in kwargs:
+ data = data.type(kwargs['dtype'])
+
+ data = data.repeat(numel, 1)
+ return cls(data).view(batch_shape)
+
+ @classmethod
+ def IdentityLike(cls, G):
+ return cls.Identity(G.shape, device=G.data.device, dtype=G.data.dtype)
+
+ @classmethod
+ def InitFromVec(cls, data):
+ return cls(cls.apply_op(FromVec, data))
+
+ @classmethod
+ def Random(cls, *batch_shape, sigma=1.0, **kwargs):
+ """Construct random element with batch_shape by random sampling in
+ tangent space."""
+
+ if isinstance(batch_shape[0], tuple):
+ batch_shape = batch_shape[0]
+
+ elif isinstance(batch_shape[0], list):
+ batch_shape = tuple(batch_shape[0])
+
+ tangent_shape = batch_shape + (cls.manifold_dim, )
+ xi = torch.randn(tangent_shape, **kwargs)
+ return cls.exp(sigma * xi)
+
+ @classmethod
+ def apply_op(cls, op, x, y=None):
+ """Apply group operator."""
+ inputs, out_shape = broadcast_inputs(x, y)
+
+ data = op.apply(cls.group_id, *inputs)
+ return data.view(out_shape + (-1, ))
+
+ @classmethod
+ def exp(cls, x):
+ """exponential map: x -> X."""
+ return cls(cls.apply_op(Exp, x))
+
+ def quaternion(self):
+ """extract quaternion."""
+ return self.apply_op(Quat, self.data)
+
+ def log(self):
+ """logarithm map."""
+ return self.apply_op(Log, self.data)
+
+ def inv(self):
+ """group inverse."""
+ return self.__class__(self.apply_op(Inv, self.data))
+
+ def mul(self, other):
+ """group multiplication."""
+ return self.__class__(self.apply_op(Mul, self.data, other.data))
+
+ def retr(self, a):
+ """ retraction: Exp(a) * X """
+ dX = self.__class__.apply_op(Exp, a)
+ return self.__class__(self.apply_op(Mul, dX, self.data))
+
+ def adj(self, a):
+ """ adjoint operator: b = A(X) * a """
+ return self.apply_op(Adj, self.data, a)
+
+ def adjT(self, a):
+ """ transposed adjoint operator: b = a * A(X) """
+ return self.apply_op(AdjT, self.data, a)
+
+ def Jinv(self, a):
+ return self.apply_op(Jinv, self.data, a)
+
+ def act(self, p):
+ """action on a point cloud."""
+
+ # action on point
+ if p.shape[-1] == 3:
+ return self.apply_op(Act3, self.data, p)
+
+ # action on homogeneous point
+ elif p.shape[-1] == 4:
+ return self.apply_op(Act4, self.data, p)
+
+ def matrix(self):
+ """convert element to 4x4 matrix."""
+ I = torch.eye(4, dtype=self.dtype, device=self.device)
+ I = I.view([1] * (len(self.data.shape) - 1) + [4, 4])
+ return self.__class__(self.data[..., None, :]).act(I).transpose(-1, -2)
+
+ def translation(self):
+ """extract translation component."""
+ p = torch.as_tensor([0.0, 0.0, 0.0, 1.0],
+ dtype=self.dtype,
+ device=self.device)
+ p = p.view([1] * (len(self.data.shape) - 1) + [
+ 4,
+ ])
+ return self.apply_op(Act4, self.data, p)
+
+ def detach(self):
+ return self.__class__(self.data.detach())
+
+ def view(self, dims):
+ data_reshaped = self.data.view(dims + (self.embedded_dim, ))
+ return self.__class__(data_reshaped)
+
+ def __mul__(self, other):
+ # group multiplication
+
+ if isinstance(other, LieGroup):
+ return self.mul(other)
+
+ # action on point
+ elif isinstance(other, torch.Tensor):
+ return self.act(other)
+
+ def __getitem__(self, index):
+ return self.__class__(self.data[index])
+
+ def __setitem__(self, index, item):
+ self.data[index] = item.data
+
+ def to(self, *args, **kwargs):
+ return self.__class__(self.data.to(*args, **kwargs))
+
+ def cpu(self):
+ return self.__class__(self.data.cpu())
+
+ def cuda(self):
+ return self.__class__(self.data.cuda())
+
+ def float(self, device):
+ return self.__class__(self.data.float())
+
+ def double(self, device):
+ return self.__class__(self.data.double())
+
+ def unbind(self, dim=0):
+ return [self.__class__(x) for x in self.data.unbind(dim=dim)]
+
+
+class SO3(LieGroup):
+ group_name = 'SO3'
+ group_id = 1
+ manifold_dim = 3
+ embedded_dim = 4
+
+ # unit quaternion
+ id_elem = torch.as_tensor([0.0, 0.0, 0.0, 1.0])
+
+ def __init__(self, data):
+ if isinstance(data, SE3):
+ data = data.data[..., 3:7]
+
+ super(SO3, self).__init__(data)
+
+
+class RxSO3(LieGroup):
+ group_name = 'RxSO3'
+ group_id = 2
+ manifold_dim = 4
+ embedded_dim = 5
+
+ # unit quaternion
+ id_elem = torch.as_tensor([0.0, 0.0, 0.0, 1.0, 1.0])
+
+ def __init__(self, data):
+ if isinstance(data, Sim3):
+ data = data.data[..., 3:8]
+
+ super(RxSO3, self).__init__(data)
+
+
+class SE3(LieGroup):
+ group_name = 'SE3'
+ group_id = 3
+ manifold_dim = 6
+ embedded_dim = 7
+
+ # translation, unit quaternion
+ id_elem = torch.as_tensor([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0])
+
+ def __init__(self, data):
+ if isinstance(data, SO3):
+ translation = torch.zeros_like(data.data[..., :3])
+ data = torch.cat([translation, data.data], -1)
+
+ super(SE3, self).__init__(data)
+
+ def scale(self, s):
+ t, q = self.data.split([3, 4], -1)
+ t = t * s.unsqueeze(-1)
+ return SE3(torch.cat([t, q], dim=-1))
+
+
+class Sim3(LieGroup):
+ group_name = 'Sim3'
+ group_id = 4
+ manifold_dim = 7
+ embedded_dim = 8
+
+ # translation, unit quaternion, scale
+ id_elem = torch.as_tensor([0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0])
+
+ def __init__(self, data):
+
+ if isinstance(data, SO3):
+            scale = torch.ones_like(data.data[..., :1])
+            translation = torch.zeros_like(data.data[..., :3])
+            data = torch.cat([translation, data.data, scale], -1)
+
+ elif isinstance(data, SE3):
+ scale = torch.ones_like(data.data[..., :1])
+ data = torch.cat([data.data, scale], -1)
+
+ elif isinstance(data, Sim3):
+ data = data.data
+
+ super(Sim3, self).__init__(data)
+
+
+def cat(group_list, dim):
+ """Concatenate groups along dimension."""
+ data = torch.cat([X.data for X in group_list], dim=dim)
+ return group_list[0].__class__(data)
+
+
+def stack(group_list, dim):
+    """Stack groups along a new dimension."""
+ data = torch.stack([X.data for X in group_list], dim=dim)
+ return group_list[0].__class__(data)
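+
+
+# Minimal usage sketch (illustrative only):
+#
+#   xi = torch.randn(8, 6, device='cuda')     # tangent vectors
+#   T = SE3.exp(xi)                           # batch of SE3 elements
+#   p = torch.randn(8, 3, device='cuda')
+#   q = T * p                                 # act on points
+#   dT = (T * T.inv()).log()                  # ~ zero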
diff --git a/third_party/dpvo_ext/lietorch/include/common.h b/third_party/dpvo_ext/lietorch/include/common.h
new file mode 100644
index 0000000..80809ed
--- /dev/null
+++ b/third_party/dpvo_ext/lietorch/include/common.h
@@ -0,0 +1,11 @@
+#ifndef COMMON_H
+#define COMMON_H
+
+#define EIGEN_DEFAULT_DENSE_INDEX_TYPE int
+#define EIGEN_RUNTIME_NO_MALLOC
+
+#define EPS 1e-6
+#define PI 3.14159265358979323846
+
+
+#endif
diff --git a/third_party/dpvo_ext/lietorch/include/dispatch.h b/third_party/dpvo_ext/lietorch/include/dispatch.h
new file mode 100644
index 0000000..578b292
--- /dev/null
+++ b/third_party/dpvo_ext/lietorch/include/dispatch.h
@@ -0,0 +1,47 @@
+#ifndef DISPATCH_H
+#define DISPATCH_H
+
+#include <torch/extension.h>
+
+#include "so3.h"
+#include "rxso3.h"
+#include "se3.h"
+#include "sim3.h"
+
+
+#define PRIVATE_CASE_TYPE(group_index, enum_type, type, ...) \
+ case enum_type: { \
+ using scalar_t = type; \
+ switch (group_index) { \
+ case 1: { \
+ using group_t = SO3; \
+ return __VA_ARGS__(); \
+ } \
+ case 2: { \
+ using group_t = RxSO3; \
+ return __VA_ARGS__(); \
+ } \
+ case 3: { \
+ using group_t = SE3; \
+ return __VA_ARGS__(); \
+ } \
+ case 4: { \
+ using group_t = Sim3; \
+ return __VA_ARGS__(); \
+ } \
+ } \
+ } \
+
+#define DISPATCH_GROUP_AND_FLOATING_TYPES(GROUP_INDEX, TYPE, NAME, ...) \
+ [&] { \
+ const auto& the_type = TYPE; \
+ /* don't use TYPE again in case it is an expensive or side-effect op */ \
+ at::ScalarType _st = ::detail::scalar_type(the_type); \
+ switch (_st) { \
+ PRIVATE_CASE_TYPE(GROUP_INDEX, at::ScalarType::Double, double, __VA_ARGS__) \
+ PRIVATE_CASE_TYPE(GROUP_INDEX, at::ScalarType::Float, float, __VA_ARGS__) \
+ default: break; \
+ } \
+ }()
+
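+// Assumed usage pattern (illustrative, not part of this header): dispatch on the
+// group index and tensor dtype, exposing `group_t` and `scalar_t` to the lambda.
+//
+//   DISPATCH_GROUP_AND_FLOATING_TYPES(group_index, X.type(), "exp_forward", ([&] {
+//     exp_forward_kernel<group_t, scalar_t><<<blocks, threads>>>(...);
+//   }));
+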
+#endif
diff --git a/third_party/dpvo_ext/lietorch/include/lietorch_cpu.h b/third_party/dpvo_ext/lietorch/include/lietorch_cpu.h
new file mode 100644
index 0000000..8510443
--- /dev/null
+++ b/third_party/dpvo_ext/lietorch/include/lietorch_cpu.h
@@ -0,0 +1,48 @@
+
+#ifndef LIETORCH_CPU_H_
+#define LIETORCH_CPU_H_
+
+#include <torch/extension.h>
+#include <vector>
+
+
+// unary operations
+torch::Tensor exp_forward_cpu(int, torch::Tensor);
+std::vector<torch::Tensor> exp_backward_cpu(int, torch::Tensor, torch::Tensor);
+
+torch::Tensor log_forward_cpu(int, torch::Tensor);
+std::vector<torch::Tensor> log_backward_cpu(int, torch::Tensor, torch::Tensor);
+
+torch::Tensor inv_forward_cpu(int, torch::Tensor);
+std::vector<torch::Tensor> inv_backward_cpu(int, torch::Tensor, torch::Tensor);
+
+// binary operations
+torch::Tensor mul_forward_cpu(int, torch::Tensor, torch::Tensor);
+std::vector<torch::Tensor> mul_backward_cpu(int, torch::Tensor, torch::Tensor, torch::Tensor);
+
+torch::Tensor adj_forward_cpu(int, torch::Tensor, torch::Tensor);
+std::vector<torch::Tensor> adj_backward_cpu(int, torch::Tensor, torch::Tensor, torch::Tensor);
+
+torch::Tensor adjT_forward_cpu(int, torch::Tensor, torch::Tensor);
+std::vector<torch::Tensor> adjT_backward_cpu(int, torch::Tensor, torch::Tensor, torch::Tensor);
+
+torch::Tensor act_forward_cpu(int, torch::Tensor, torch::Tensor);
+std::vector<torch::Tensor> act_backward_cpu(int, torch::Tensor, torch::Tensor, torch::Tensor);
+
+torch::Tensor act4_forward_cpu(int, torch::Tensor, torch::Tensor);
+std::vector<torch::Tensor> act4_backward_cpu(int, torch::Tensor, torch::Tensor, torch::Tensor);
+
+
+// conversion operations
+// std::vector<torch::Tensor> to_vec_backward_cpu(int, torch::Tensor, torch::Tensor);
+// std::vector<torch::Tensor> from_vec_backward_cpu(int, torch::Tensor, torch::Tensor);
+
+// utility operations
+torch::Tensor orthogonal_projector_cpu(int, torch::Tensor);
+
+torch::Tensor as_matrix_forward_cpu(int, torch::Tensor);
+
+torch::Tensor jleft_forward_cpu(int, torch::Tensor, torch::Tensor);
+
+
+#endif
diff --git a/third_party/dpvo_ext/lietorch/include/lietorch_gpu.h b/third_party/dpvo_ext/lietorch/include/lietorch_gpu.h
new file mode 100644
index 0000000..344b629
--- /dev/null
+++ b/third_party/dpvo_ext/lietorch/include/lietorch_gpu.h
@@ -0,0 +1,48 @@
+
+#ifndef LIETORCH_GPU_H_
+#define LIETORCH_GPU_H_
+
+#include <torch/extension.h>
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <vector>
+
+
+// unary operations
+torch::Tensor exp_forward_gpu(int, torch::Tensor);
+std::vector<torch::Tensor> exp_backward_gpu(int, torch::Tensor, torch::Tensor);
+
+torch::Tensor log_forward_gpu(int, torch::Tensor);
+std::vector<torch::Tensor> log_backward_gpu(int, torch::Tensor, torch::Tensor);
+
+torch::Tensor inv_forward_gpu(int, torch::Tensor);
+std::vector<torch::Tensor> inv_backward_gpu(int, torch::Tensor, torch::Tensor);
+
+// binary operations
+torch::Tensor mul_forward_gpu(int, torch::Tensor, torch::Tensor);
+std::vector<torch::Tensor> mul_backward_gpu(int, torch::Tensor, torch::Tensor, torch::Tensor);
+
+torch::Tensor adj_forward_gpu(int, torch::Tensor, torch::Tensor);
+std::vector<torch::Tensor> adj_backward_gpu(int, torch::Tensor, torch::Tensor, torch::Tensor);
+
+torch::Tensor adjT_forward_gpu(int, torch::Tensor, torch::Tensor);
+std::vector<torch::Tensor> adjT_backward_gpu(int, torch::Tensor, torch::Tensor, torch::Tensor);
+
+torch::Tensor act_forward_gpu(int, torch::Tensor, torch::Tensor);
+std::vector<torch::Tensor> act_backward_gpu(int, torch::Tensor, torch::Tensor, torch::Tensor);
+
+torch::Tensor act4_forward_gpu(int, torch::Tensor, torch::Tensor);
+std::vector<torch::Tensor> act4_backward_gpu(int, torch::Tensor, torch::Tensor, torch::Tensor);
+
+// conversion operations
+// std::vector<torch::Tensor> to_vec_backward_gpu(int, torch::Tensor, torch::Tensor);
+// std::vector<torch::Tensor> from_vec_backward_gpu(int, torch::Tensor, torch::Tensor);
+
+// utility operators
+torch::Tensor orthogonal_projector_gpu(int, torch::Tensor);
+
+torch::Tensor as_matrix_forward_gpu(int, torch::Tensor);
+
+torch::Tensor jleft_forward_gpu(int, torch::Tensor, torch::Tensor);
+
+#endif
diff --git a/third_party/dpvo_ext/lietorch/include/rxso3.h b/third_party/dpvo_ext/lietorch/include/rxso3.h
new file mode 100644
index 0000000..e536472
--- /dev/null
+++ b/third_party/dpvo_ext/lietorch/include/rxso3.h
@@ -0,0 +1,322 @@
+
+#ifndef RxSO3_HEADER
+#define RxSO3_HEADER
+
+#include <stdio.h>
+#include <Eigen/Dense>
+#include <Eigen/Geometry>
+
+#include "common.h"
+
+template <typename Scalar>
+class RxSO3 {
+ public:
+ const static int constexpr K = 4; // manifold dimension
+ const static int constexpr N = 5; // embedding dimension
+
+    using Vector3 = Eigen::Matrix<Scalar,3,1>;
+    using Vector4 = Eigen::Matrix<Scalar,4,1>;
+    using Matrix3 = Eigen::Matrix<Scalar,3,3>;
+
+    using Tangent = Eigen::Matrix<Scalar,K,1>;
+    using Data = Eigen::Matrix<Scalar,N,1>;
+
+    using Point = Eigen::Matrix<Scalar,3,1>;
+    using Point4 = Eigen::Matrix<Scalar,4,1>;
+
+    using Quaternion = Eigen::Quaternion<Scalar>;
+    using Transformation = Eigen::Matrix<Scalar,3,3>;
+    using Adjoint = Eigen::Matrix<Scalar,K,K>;
+
+
+ EIGEN_DEVICE_FUNC RxSO3(Quaternion const& q, Scalar const s)
+ : unit_quaternion(q), scale(s) {
+ unit_quaternion.normalize();
+ };
+
+ EIGEN_DEVICE_FUNC RxSO3(const Scalar *data) : unit_quaternion(data), scale(data[4]) {
+ unit_quaternion.normalize();
+ };
+
+ EIGEN_DEVICE_FUNC RxSO3() {
+ unit_quaternion = Quaternion::Identity();
+ scale = Scalar(1.0);
+ }
+
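+ // inverse: conjugate the unit quaternion and take the reciprocal of the scale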
+ EIGEN_DEVICE_FUNC RxSO3 inv() {
+ return RxSO3(unit_quaternion.conjugate(), 1.0/scale);
+ }
+
+ EIGEN_DEVICE_FUNC Data data() const {
+ Data data_vec; data_vec << unit_quaternion.coeffs(), scale;
+ return data_vec;
+ }
+
+ EIGEN_DEVICE_FUNC RxSO3 operator*(RxSO3 const& other) {
+ return RxSO3(unit_quaternion * other.unit_quaternion, scale * other.scale);
+ }
+
+ EIGEN_DEVICE_FUNC Point operator*(Point const& p) const {
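+ // rotate p with the unit quaternion (p' = p + 2w(v×p) + 2v×(v×p)), then apply the scale factor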
+ const Quaternion& q = unit_quaternion;
+ Point uv = q.vec().cross(p); uv += uv;
+ return scale * (p + q.w()*uv + q.vec().cross(uv));
+ }
+
+ EIGEN_DEVICE_FUNC Point4 act4(Point4 const& p) const {
+ Point4 p1; p1 << this->operator*(p.template segment<3>(0)), p(3);
+ return p1;
+ }
+
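+ // adjoint is block-diagonal: R acts on the rotation part, the scale part is unchanged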
+ EIGEN_DEVICE_FUNC Adjoint Adj() const {
+ Adjoint Ad = Adjoint::Identity();
+ Ad.template block<3,3>(0,0) = unit_quaternion.toRotationMatrix();
+ return Ad;
+ }
+
+ EIGEN_DEVICE_FUNC Transformation Matrix() const {
+ return scale * unit_quaternion.toRotationMatrix();
+ }
+
+ EIGEN_DEVICE_FUNC Eigen::Matrix<Scalar,4,4> Matrix4x4() const {
+ Eigen::Matrix<Scalar,4,4> T;
+ T = Eigen::Matrix<Scalar,4,4>::Identity();
+ T.template block<3,3>(0,0) = Matrix();
+ return T;
+ }
+
+ EIGEN_DEVICE_FUNC Eigen::Matrix<Scalar,N,N> orthogonal_projector() const {
+ // jacobian action on a point
+ Eigen::Matrix<Scalar,N,N> J = Eigen::Matrix<Scalar,N,N>::Zero();
+
+ J.template block<3,3>(0,0) = 0.5 * (
+ unit_quaternion.w() * Matrix3::Identity() +
+ SO3<Scalar>::hat(-unit_quaternion.vec())
+ );
+
+ J.template block<1,3>(3,0) = 0.5 * (-unit_quaternion.vec());
+
+ // scale
+ J(4,3) = scale;
+
+ return J;
+ }
+
+ EIGEN_DEVICE_FUNC Transformation Rotation() const {
+ return unit_quaternion.toRotationMatrix();
+ }
+
+ EIGEN_DEVICE_FUNC Tangent Adj(Tangent const& a) const {
+ return Adj() * a;
+ }
+
+ EIGEN_DEVICE_FUNC Tangent AdjT(Tangent const& a) const {
+ return Adj().transpose() * a;
+ }
+
+ EIGEN_DEVICE_FUNC static Transformation hat(Tangent const& phi_sigma) {
+ Vector3 const phi = phi_sigma.template segment<3>(0);
+ // so(3) skew part plus the scale component sigma on the diagonal
+ return SO3<Scalar>::hat(phi) + phi_sigma(3) * Transformation::Identity();
+ }
+
+ EIGEN_DEVICE_FUNC static Adjoint adj(Tangent const& phi_sigma) {
+ Vector3 const phi = phi_sigma.template segment<3>(0);
+ Matrix3 const Phi = SO3<Scalar>::hat(phi);
+
+ Adjoint ad = Adjoint::Zero();
+ ad.template block<3,3>(0,0) = Phi;
+
+ return ad;
+ }
+
+ EIGEN_DEVICE_FUNC Tangent Log() const {
+ using std::abs;
+ using std::atan;
+ using std::sqrt;
+
+ Scalar squared_n = unit_quaternion.vec().squaredNorm();
+ Scalar w = unit_quaternion.w();
+ Scalar two_atan_nbyw_by_n;
+
+ /// Atan-based log thanks to
+ ///
+ /// C. Hertzberg et al.:
+ /// "Integrating Generic Sensor Fusion Algorithms with Sound State
+ /// Representation through Encapsulation of Manifolds"
+ /// Information Fusion, 2011
+
+ if (squared_n < EPS * EPS) {
+ two_atan_nbyw_by_n = Scalar(2) / w - Scalar(2.0/3.0) * (squared_n) / (w * w * w);
+ } else {
+ Scalar n = sqrt(squared_n);
+ if (abs(w) < EPS) {
+ if (w > Scalar(0)) {
+ two_atan_nbyw_by_n = PI / n;
+ } else {
+ two_atan_nbyw_by_n = -PI / n;
+ }
+ } else {
+ two_atan_nbyw_by_n = Scalar(2) * atan(n / w) / n;
+ }
+ }
+
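+ // pack the SO(3) log (angle times axis from the quaternion) together with log(scale)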
+ Tangent phi_sigma;
+ phi_sigma << two_atan_nbyw_by_n * unit_quaternion.vec(), log(scale);
+
+ return phi_sigma;
+ }
+
+ EIGEN_DEVICE_FUNC static RxSO3 Exp(Tangent const& phi_sigma) {
+ Vector3 phi = phi_sigma.template segment<3>(0);
+ Scalar scale = exp(phi_sigma(3));
+
+ Scalar theta2 = phi.squaredNorm();
+ Scalar theta = sqrt(theta2);
+ Scalar imag_factor;
+ Scalar real_factor;
+
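+ // near theta = 0, use Taylor expansions of sin(theta/2)/theta and cos(theta/2) to avoid precision loss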
+ if (theta < EPS) {
+ Scalar theta4 = theta2 * theta2;
+ imag_factor = Scalar(0.5) - Scalar(1.0/48.0) * theta2 + Scalar(1.0/3840.0) * theta4;
+ real_factor = Scalar(1) - Scalar(1.0/8.0) * theta2 + Scalar(1.0/384.0) * theta4;
+ } else {
+ imag_factor = sin(.5 * theta) / theta;
+ real_factor = cos(.5 * theta);
+ }
+
+ Quaternion q(real_factor, imag_factor*phi.x(), imag_factor*phi.y(), imag_factor*phi.z());
+ return RxSO3(q, scale);
+ }
+
+ EIGEN_DEVICE_FUNC static Matrix3 calcW(Tangent const& phi_sigma) {
+ // left jacobian
+ using std::abs;
+ Matrix3 const I = Matrix3::Identity();
+ Scalar const one(1);
+ Scalar const half(0.5);
+
+ Vector3 const phi = phi_sigma.template segment<3>(0);
+ Scalar const sigma = phi_sigma(3);
+ Scalar const theta = phi.norm();
+
+ Matrix3 const Phi = SO3<Scalar>::hat(phi);
+ Matrix3 const Phi2 = Phi * Phi;
+ Scalar const scale = exp(sigma);
+
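+ // W = A*Phi + B*Phi^2 + C*I; the coefficients switch to series expansions when sigma or theta is near zero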
+ Scalar A, B, C;
+ if (abs(sigma) < EPS) {
+ C = one;
+ if (abs(theta) < EPS) {
+ A = half;
+ B = Scalar(1. / 6.);
+ } else {
+ Scalar theta_sq = theta * theta;
+ A = (one - cos(theta)) / theta_sq;
+ B = (theta - sin(theta)) / (theta_sq * theta);
+ }
+ } else {
+ C = (scale - one) / sigma;
+ if (abs(theta) < EPS) {
+ Scalar sigma_sq = sigma * sigma;
+ A = ((sigma - one) * scale + one) / sigma_sq;
+ B = (scale * half * sigma_sq + scale - one - sigma * scale) /
+ (sigma_sq * sigma);
+ } else {
+ Scalar theta_sq = theta * theta;
+ Scalar a = scale * sin(theta);
+ Scalar b = scale * cos(theta);
+ Scalar c = theta_sq + sigma * sigma;
+ A = (a * sigma + (one - b) * theta) / (theta * c);
+ B = (C - ((b - one) * sigma + a * theta) / (c)) * one / (theta_sq);
+ }
+ }
+ return A * Phi + B * Phi2 + C * I;
+ }
+
+ EIGEN_DEVICE_FUNC static Matrix3 calcWInv(Tangent const& phi_sigma) {
+ // left jacobian inverse
+ Matrix3 const I = Matrix3::Identity();
+ Scalar const half(0.5);
+ Scalar const one(1);
+ Scalar const two(2);
+
+ Vector3 const phi = phi_sigma.template segment<3>(0);
+ Scalar const sigma = phi_sigma(3);
+ Scalar const theta = phi.norm();
+ Scalar const scale = exp(sigma);
+
+ Matrix3 const Phi = SO3<Scalar>::hat(phi);
+ Matrix3 const Phi2 = Phi * Phi;
+ Scalar const scale_sq = scale * scale;
+ Scalar const theta_sq = theta * theta;
+ Scalar const sin_theta = sin(theta);
+ Scalar const cos_theta = cos(theta);
+
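+ // closed-form coefficients for W^-1 = a*Phi + b*Phi^2 + c*I, with the same small-sigma / small-theta guards as calcW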
+ Scalar a, b, c;
+ if (abs(sigma * sigma) < EPS) {
+ c = one - half * sigma;
+ a = -half;
+ if (abs(theta_sq) < EPS) {
+ b = Scalar(1. / 12.);
+ } else {
+ b = (theta * sin_theta + two * cos_theta - two) /
+ (two * theta_sq * (cos_theta - one));
+ }
+ } else {
+ Scalar const scale_cu = scale_sq * scale;
+ c = sigma / (scale - one);
+ if (abs(theta_sq) < EPS) {
+ a = (-sigma * scale + scale - one) / ((scale - one) * (scale - one));
+ b = (scale_sq * sigma - two * scale_sq + scale * sigma + two * scale) /
+ (two * scale_cu - Scalar(6) * scale_sq + Scalar(6) * scale - two);
+ } else {
+ Scalar const s_sin_theta = scale * sin_theta;
+ Scalar const s_cos_theta = scale * cos_theta;
+ a = (theta * s_cos_theta - theta - sigma * s_sin_theta) /
+ (theta * (scale_sq - two * s_cos_theta + one));
+ b = -scale *
+ (theta * s_sin_theta - theta * sin_theta + sigma * s_cos_theta -
+ scale * sigma + sigma * cos_theta - sigma) /
+ (theta_sq * (scale_cu - two * scale * s_cos_theta - scale_sq +
+ two * s_cos_theta + scale - one));
+ }
+ }
+ return a * Phi + b * Phi2 + c * I;
+ }
+
+ EIGEN_DEVICE_FUNC static Adjoint left_jacobian(Tangent const& phi_sigma) {
+ // left jacobian
+ Adjoint J = Adjoint::Identity();
+ Vector3 phi = phi_sigma.template segment<3>(0);
+ J.template block<3,3>(0,0) = SO3<Scalar>::left_jacobian(phi);
+ return J;
+ }
+
+ EIGEN_DEVICE_FUNC static Adjoint left_jacobian_inverse(Tangent const& phi_sigma) {
+ // left jacobian inverse
+ Adjoint Jinv = Adjoint::Identity();
+ Vector3 phi = phi_sigma.template segment<3>(0);
+ Jinv.template block<3,3>(0,0) = SO3<Scalar>::left_jacobian_inverse(phi);
+ return Jinv;
+ }
+
+ EIGEN_DEVICE_FUNC static Eigen::Matrix<Scalar,3,K> act_jacobian(Point const& p) {
+ // jacobian action on a point
+ Eigen::Matrix<Scalar,3,K> Ja;
+ Ja << SO3<Scalar>::hat(-p), p;
+ return Ja;
+ }
+
+ EIGEN_DEVICE_FUNC static Eigen::Matrix<Scalar,4,K> act4_jacobian(Point4 const& p) {
+ // jacobian action on a point
+ Eigen::Matrix<Scalar,4,K> J = Eigen::Matrix<Scalar,4,K>::Zero();
+ J.template block<3,3>(0,0) = SO3<Scalar>::hat(-p.template segment<3>(0));
+ J.template block<3,1>(0,3) = p.template segment<3>(0);
+ return J;
+ }
+
+ private:
+ Quaternion unit_quaternion;
+ Scalar scale;
+};
+
+#endif
diff --git a/third_party/dpvo_ext/lietorch/include/se3.h b/third_party/dpvo_ext/lietorch/include/se3.h
new file mode 100644
index 0000000..c5bca16
--- /dev/null
+++ b/third_party/dpvo_ext/lietorch/include/se3.h
@@ -0,0 +1,228 @@
+
+#ifndef SE3_HEADER
+#define SE3_HEADER
+
+#include <stdio.h>
+#include <Eigen/Dense>
+#include <Eigen/Geometry>
+
+#include "common.h"
+#include "so3.h"
+
+
+template <typename Scalar>
+class SE3 {
+ public:
+ const static int constexpr K = 6; // manifold dimension
+ const static int constexpr N = 7; // embedding dimension
+
+ using Vector3 = Eigen::Matrix<Scalar,3,1>;
+ using Vector4 = Eigen::Matrix<Scalar,4,1>;
+ using Matrix3 = Eigen::Matrix<Scalar,3,3>