diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml
index 92d6036f..3af88008 100644
--- a/.github/workflows/lint.yml
+++ b/.github/workflows/lint.yml
@@ -86,6 +86,11 @@ jobs:
         run: |
           make clang-format
 
+      - name: clang-tidy
+        run: |
+          sudo apt-get update && sudo apt-get install libomp-dev --yes
+          make clang-tidy
+
       - name: addlicense
         run: |
           make addlicense
diff --git a/CPPLINT.cfg b/CPPLINT.cfg
index 41265bb6..dd346401 100644
--- a/CPPLINT.cfg
+++ b/CPPLINT.cfg
@@ -1 +1,4 @@
 linelength=100
+filter=-readability/nolint
+filter=-readability/braces
+filter=-whitespace/newline
diff --git a/Makefile b/Makefile
index e29e0dc5..d539f3ee 100644
--- a/Makefile
+++ b/Makefile
@@ -5,7 +5,8 @@ PROJECT_PATH = $(PROJECT_NAME)
 SHELL = /bin/bash
 SOURCE_FOLDERS = $(PROJECT_PATH) examples include src tests docs
 PYTHON_FILES = $(shell find $(SOURCE_FOLDERS) -type f -name "*.py" -o -name "*.pyi")
-CXX_FILES = $(shell find $(SOURCE_FOLDERS) -type f -name "*.h" -o -name "*.cpp" -o -name "*.cuh" -o -name "*.cu")
+CXX_FILES = $(shell find $(SOURCE_FOLDERS) -type f -name "*.h" -o -name "*.cpp")
+CUDA_FILES = $(shell find $(SOURCE_FOLDERS) -type f -name "*.cuh" -o -name "*.cu")
 COMMIT_HASH = $(shell git log -1 --format=%h)
 PATH := $(HOME)/go/bin:$(PATH)
 PYTHON ?= $(shell command -v python3 || command -v python)
@@ -81,6 +82,9 @@ pytest-install:
 	$(call check_pip_install,pytest-cov)
 	$(call check_pip_install,pytest-xdist)
 
+cmake-install:
+	command -v cmake || $(call check_pip_install,cmake)
+
 cpplint-install:
 	$(call check_pip_install,cpplint)
 
@@ -129,11 +133,25 @@ pre-commit: pre-commit-install
 
 # C++ linters
 
+cmake-configure: cmake-install
+	cmake -S . -B cmake-build-debug \
+		-DCMAKE_BUILD_TYPE=Debug \
+		-DCMAKE_EXPORT_COMPILE_COMMANDS=ON \
+		-DPYTHON_EXECUTABLE="$(PYTHON)"
+
+cmake-build: cmake-configure
+	cmake --build cmake-build-debug --parallel
+
+cmake: cmake-build
+
 cpplint: cpplint-install
-	$(PYTHON) -m cpplint $(CXX_FILES)
+	$(PYTHON) -m cpplint $(CXX_FILES) $(CUDA_FILES)
 
 clang-format: clang-format-install
-	$(CLANG_FORMAT) --style=file -i $(CXX_FILES) -n --Werror
+	$(CLANG_FORMAT) --style=file -i $(CXX_FILES) $(CUDA_FILES) -n --Werror
+
+clang-tidy: clang-tidy-install cmake-configure
+	clang-tidy -p=cmake-build-debug $(CXX_FILES)
 
 # Documentation
 
@@ -156,12 +174,12 @@ clean-docs:
 
 # Utility functions
 
-lint: flake8 py-format mypy pylint clang-format cpplint addlicense docstyle spelling
+lint: flake8 py-format mypy pylint clang-format clang-tidy cpplint addlicense docstyle spelling
 
 format: py-format-install clang-format-install addlicense-install
 	$(PYTHON) -m isort --project $(PROJECT_NAME) $(PYTHON_FILES)
 	$(PYTHON) -m black $(PYTHON_FILES) tutorials
-	$(CLANG_FORMAT) -style=file -i $(CXX_FILES)
+	$(CLANG_FORMAT) -style=file -i $(CXX_FILES) $(CUDA_FILES)
 	addlicense -c $(COPYRIGHT) -ignore tests/coverage.xml -l apache -y 2022-$(shell date +"%Y") $(SOURCE_FOLDERS)
 
 clean-py:
diff --git a/docs/source/spelling_wordlist.txt b/docs/source/spelling_wordlist.txt
index bd646f0c..92244376 100644
--- a/docs/source/spelling_wordlist.txt
+++ b/docs/source/spelling_wordlist.txt
@@ -146,3 +146,4 @@ ATen
 samplable
 conj
 reparameterize
+rtype
diff --git a/include/adam_op/adam_op.h b/include/adam_op/adam_op.h
index 8b7ae2bf..76baea3f 100644
--- a/include/adam_op/adam_op.h
+++ b/include/adam_op/adam_op.h
@@ -14,6 +14,7 @@
 // =============================================================================
 #pragma once
+
 #include 
 
 #include 
 
@@ -69,7 +70,7 @@ TensorArray<2> adamBackwardUpdates(const torch::Tensor &dupdates,
                                    const pyfloat_t b2,
                                    const pyuint_t count);
 
-void buildSubmodule(py::module &mod);  // NOLINT
+void buildSubmodule(py::module &mod);  // NOLINT[runtime/references]
 
 }  // namespace adam_op
 }  // namespace torchopt
diff --git a/include/adam_op/adam_op_impl_cpu.h b/include/adam_op/adam_op_impl_cpu.h
index 3e8da376..20f12ae1 100644
--- a/include/adam_op/adam_op_impl_cpu.h
+++ b/include/adam_op/adam_op_impl_cpu.h
@@ -14,6 +14,7 @@
 // =============================================================================
 #pragma once
+
 #include 
 
 #include 
diff --git a/include/adam_op/adam_op_impl_cuda.cuh b/include/adam_op/adam_op_impl_cuda.cuh
index a7ddb937..cdb3ae58 100644
--- a/include/adam_op/adam_op_impl_cuda.cuh
+++ b/include/adam_op/adam_op_impl_cuda.cuh
@@ -14,6 +14,7 @@
 // =============================================================================
 #pragma once
+
 #include 
 
 #include 
diff --git a/include/common.h b/include/common.h
index 5353e48e..ac281eb9 100644
--- a/include/common.h
+++ b/include/common.h
@@ -14,6 +14,7 @@
 // =============================================================================
 #pragma once
+
 #include 
 
 #include 
diff --git a/include/utils.h b/include/utils.h
index 714f98d4..d5cd2e00 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -14,6 +14,7 @@
 // =============================================================================
 #pragma once
+
 #include 
 
 #include 
diff --git a/src/adam_op/adam_op.cpp b/src/adam_op/adam_op.cpp
index 18bb5d27..57b6ee0f 100644
--- a/src/adam_op/adam_op.cpp
+++ b/src/adam_op/adam_op.cpp
@@ -149,7 +149,7 @@ TensorArray<2> adamBackwardUpdates(const torch::Tensor &dupdates,
   }
 }
 
-void buildSubmodule(py::module &mod) {  // NOLINT
+void buildSubmodule(py::module &mod) {  // NOLINT[runtime/references]
   py::module m = mod.def_submodule("adam_op", "Adam Ops");
   m.def("forward_",
         &adamForwardInplace,
diff --git a/src/adam_op/adam_op_impl_cpu.cpp b/src/adam_op/adam_op_impl_cpu.cpp
index cf734c4f..e242bedf 100644
--- a/src/adam_op/adam_op_impl_cpu.cpp
+++ b/src/adam_op/adam_op_impl_cpu.cpp
@@ -40,9 +40,8 @@ void adamForwardInplaceCPUKernel(const other_t b1,
                                  scalar_t *__restrict__ updates_ptr,
                                  scalar_t *__restrict__ mu_ptr,
                                  scalar_t *__restrict__ nu_ptr) {
-#pragma omp parallel for num_threads(                                       \
-    std::min(n / MIN_NUMEL_USE_OMP,                                         \
-             static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)  // NOLINT
+#pragma omp parallel for num_threads(std::min(                              \
+    n / MIN_NUMEL_USE_OMP, static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)
   for (size_t tid = 0; tid < n; ++tid) {
     const scalar_t updates = updates_ptr[tid];
     const scalar_t mu = mu_ptr[tid];
@@ -94,9 +93,8 @@ void adamForwardMuCPUKernel(const scalar_t *__restrict__ updates_ptr,
                             const other_t b1,
                             const size_t n,
                             scalar_t *__restrict__ mu_out_ptr) {
-#pragma omp parallel for num_threads(                                       \
-    std::min(n / MIN_NUMEL_USE_OMP,                                         \
-             static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)  // NOLINT
+#pragma omp parallel for num_threads(std::min(                              \
+    n / MIN_NUMEL_USE_OMP, static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)
   for (size_t tid = 0; tid < n; ++tid) {
     const scalar_t updates = updates_ptr[tid];
     const scalar_t mu = mu_ptr[tid];
@@ -128,9 +126,8 @@ void adamForwardNuCPUKernel(const scalar_t *__restrict__ updates_ptr,
                             const other_t b2,
                             const size_t n,
                             scalar_t *__restrict__ nu_out_ptr) {
-#pragma omp parallel for num_threads(                                       \
-    std::min(n / MIN_NUMEL_USE_OMP,                                         \
-             static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)  // NOLINT
+#pragma omp parallel for num_threads(std::min(                              \
+    n / MIN_NUMEL_USE_OMP, static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)
   for (size_t tid = 0; tid < n; ++tid) {
     const scalar_t updates = updates_ptr[tid];
     const scalar_t nu = nu_ptr[tid];
@@ -166,9 +163,8 @@ void adamForwardUpdatesCPUKernel(const scalar_t *__restrict__ new_mu_ptr,
                                  const other_t eps_root,
                                  const size_t n,
                                  scalar_t *__restrict__ updates_out_ptr) {
-#pragma omp parallel for num_threads(                                       \
-    std::min(n / MIN_NUMEL_USE_OMP,                                         \
-             static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)  // NOLINT
+#pragma omp parallel for num_threads(std::min(                              \
+    n / MIN_NUMEL_USE_OMP, static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)
   for (size_t tid = 0; tid < n; ++tid) {
     const scalar_t new_mu = new_mu_ptr[tid];
     const scalar_t new_nu = new_nu_ptr[tid];
@@ -212,9 +208,8 @@ void adamBackwardMuCPUKernel(const scalar_t *__restrict__ dmu_ptr,
                              const size_t n,
                              scalar_t *__restrict__ dupdates_out_ptr,
                              scalar_t *__restrict__ dmu_out_ptr) {
-#pragma omp parallel for num_threads(                                       \
-    std::min(n / MIN_NUMEL_USE_OMP,                                         \
-             static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)  // NOLINT
+#pragma omp parallel for num_threads(std::min(                              \
+    n / MIN_NUMEL_USE_OMP, static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)
   for (size_t tid = 0; tid < n; ++tid) {
     const scalar_t dmu = dmu_ptr[tid];
 
@@ -249,9 +244,8 @@ void adamBackwardNuCPUKernel(const scalar_t *__restrict__ dnu_ptr,
                              const size_t n,
                              scalar_t *__restrict__ dupdates_out_ptr,
                              scalar_t *__restrict__ dnu_out_ptr) {
-#pragma omp parallel for num_threads(                                       \
-    std::min(n / MIN_NUMEL_USE_OMP,                                         \
-             static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)  // NOLINT
+#pragma omp parallel for num_threads(std::min(                              \
+    n / MIN_NUMEL_USE_OMP, static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)
   for (size_t tid = 0; tid < n; ++tid) {
     const scalar_t dnu = dnu_ptr[tid];
     const scalar_t updates = updates_ptr[tid];
@@ -290,9 +284,8 @@ void adamBackwardUpdatesCPUKernel(const scalar_t *__restrict__ dupdates_ptr,
                                   const size_t n,
                                   scalar_t *__restrict__ dnew_mu_out_ptr,
                                   scalar_t *__restrict__ dnew_nu_out_ptr) {
-#pragma omp parallel for num_threads(                                       \
-    std::min(n / MIN_NUMEL_USE_OMP,                                         \
-             static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)  // NOLINT
+#pragma omp parallel for num_threads(std::min(                              \
+    n / MIN_NUMEL_USE_OMP, static_cast<size_t>(omp_get_num_procs()))) if (n > MIN_NUMEL_USE_OMP)
   for (size_t tid = 0; tid < n; ++tid) {
     const scalar_t dupdates = dupdates_ptr[tid];
     const scalar_t updates = updates_ptr[tid];