Skip to content

Commit

Permalink
add CUDA distance implementation
Browse files Browse the repository at this point in the history
- python interface changes
  - add `from_fasta_to_lower_triangular`
    - this constructs lower triangular matrix file directly from fasta file
    - for now only works on GPU
    - faster & requires less RAM than doing `from_fasta` followed by `dump_lower_triangular`
  - add `use_gpu` option to `from_fasta`
    - if True and include_x is False then the GPU is used to calculate the distances matrix
- CUDA implementation
  - each block of threads calculates a single element of the distances matrix
  - a kernel is launched running on a grid of these blocks to calculate a subset of the distances matrix
  - I/O is interleaved with computation: the CPU writes the previous kernel results as the next kernel is running
- Build wheels with manylinux2014 with CUDA installed from https://github.com/ameli/manylinux-cuda

TODO:

- add scaling plot to PR
- add logic to check for GPU, fail gracefully if not found
- add error handling if CUDA kernel launch fails
- ensure wheel built with CUDA still runs as before on CPU, both with & without a GPU
  - see https://github.com/OpenNMT/CTranslate2/blob/master/.github/workflows/ci.yml for example of building wheels with cuda, they install cuda into the manylinux container
  • Loading branch information
lkeegan committed Jan 8, 2024
1 parent 1f218de commit b3d45b8
Show file tree
Hide file tree
Showing 33 changed files with 1,084 additions and 247 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,4 @@ jobs:
- name: run tests
shell: bash
working-directory: ${{runner.workspace}}/build
run: ctest
run: ctest -j2 -E gpu
26 changes: 16 additions & 10 deletions .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
name: Build Wheels + PyPI deploy

on:
push:
branches:
- main
pull_request:
workflow_dispatch:
release:
types: [published]
push
# :
# branches:
# - main
# pull_request:
# workflow_dispatch:
# release:
# types: [published]

concurrency:
group: wheels-${{ github.ref }}
Expand All @@ -28,9 +29,13 @@ jobs:
submodules: "recursive"

- uses: pypa/[email protected]
env:
CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux2014_x86_64_cuda_12.3
CIBW_BEFORE_ALL_LINUX: ci/manylinux_setup.sh

- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: artifacts-${{ matrix.os }}
path: ./wheelhouse/*.whl

upload_pypi:
Expand All @@ -41,9 +46,10 @@ jobs:
permissions:
id-token: write
steps:
- uses: actions/download-artifact@v3
- uses: actions/download-artifact@v4
with:
name: artifact
pattern: artifacts-*
merge-multiple: true
path: dist

- uses: pypa/gh-action-pypi-publish@release/v1
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@
[submodule "Catch2"]
path = ext/Catch2
url = https://github.com/catchorg/Catch2.git
[submodule "ext/fmt"]
path = ext/fmt
url = https://github.com/fmtlib/fmt.git
10 changes: 9 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.11..3.26)
cmake_minimum_required(VERSION 3.18..3.27)

project(
hammingdist
Expand Down Expand Up @@ -39,6 +39,14 @@ set(HAMMING_WITH_NEON
no
CACHE BOOL "Enable NEON optimized code on Arm64 CPUs")

set(HAMMING_WITH_CUDA
no
CACHE BOOL "Build with CUDA support (nvidia gpu)")

if(HAMMING_WITH_CUDA)
enable_language(CUDA)
endif()

# Add git submodules
add_subdirectory(ext)

Expand Down
18 changes: 18 additions & 0 deletions ci/manylinux_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#! /bin/bash

set -e -x

# yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo

# yum install --setopt=obsoletes=0 -y \
# cuda-nvcc-11-2-11.2.152-1 \
# cuda-cudart-devel-11-2-11.2.152-1 \
# libcurand-devel-11-2-10.2.3.152-1 \
# libcudnn8-devel-8.1.1.33-1.cuda11.2 \
# libcublas-devel-11-2-11.4.1.1043-1

# ln -s cuda-11.2 /usr/local/cuda

nvcc --version

which nvcc
2 changes: 2 additions & 0 deletions ext/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
add_subdirectory(fmt)

if(HAMMING_BUILD_PYTHON)
add_subdirectory(pybind11)
endif()
Expand Down
1 change: 1 addition & 0 deletions ext/fmt
Submodule fmt added at f5e543
5 changes: 1 addition & 4 deletions include/hamming/distance_avx2.hh
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#ifndef _HAMMING_DISTANCE_AVX2_HH
#define _HAMMING_DISTANCE_AVX2_HH
#pragma once

#include <cstdint>
#include <vector>
Expand All @@ -12,5 +11,3 @@ int distance_avx2(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b);

}

#endif
5 changes: 1 addition & 4 deletions include/hamming/distance_avx512.hh
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#ifndef _HAMMING_DISTANCE_AVX512_HH
#define _HAMMING_DISTANCE_AVX512_HH
#pragma once

#include <cstdint>
#include <vector>
Expand All @@ -12,5 +11,3 @@ int distance_avx512(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b);

}

#endif
25 changes: 25 additions & 0 deletions include/hamming/distance_cuda.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once

#include <cstdint>
#include <iostream>
#include <vector>

#include "hamming/hamming_impl_types.hh"

namespace hamming {

int distance_cuda(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b);

// for now explicit function def for each choice of integer type
std::vector<uint8_t>
distances_cuda_8bit(const std::vector<std::vector<GeneBlock>> &data);

std::vector<uint16_t>
distances_cuda_16bit(const std::vector<std::vector<GeneBlock>> &data);

void distances_cuda_to_lower_triangular(
const std::vector<std::vector<GeneBlock>> &data,
const std::string &filename);

} // namespace hamming
5 changes: 1 addition & 4 deletions include/hamming/distance_sse2.hh
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#ifndef _HAMMING_DISTANCE_SSE2_HH
#define _HAMMING_DISTANCE_SSE2_HH
#pragma once

#include <cstdint>
#include <vector>
Expand All @@ -12,5 +11,3 @@ int distance_sse2(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b);

}

#endif
111 changes: 34 additions & 77 deletions include/hamming/hamming.hh
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
#ifndef _HAMMING_HH
#define _HAMMING_HH
#pragma once

#include "hamming/hamming_impl.hh"
#include "hamming/hamming_types.hh"
#include "hamming/hamming_utils.hh"
#include <cmath>
#include <fstream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

namespace hamming {
Expand All @@ -19,10 +18,11 @@ inline std::size_t uint_sqrt(std::size_t x) {
template <typename DistIntType> struct DataSet {
explicit DataSet(std::vector<std::string> &data, bool include_x = false,
bool clear_input_data = false,
std::vector<std::size_t> &&indices = {})
std::vector<std::size_t> &&indices = {},
bool use_gpu = false)
: nsamples(data.size()), sequence_indices(std::move(indices)) {
validate_data(data);
result = distances<DistIntType>(data, include_x, clear_input_data);
result = distances<DistIntType>(data, include_x, clear_input_data, use_gpu);
}

explicit DataSet(const std::string &filename) {
Expand Down Expand Up @@ -67,35 +67,7 @@ template <typename DistIntType> struct DataSet {
}

void dump_lower_triangular(const std::string &filename) {
std::ofstream stream(filename);
#ifdef HAMMING_WITH_OPENMP
std::size_t block_size = 200;
#pragma omp parallel for ordered schedule(static, 1)
for (std::size_t i_start = 1; i_start < nsamples; i_start += block_size) {
std::stringstream line;
std::size_t i_end = i_start + block_size;
if (i_end > nsamples) {
i_end = nsamples;
}
for (std::size_t i = i_start; i < i_end; ++i) {
std::size_t offset{i * (i - 1) / 2};
for (std::size_t j = 0; j + 1 < i; ++j) {
line << static_cast<int>(result[offset + j]) << ",";
}
line << static_cast<int>(result[offset + i - 1]) << "\n";
}
#pragma omp ordered
stream << line.str();
}
#else
std::size_t k = 0;
for (std::size_t i = 1; i < nsamples; ++i) {
for (std::size_t j = 0; j + 1 < i; ++j) {
stream << static_cast<int>(result[k++]) << ",";
}
stream << static_cast<int>(result[k++]) << "\n";
}
#endif
write_lower_triangular(filename, result);
}

void dump_sequence_indices(const std::string &filename) {
Expand Down Expand Up @@ -127,67 +99,52 @@ template <typename DistIntType> struct DataSet {
std::vector<std::size_t> sequence_indices{};
};

DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data);
DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data,
bool include_x = false,
bool use_gpu = false);

DataSet<DefaultDistIntType> from_csv(const std::string &filename);

DataSet<DefaultDistIntType> from_lower_triangular(const std::string &filename);

template <typename DistIntType>
DataSet<DistIntType>
from_fasta(const std::string &filename, bool include_x = false,
bool remove_duplicates = false, std::size_t n = 0) {
std::vector<std::string> data;
data.reserve(n);
if (n == 0) {
n = std::numeric_limits<std::size_t>::max();
data.reserve(65536);
}
std::unordered_map<std::string, std::size_t> map_seq_to_index;
std::vector<std::size_t> sequence_indices{};
// Initializing the stream
DataSet<DistIntType> from_lower_triangular(const std::string &filename) {
std::vector<DistIntType> distances;
std::ifstream stream(filename);
std::size_t count = 0;
std::size_t count_unique = 0;
std::string line;
// skip first header
std::getline(stream, line);
while (count < n && !stream.eof()) {
std::string seq{};
while (std::getline(stream, line) && line[0] != '>') {
seq.append(line);
}
if (remove_duplicates) {
auto result = map_seq_to_index.emplace(std::move(seq), count_unique);
if (result.second) {
++count_unique;
}
sequence_indices.push_back(result.first->second);
} else {
data.push_back(std::move(seq));
}
++count;
}
if (remove_duplicates) {
// copy each unique sequence to the vector of strings
data.resize(count_unique);
for (auto &key_value_pair : map_seq_to_index) {
data[key_value_pair.second] = key_value_pair.first;
while (std::getline(stream, line)) {
std::istringstream s(line);
std::string d;
while (s.good()) {
std::getline(s, d, ',');
distances.push_back(safe_int_cast<DistIntType>(std::stoi(d)));
}
}
return DataSet(std::move(distances));
}

template <typename DistIntType>
DataSet<DistIntType> from_fasta(const std::string &filename,
bool include_x = false,
bool remove_duplicates = false,
std::size_t n = 0, bool use_gpu = false) {
auto [data, sequence_indices] = read_fasta(filename, remove_duplicates, n);
return DataSet<DistIntType>(data, include_x, true,
std::move(sequence_indices));
std::move(sequence_indices), use_gpu);
}

void from_fasta_to_lower_triangular(const std::string &input_filename,
const std::string &output_filename,
bool remove_duplicates = false,
std::size_t n = 0, bool use_gpu = false);

ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1,
bool include_x = false);

std::vector<ReferenceDistIntType>
fasta_reference_distances(const std::string &reference_sequence,
const std::string &fasta_file,
bool include_x = false);

std::vector<std::size_t> fasta_sequence_indices(const std::string &fasta_file,
std::size_t n = 0);

} // namespace hamming

#endif
Loading

0 comments on commit b3d45b8

Please sign in to comment.