Skip to content

Commit

Permalink
add CUDA distance implementation
Browse files Browse the repository at this point in the history
- python interface changes
  - add `from_fasta_to_lower_triangular`
    - this constructs lower triangular matrix file directly from fasta file
    - for now only works on GPU
    - faster & requires less RAM than doing `from_fasta` followed by `dump_lower_triangular`
  - add `use_gpu` option to `from_fasta`
    - if True and include_x is False then the GPU is used to calculate the distances matrix
- CUDA implementation
  - each block of threads calculates a single element of the distances matrix
  - a kernel is launched running on a grid of these blocks to calculate a subset of the distances matrix
  - I/O is interleaved with computation: the CPU writes the previous kernel results as the next kernel is running
- Build wheels with manylinux2014 with CUDA installed from https://github.com/ameli/manylinux-cuda

TODO:

- add scaling plot to PR
- add logic to check for GPU, fail gracefully if not found
- add error handling if CUDA kernel launch fails
- ensure wheel built with CUDA still runs as before on CPU, both with & without a GPU
  - see https://github.com/OpenNMT/CTranslate2/blob/master/.github/workflows/ci.yml for example of building wheels with cuda, they install cuda into the manylinux container
  • Loading branch information
lkeegan committed Jan 8, 2024
1 parent 1f218de commit b3d45b8
Show file tree
Hide file tree
Showing 33 changed files with 1,084 additions and 247 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -65,4 +65,4 @@ jobs:
- name: run tests
shell: bash
working-directory: ${{runner.workspace}}/build
run: ctest
run: ctest -j2 -E gpu
26 changes: 16 additions & 10 deletions .github/workflows/wheels.yml
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
name: Build Wheels + PyPI deploy

on:
push:
branches:
- main
pull_request:
workflow_dispatch:
release:
types: [published]
push
# :
# branches:
# - main
# pull_request:
# workflow_dispatch:
# release:
# types: [published]

concurrency:
group: wheels-${{ github.ref }}
Expand All @@ -28,9 +29,13 @@ jobs:
submodules: "recursive"

- uses: pypa/[email protected]
env:
CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux2014_x86_64_cuda_12.3
CIBW_BEFORE_ALL_LINUX: ci/manylinux_setup.sh

- uses: actions/upload-artifact@v3
- uses: actions/upload-artifact@v4
with:
name: artifacts-${{ matrix.os }}
path: ./wheelhouse/*.whl

upload_pypi:
Expand All @@ -41,9 +46,10 @@ jobs:
permissions:
id-token: write
steps:
- uses: actions/download-artifact@v3
- uses: actions/download-artifact@v4
with:
name: artifact
pattern: artifacts-*
merge-multiple: true
path: dist

- uses: pypa/gh-action-pypi-publish@release/v1
3 changes: 3 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,6 @@
[submodule "Catch2"]
path = ext/Catch2
url = https://github.com/catchorg/Catch2.git
[submodule "ext/fmt"]
path = ext/fmt
url = https://github.com/fmtlib/fmt.git
10 changes: 9 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
cmake_minimum_required(VERSION 3.11..3.26)
cmake_minimum_required(VERSION 3.18..3.27)

project(
hammingdist
Expand Down Expand Up @@ -39,6 +39,14 @@ set(HAMMING_WITH_NEON
no
CACHE BOOL "Enable NEON optimized code on Arm64 CPUs")

set(HAMMING_WITH_CUDA
no
CACHE BOOL "Build with CUDA support (nvidia gpu)")

if(HAMMING_WITH_CUDA)
enable_language(CUDA)
endif()

# Add git submodules
add_subdirectory(ext)

Expand Down
18 changes: 18 additions & 0 deletions ci/manylinux_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
#! /bin/bash

set -e -x

# yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo

# yum install --setopt=obsoletes=0 -y \
# cuda-nvcc-11-2-11.2.152-1 \
# cuda-cudart-devel-11-2-11.2.152-1 \
# libcurand-devel-11-2-10.2.3.152-1 \
# libcudnn8-devel-8.1.1.33-1.cuda11.2 \
# libcublas-devel-11-2-11.4.1.1043-1

# ln -s cuda-11.2 /usr/local/cuda

nvcc --version

which nvcc
2 changes: 2 additions & 0 deletions ext/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
add_subdirectory(fmt)

if(HAMMING_BUILD_PYTHON)
add_subdirectory(pybind11)
endif()
Expand Down
1 change: 1 addition & 0 deletions ext/fmt
Submodule fmt added at f5e543
5 changes: 1 addition & 4 deletions include/hamming/distance_avx2.hh
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#ifndef _HAMMING_DISTANCE_AVX2_HH
#define _HAMMING_DISTANCE_AVX2_HH
#pragma once

#include <cstdint>
#include <vector>
Expand All @@ -12,5 +11,3 @@ int distance_avx2(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b);

}

#endif
5 changes: 1 addition & 4 deletions include/hamming/distance_avx512.hh
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#ifndef _HAMMING_DISTANCE_AVX512_HH
#define _HAMMING_DISTANCE_AVX512_HH
#pragma once

#include <cstdint>
#include <vector>
Expand All @@ -12,5 +11,3 @@ int distance_avx512(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b);

}

#endif
25 changes: 25 additions & 0 deletions include/hamming/distance_cuda.hh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#pragma once

#include <cstdint>
#include <iostream>
#include <vector>

#include "hamming/hamming_impl_types.hh"

namespace hamming {

int distance_cuda(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b);

// for now explicit function def for each choice of integer type
std::vector<uint8_t>
distances_cuda_8bit(const std::vector<std::vector<GeneBlock>> &data);

std::vector<uint16_t>
distances_cuda_16bit(const std::vector<std::vector<GeneBlock>> &data);

void distances_cuda_to_lower_triangular(
const std::vector<std::vector<GeneBlock>> &data,
const std::string &filename);

} // namespace hamming
5 changes: 1 addition & 4 deletions include/hamming/distance_sse2.hh
Original file line number Diff line number Diff line change
@@ -1,5 +1,4 @@
#ifndef _HAMMING_DISTANCE_SSE2_HH
#define _HAMMING_DISTANCE_SSE2_HH
#pragma once

#include <cstdint>
#include <vector>
Expand All @@ -12,5 +11,3 @@ int distance_sse2(const std::vector<GeneBlock> &a,
const std::vector<GeneBlock> &b);

}

#endif
111 changes: 34 additions & 77 deletions include/hamming/hamming.hh
Original file line number Diff line number Diff line change
@@ -1,13 +1,12 @@
#ifndef _HAMMING_HH
#define _HAMMING_HH
#pragma once

#include "hamming/hamming_impl.hh"
#include "hamming/hamming_types.hh"
#include "hamming/hamming_utils.hh"
#include <cmath>
#include <fstream>
#include <sstream>
#include <string>
#include <unordered_map>
#include <vector>

namespace hamming {
Expand All @@ -19,10 +18,11 @@ inline std::size_t uint_sqrt(std::size_t x) {
template <typename DistIntType> struct DataSet {
explicit DataSet(std::vector<std::string> &data, bool include_x = false,
bool clear_input_data = false,
std::vector<std::size_t> &&indices = {})
std::vector<std::size_t> &&indices = {},
bool use_gpu = false)
: nsamples(data.size()), sequence_indices(std::move(indices)) {
validate_data(data);
result = distances<DistIntType>(data, include_x, clear_input_data);
result = distances<DistIntType>(data, include_x, clear_input_data, use_gpu);
}

explicit DataSet(const std::string &filename) {
Expand Down Expand Up @@ -67,35 +67,7 @@ template <typename DistIntType> struct DataSet {
}

void dump_lower_triangular(const std::string &filename) {
std::ofstream stream(filename);
#ifdef HAMMING_WITH_OPENMP
std::size_t block_size = 200;
#pragma omp parallel for ordered schedule(static, 1)
for (std::size_t i_start = 1; i_start < nsamples; i_start += block_size) {
std::stringstream line;
std::size_t i_end = i_start + block_size;
if (i_end > nsamples) {
i_end = nsamples;
}
for (std::size_t i = i_start; i < i_end; ++i) {
std::size_t offset{i * (i - 1) / 2};
for (std::size_t j = 0; j + 1 < i; ++j) {
line << static_cast<int>(result[offset + j]) << ",";
}
line << static_cast<int>(result[offset + i - 1]) << "\n";
}
#pragma omp ordered
stream << line.str();
}
#else
std::size_t k = 0;
for (std::size_t i = 1; i < nsamples; ++i) {
for (std::size_t j = 0; j + 1 < i; ++j) {
stream << static_cast<int>(result[k++]) << ",";
}
stream << static_cast<int>(result[k++]) << "\n";
}
#endif
write_lower_triangular(filename, result);
}

void dump_sequence_indices(const std::string &filename) {
Expand Down Expand Up @@ -127,67 +99,52 @@ template <typename DistIntType> struct DataSet {
std::vector<std::size_t> sequence_indices{};
};

DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data);
DataSet<DefaultDistIntType> from_stringlist(std::vector<std::string> &data,
bool include_x = false,
bool use_gpu = false);

DataSet<DefaultDistIntType> from_csv(const std::string &filename);

DataSet<DefaultDistIntType> from_lower_triangular(const std::string &filename);

template <typename DistIntType>
DataSet<DistIntType>
from_fasta(const std::string &filename, bool include_x = false,
bool remove_duplicates = false, std::size_t n = 0) {
std::vector<std::string> data;
data.reserve(n);
if (n == 0) {
n = std::numeric_limits<std::size_t>::max();
data.reserve(65536);
}
std::unordered_map<std::string, std::size_t> map_seq_to_index;
std::vector<std::size_t> sequence_indices{};
// Initializing the stream
DataSet<DistIntType> from_lower_triangular(const std::string &filename) {
std::vector<DistIntType> distances;
std::ifstream stream(filename);
std::size_t count = 0;
std::size_t count_unique = 0;
std::string line;
// skip first header
std::getline(stream, line);
while (count < n && !stream.eof()) {
std::string seq{};
while (std::getline(stream, line) && line[0] != '>') {
seq.append(line);
}
if (remove_duplicates) {
auto result = map_seq_to_index.emplace(std::move(seq), count_unique);
if (result.second) {
++count_unique;
}
sequence_indices.push_back(result.first->second);
} else {
data.push_back(std::move(seq));
}
++count;
}
if (remove_duplicates) {
// copy each unique sequence to the vector of strings
data.resize(count_unique);
for (auto &key_value_pair : map_seq_to_index) {
data[key_value_pair.second] = key_value_pair.first;
while (std::getline(stream, line)) {
std::istringstream s(line);
std::string d;
while (s.good()) {
std::getline(s, d, ',');
distances.push_back(safe_int_cast<DistIntType>(std::stoi(d)));
}
}
return DataSet(std::move(distances));
}

template <typename DistIntType>
DataSet<DistIntType> from_fasta(const std::string &filename,
bool include_x = false,
bool remove_duplicates = false,
std::size_t n = 0, bool use_gpu = false) {
auto [data, sequence_indices] = read_fasta(filename, remove_duplicates, n);
return DataSet<DistIntType>(data, include_x, true,
std::move(sequence_indices));
std::move(sequence_indices), use_gpu);
}

void from_fasta_to_lower_triangular(const std::string &input_filename,
const std::string &output_filename,
bool remove_duplicates = false,
std::size_t n = 0, bool use_gpu = false);

ReferenceDistIntType distance(const std::string &seq0, const std::string &seq1,
bool include_x = false);

std::vector<ReferenceDistIntType>
fasta_reference_distances(const std::string &reference_sequence,
const std::string &fasta_file,
bool include_x = false);

std::vector<std::size_t> fasta_sequence_indices(const std::string &fasta_file,
std::size_t n = 0);

} // namespace hamming

#endif
Loading

0 comments on commit b3d45b8

Please sign in to comment.