Allow users to access the distances data as a numpy array

- add read-only property `lt_array` to `DataSet` that provides the raw distances data as a 1-d numpy array
- add example of use to readme
- bump deps
- add python 3.13
- temporarily skip tests for Python 3.13 wheel on linux due to numpy import error
- bump version
- resolves #134
ssciwr · Aug 19, 2024 · 1b53c3f · 1b53c3f
1 parent 860fae9
commit 1b53c3f
Show file tree

Hide file tree

Showing 11 changed files with 41 additions and 17 deletions.
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -32,7 +32,7 @@ jobs:
           - os: macos-13
             open-mp: "OFF"
             neon: "OFF"
-          - os: macos-14
+          - os: macos-latest
             open-mp: "OFF"
             neon: "ON"
           - os: windows-latest

diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml
@@ -20,13 +20,13 @@ jobs:
 
     strategy:
       matrix:
-        os: [ubuntu-latest, macos-13, macos-14, windows-latest]
+        os: [ubuntu-latest, macos-13, macos-latest, windows-latest]
 
     steps:
       - uses: actions/checkout@v4
         with:
           submodules: "recursive"
-      - uses: pypa/cibuildwheel@v2.19
+      - uses: pypa/cibuildwheel@v2.20
         env:
           CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux2014_x86_64_cuda_11.8
       - uses: actions/upload-artifact@v4

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ repos:
       - id: mixed-line-ending
 
   - repo: https://github.com/psf/black
-    rev: 24.4.2
+    rev: 24.8.0
     hooks:
       - id: black
 
@@ -29,7 +29,7 @@ repos:
       - id: prettier
 
   - repo: https://github.com/python-jsonschema/check-jsonschema
-    rev: 0.28.6
+    rev: 0.29.1
     hooks:
       - id: check-github-workflows
       - id: check-readthedocs

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -1,8 +1,8 @@
-cmake_minimum_required(VERSION 3.23..3.29)
+cmake_minimum_required(VERSION 3.23..3.30)
 
 project(
   hammingdist
-  VERSION 1.2.0
+  VERSION 1.3.0
   LANGUAGES CXX)
 
 include(CTest)

diff --git a/README.md b/README.md
@@ -58,6 +58,13 @@ data.dump_sparse("sparse.txt", threshold=3)
 # If the `remove_duplicates` option was used, the sequence indices can also be written.
 # For each input sequence, this prints the corresponding index in the output:
 data.dump_sequence_indices("indices.txt")
+
+# The lower-triangular distance elements can also be directly accessed as a 1-d numpy array:
+lt_array = data.lt_array
+# The elements in this array correspond to the 2-d indices (row=1,col=0), (row=2,col=0), (row=2,col=1), ...
+# These indices can be generated using the numpy tril_indices function (with numpy imported as np and n_seq the number of rows in the distances matrix), e.g. to construct the lower-triangular matrix:
+lt_matrix = np.zeros((n_seq, n_seq))
+lt_matrix[np.tril_indices(n_seq, -1)] = lt_array
 ```
 
 ## Duplicates

diff --git a/ext/Catch2 b/ext/Catch2
diff --git a/ext/benchmark b/ext/benchmark
diff --git a/ext/pybind11 b/ext/pybind11
diff --git a/pyproject.toml b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "hammingdist"
-version = "1.2.0"
+version = "1.3.0"
 description = "A fast tool to calculate Hamming distances"
 readme = "README.md"
 license = {text = "MIT"}
@@ -23,6 +23,7 @@ classifiers=[
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
     "Programming Language :: Python :: 3.12",
+    "Programming Language :: Python :: 3.13",
     "Programming Language :: Python :: Implementation :: CPython",
     "Programming Language :: Python :: Implementation :: PyPy",
     "Operating System :: MacOS :: MacOS X",
@@ -39,7 +40,7 @@ test = ["pytest", "numpy"]
 
 [tool.scikit-build]
 cmake.version = ">=3.23"
-cmake.verbose = true
+build.verbose = true
 
 [tool.scikit-build.cmake.define]
 BUILD_TESTING = "OFF"
@@ -48,7 +49,7 @@ HAMMING_BUILD_PYTHON = "ON"
 
 [tool.cibuildwheel]
 skip = "*-manylinux_i686 *-musllinux*"
-test-skip = "pp*"
+test-skip = "pp* cp313-manylinux_x86_64"
 test-extras = "test"
 test-command = "pytest {project}/python/tests -v"
 environment = { BLAS="None", LAPACK="None", ATLAS="None" }

diff --git a/python/hammingdist.cc b/python/hammingdist.cc
@@ -43,7 +43,10 @@ PYBIND11_MODULE(hammingdist, m) {
            &DataSet<DefaultDistIntType>::dump_sequence_indices,
            "Dump row index in distances matrix for each input sequence")
       .def("__getitem__", &DataSet<DefaultDistIntType>::operator[])
-      .def_readonly("_distances", &DataSet<DefaultDistIntType>::result);
+      .def_readonly("_distances", &DataSet<DefaultDistIntType>::result)
+      .def_property_readonly("lt_array", [](DataSet<DefaultDistIntType> &self) {
+        return py::array(self.result.size(), self.result.data());
+      });
 
   py::class_<DataSet<uint16_t>>(m, "DataSetLarge")
       .def("dump", &DataSet<uint16_t>::dump,
@@ -58,7 +61,10 @@ PYBIND11_MODULE(hammingdist, m) {
       .def("dump_sequence_indices", &DataSet<uint16_t>::dump_sequence_indices,
            "Dump row index in distances matrix for each input sequence")
       .def("__getitem__", &DataSet<uint16_t>::operator[])
-      .def_readonly("_distances", &DataSet<uint16_t>::result);
+      .def_readonly("_distances", &DataSet<uint16_t>::result)
+      .def_property_readonly("lt_array", [](DataSet<uint16_t> &self) {
+        return py::array(self.result.size(), self.result.data());
+      });
 
   m.def("from_stringlist", &from_stringlist,
         "Creates a dataset from a list of strings");

diff --git a/python/tests/test_hammingdist.py b/python/tests/test_hammingdist.py
@@ -13,6 +13,8 @@ def write_fasta_file(filename, sequences):
 
 
 def check_output_sizes(dat, n_in, n_out, tmp_out_file, fasta_sequence_indices=None):
+    assert dat.lt_array.shape == (n_out * (n_out - 1) // 2,)
+
     dat.dump(tmp_out_file)
     dump = np.loadtxt(tmp_out_file, delimiter=",")
     assert len(dump) == n_out
@@ -97,8 +99,9 @@ def test_from_fasta(from_fasta_func, use_gpu, tmp_path):
 )
 @pytest.mark.parametrize("max_distance", [0, 1, 2, 3, 89, 497, 9999999])
 def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path):
-    # generate 50 sequences, each with 25 characters
-    sequences = ["".join(random.choices(chars, k=25)) for i in range(50)]
+    n_seq = 50
+    n_chars = 25
+    sequences = ["".join(random.choices(chars, k=n_chars)) for i in range(n_seq)]
     fasta_file = str(tmp_path / "fasta.txt")
     write_fasta_file(fasta_file, sequences)
     # calculate distances matrix
@@ -108,6 +111,12 @@ def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path):
         include_x=include_x,
         max_distance=max_distance,
     )
+    # get lower-triangular data as 1-d array
+    lt_array = data.lt_array
+    assert lt_array.shape == (n_seq * (n_seq - 1) // 2,)
+    # reshape to lower-triangular matrix
+    lt_matrix = np.zeros((n_seq, n_seq), dtype=np.uint8)
+    lt_matrix[np.tril_indices(n_seq, -1)] = lt_array
     # use each sequence in turn as the reference sequence & calculate reference distances
     for i, sequence in enumerate(sequences):
         vec = hammingdist.fasta_reference_distances(
@@ -120,6 +129,7 @@ def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path):
             # if x is not included, invalid chars have distance 1 but data[i,i] returns 0 by construction
             if include_x or i != j:
                 assert data[i, j] == min(max_distance, dist)
+                assert lt_matrix[max(i, j), min(i, j)] == min(max_distance, dist)
             # should also agree with output of distance function for these two sequences
             assert dist == hammingdist.distance(
                 sequences[i], sequences[j], include_x=include_x