From 1b53c3fff418d53dfe2a51576aab3be9ff61fbd4 Mon Sep 17 00:00:00 2001 From: Liam Keegan Date: Mon, 19 Aug 2024 15:54:03 +0200 Subject: [PATCH] Allow users to access the distances data as a numpy array - add read-only property `lt_array` to `Dataset` that provides the raw distances data as a 1-d numpy array - add example of use to readme - bump deps - add python 3.13 - temporarily skip tests for Python 3.13 wheel on linux due to numpy import error - bump version - resolves #134 --- .github/workflows/ci.yml | 2 +- .github/workflows/wheels.yml | 4 ++-- .pre-commit-config.yaml | 4 ++-- CMakeLists.txt | 4 ++-- README.md | 7 +++++++ ext/Catch2 | 2 +- ext/benchmark | 2 +- ext/pybind11 | 2 +- pyproject.toml | 7 ++++--- python/hammingdist.cc | 10 ++++++++-- python/tests/test_hammingdist.py | 14 ++++++++++++-- 11 files changed, 41 insertions(+), 17 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f005aa8..20e976e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -32,7 +32,7 @@ jobs: - os: macos-13 open-mp: "OFF" neon: "OFF" - - os: macos-14 + - os: macos-latest open-mp: "OFF" neon: "ON" - os: windows-latest diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml index fc68223..2b9f874 100644 --- a/.github/workflows/wheels.yml +++ b/.github/workflows/wheels.yml @@ -20,13 +20,13 @@ jobs: strategy: matrix: - os: [ubuntu-latest, macos-13, macos-14, windows-latest] + os: [ubuntu-latest, macos-13, macos-latest, windows-latest] steps: - uses: actions/checkout@v4 with: submodules: "recursive" - - uses: pypa/cibuildwheel@v2.19 + - uses: pypa/cibuildwheel@v2.20 env: CIBW_MANYLINUX_X86_64_IMAGE: sameli/manylinux2014_x86_64_cuda_11.8 - uses: actions/upload-artifact@v4 diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 21c8675..cd42e2c 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -8,7 +8,7 @@ repos: - id: mixed-line-ending - repo: https://github.com/psf/black - rev: 24.4.2 + rev: 24.8.0 hooks: - id: black @@ -29,7 +29,7 @@ repos: - id: prettier - repo: https://github.com/python-jsonschema/check-jsonschema - rev: 0.28.6 + rev: 0.29.1 hooks: - id: check-github-workflows - id: check-readthedocs diff --git a/CMakeLists.txt b/CMakeLists.txt index ec3b1a3..8c708a4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,8 @@ -cmake_minimum_required(VERSION 3.23..3.29) +cmake_minimum_required(VERSION 3.23..3.30) project( hammingdist - VERSION 1.2.0 + VERSION 1.3.0 LANGUAGES CXX) include(CTest) diff --git a/README.md b/README.md index f86b15b..04e0145 100644 --- a/README.md +++ b/README.md @@ -58,6 +58,13 @@ data.dump_sparse("sparse.txt", threshold=3) # If the `remove_duplicates` option was used, the sequence indices can also be written. # For each input sequence, this prints the corresponding index in the output: data.dump_sequence_indices("indices.txt") + +# The lower-triangular distance elements can also be directly accessed as a 1-d numpy array: +lt_array = data.lt_array +# The elements in this array correspond to the 2-d indices (row=1,col=0), (row=2,col=0), (row=2,col=1), ... +# These indices can be generated using the numpy tril_indices function, e.g. to construct the lower-triangular matrix: +lt_matrix = np.zeros((n_seq, n_seq)) +lt_matrix[np.tril_indices(n_seq, -1)] = lt_array ``` ## Duplicates diff --git a/ext/Catch2 b/ext/Catch2 index f981c9c..4e8d92b 160000 --- a/ext/Catch2 +++ b/ext/Catch2 @@ -1 +1 @@ -Subproject commit f981c9cbcac07a2690e5a86767eba490b5465463 +Subproject commit 4e8d92bf02f7d1c8006a0e7a5ecabd8e62d98502 diff --git a/ext/benchmark b/ext/benchmark index 015d1a0..12235e2 160000 --- a/ext/benchmark +++ b/ext/benchmark @@ -1 +1 @@ -Subproject commit 015d1a091af6937488242b70121858bce8fd40e9 +Subproject commit 12235e24652fc7f809373e7c11a5f73c5763fc4c diff --git a/ext/pybind11 b/ext/pybind11 index 941f45b..c6239a8 160000 --- a/ext/pybind11 +++ b/ext/pybind11 @@ -1 +1 @@ -Subproject commit 941f45bcb51457884fa1afd6e24a67377d70f75c +Subproject commit c6239a8a1b6871cc0fb5f7af885a02ffd1349f9d diff --git a/pyproject.toml b/pyproject.toml index d50c2d7..cc6794d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "hammingdist" -version = "1.2.0" +version = "1.3.0" description = "A fast tool to calculate Hamming distances" readme = "README.md" license = {text = "MIT"} @@ -23,6 +23,7 @@ classifiers=[ "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", "Programming Language :: Python :: 3.12", + "Programming Language :: Python :: 3.13", "Programming Language :: Python :: Implementation :: CPython", "Programming Language :: Python :: Implementation :: PyPy", "Operating System :: MacOS :: MacOS X", @@ -39,7 +40,7 @@ test = ["pytest", "numpy"] [tool.scikit-build] cmake.version = ">=3.23" -cmake.verbose = true +build.verbose = true [tool.scikit-build.cmake.define] BUILD_TESTING = "OFF" @@ -48,7 +49,7 @@ HAMMING_BUILD_PYTHON = "ON" [tool.cibuildwheel] skip = "*-manylinux_i686 *-musllinux*" -test-skip = "pp*" +test-skip = "pp* cp313-manylinux_x86_64" test-extras = "test" test-command = "pytest {project}/python/tests -v" environment = { BLAS="None", LAPACK="None", ATLAS="None" } diff --git a/python/hammingdist.cc b/python/hammingdist.cc index f0641c3..68aea31 100644 --- a/python/hammingdist.cc +++ b/python/hammingdist.cc @@ -43,7 +43,10 @@ PYBIND11_MODULE(hammingdist, m) { &DataSet::dump_sequence_indices, "Dump row index in distances matrix for each input sequence") .def("__getitem__", &DataSet::operator[]) - .def_readonly("_distances", &DataSet::result); + .def_readonly("_distances", &DataSet::result) + .def_property_readonly("lt_array", [](DataSet &self) { + return py::array(self.result.size(), self.result.data()); + }); py::class_>(m, "DataSetLarge") .def("dump", &DataSet::dump, @@ -58,7 +61,10 @@ PYBIND11_MODULE(hammingdist, m) { .def("dump_sequence_indices", &DataSet::dump_sequence_indices, "Dump row index in distances matrix for each input sequence") .def("__getitem__", &DataSet::operator[]) - .def_readonly("_distances", &DataSet::result); + .def_readonly("_distances", &DataSet::result) + .def_property_readonly("lt_array", [](DataSet &self) { + return py::array(self.result.size(), self.result.data()); + }); m.def("from_stringlist", &from_stringlist, "Creates a dataset from a list of strings"); diff --git a/python/tests/test_hammingdist.py b/python/tests/test_hammingdist.py index 1a885b1..35c1d2e 100644 --- a/python/tests/test_hammingdist.py +++ b/python/tests/test_hammingdist.py @@ -13,6 +13,8 @@ def write_fasta_file(filename, sequences): def check_output_sizes(dat, n_in, n_out, tmp_out_file, fasta_sequence_indices=None): + assert dat.lt_array.shape == (n_out * (n_out - 1) // 2,) + dat.dump(tmp_out_file) dump = np.loadtxt(tmp_out_file, delimiter=",") assert len(dump) == n_out @@ -97,8 +99,9 @@ def test_from_fasta(from_fasta_func, use_gpu, tmp_path): ) @pytest.mark.parametrize("max_distance", [0, 1, 2, 3, 89, 497, 9999999]) def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path): - # generate 50 sequences, each with 25 characters - sequences = ["".join(random.choices(chars, k=25)) for i in range(50)] + n_seq = 50 + n_chars = 25 + sequences = ["".join(random.choices(chars, k=n_chars)) for i in range(n_seq)] fasta_file = str(tmp_path / "fasta.txt") write_fasta_file(fasta_file, sequences) # calculate distances matrix @@ -108,6 +111,12 @@ def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path): include_x=include_x, max_distance=max_distance, ) + # get lower-triangular data as 1-d array + lt_array = data.lt_array + assert lt_array.shape == (n_seq * (n_seq - 1) // 2,) + # reshape to lower-triangular matrix + lt_matrix = np.zeros((n_seq, n_seq), dtype=np.uint8) + lt_matrix[np.tril_indices(n_seq, -1)] = lt_array # use each sequence in turn as the reference sequence & calculate reference distances for i, sequence in enumerate(sequences): vec = hammingdist.fasta_reference_distances( @@ -120,6 +129,7 @@ def test_fasta_reference_distances(chars, include_x, max_distance, tmp_path): # if x is not included, invalid chars have distance 1 but data[i,i] returns 0 by construction if include_x or i != j: assert data[i, j] == min(max_distance, dist) + assert lt_matrix[max(i, j), min(i, j)] == min(max_distance, dist) # should also agree with output of distance function for these two sequences assert dist == hammingdist.distance( sequences[i], sequences[j], include_x=include_x