Merge remote-tracking branch 'origin/latest' into lirber/mastiff

sourmash-bio · Nov 25, 2023 · e8e2c51
2 parents d4ddd8a + 17bc030
commit e8e2c51
Show file tree

Hide file tree

Showing 27 changed files with 296 additions and 289 deletions.
diff --git a/.github/workflows/build_wheel.yml b/.github/workflows/build_wheel.yml
@@ -42,7 +42,7 @@ jobs:
       - uses: actions/setup-python@v4
         name: Install Python
         with:
-          python-version: '3.9'
+          python-version: '3.10'
 
       - name: Build wheels
         uses: pypa/cibuildwheel@v2.16.2

diff --git a/.github/workflows/build_wheel_all_archs.yml b/.github/workflows/build_wheel_all_archs.yml
@@ -42,7 +42,7 @@ jobs:
       - uses: actions/setup-python@v4
         name: Install Python
         with:
-          python-version: '3.9'
+          python-version: '3.10'
 
       # Added due to weird error when building inside docker container
       # for other platforms...

diff --git a/.github/workflows/dev_envs.yml b/.github/workflows/dev_envs.yml
@@ -45,7 +45,7 @@ jobs:
       uses: conda-incubator/setup-miniconda@3b0f2504dd76ef23b6d31f291f4913fb60ab5ff3
       with:
         auto-update-conda: true
-        python-version: 3.9
+        python-version: "3.10"
         channels: conda-forge,bioconda
         miniforge-variant: Mambaforge
         miniforge-version: latest
@@ -59,6 +59,6 @@ jobs:
       shell: bash -l {0}
       run: mamba install 'tox>=3.27,<4' tox-conda rust git compilers pandoc libstdcxx-ng
 
-    - name: run tests for 3.9
+    - name: run tests for 3.10
       shell: bash -l {0}
-      run: tox -e py39
+      run: tox -e py310
diff --git a/.github/workflows/python.yml b/.github/workflows/python.yml
@@ -14,7 +14,7 @@ jobs:
     strategy:
       matrix:
         os: [ubuntu-22.04, macos-latest]
-        py: ["3.11", "3.10", "3.9", "3.8"]
+        py: ["3.12", "3.11", "3.10"]
       fail-fast: false
 
     steps:
@@ -49,14 +49,14 @@ jobs:
         uses: r-lib/actions/setup-pandoc@v2
 
       - name: Set up IPFS
-        if: startsWith(runner.os, 'Linux') && (matrix.py == '3.9')
+        if: startsWith(runner.os, 'Linux') && (matrix.py == '3.10')
         uses: ibnesayeed/setup-ipfs@master
         with:
           ipfs_version: 0.6
           run_daemon: true
 
       - name: Start Redis
-        if: startsWith(runner.os, 'Linux') && (matrix.py == '3.9')
+        if: startsWith(runner.os, 'Linux') && (matrix.py == '3.10')
         uses: supercharge/redis-github-action@1.7.0
         with:
           redis-version: 6

diff --git a/README.md b/README.md
@@ -61,7 +61,7 @@ A quickstart tutorial [is available](https://sourmash.readthedocs.io/en/latest/t
 
 ### Requirements
 
-sourmash runs under Python 3.7 and later.  The base
+sourmash runs under Python 3.10 and later.  The base
 requirements are screed, cffi, numpy, matplotlib, and scipy.  Conda
 (see below) will install everything necessary, and is our recommended
 installation method.

diff --git a/binder/environment.yml b/binder/environment.yml
@@ -3,8 +3,8 @@ channels:
   - bioconda
   - defaults
 dependencies:
-  - python>=3.9
-  - sourmash>=4.8.2
+  - python>=3.10
+  - sourmash>=4.8.4
   - screed
   - matplotlib
   - pandas

diff --git a/doc/developer.md b/doc/developer.md
@@ -10,7 +10,7 @@ You can get the latest development branch with:
 ```
 git clone https://github.com/sourmash-bio/sourmash.git
 ```
-sourmash runs under Python 3.8 and later.
+sourmash runs under Python 3.10 and later.
 
 We recommend using `conda` or `Nix` for setting up an environment for developing
 new features, running tests and code quality checks.
@@ -87,7 +87,7 @@ running tests and checks during development.
 `tox -l` lists available tasks.
 
 You can run tests by invoking `make test` in the sourmash directory;
-`tox -e py39` will run the Python tests with Python 3.9,
+`tox -e py310` will run the Python tests with Python 3.10,
 and `cargo test` will run the Rust tests.
 
 ## Adding new changes

diff --git a/doc/requirements.md b/doc/requirements.md
@@ -11,7 +11,7 @@ in a second or so on a rather slow 2016 Mac laptop.
 MinHash sketches and signatures are quite small on disk.
 
 sourmash should run with no modification on Linux and Mac OS X,
-under Python 3.8 and later.  Please see [the development repository README][0]
+under Python 3.10 and later.  Please see [the development repository README][0]
 for
 information on source code, tests, and continuous integration.
 

diff --git a/doc/runtime.txt b/doc/runtime.txt
@@ -1 +1 @@
-3.7
+3.10
diff --git a/doc/sourmash-sketch.md b/doc/sourmash-sketch.md
@@ -204,6 +204,7 @@ A parameter string is a space-delimited collection that can contain one or more
 * `num=<int>` - create a standard MinHash with no more than `<num>` k-mers kept. This will produce sketches identical to [mash sketches](https://mash.readthedocs.io/en/latest/). `num` is incompatible with `scaled`. See [our guide to signature resolution](using-sourmash-a-guide.md#what-resolution-should-my-signatures-be--how-should-i-create-them) for more information.
 * `abund` / `noabund` - create abundance-weighted (or not) sketches. See [Classify signatures: Abundance Weighting](classifying-signatures.md#abundance-weighting) for details of how this works.
 * `dna`, `protein`, `dayhoff`, `hp` - create this kind of sketch. Note that `sourmash sketch dna -p protein` and `sourmash sketch protein -p dna` are invalid; please use `sourmash sketch translate` for the former.
+* `seed=<int>` - set the random number seed used for k-mer hashing. This is for advanced users who want to choose a completely different set of k-mers for sketches! The default is 42.
 
 For all field names but `k`, if multiple fields in a parameter string are provided, the last one encountered overrides the previous values. For `k`, if multiple ksizes are specified in a single parameter string, sketches for all ksizes specified are created.
 

diff --git a/doc/support.md b/doc/support.md
@@ -101,9 +101,9 @@ and our intent is that it will support as-yet unreleased versions of Python 3.x
 (e.g. 3.10) moving forward.
 
 For future versions of sourmash, we plan to follow the
-[Numpy NEP 29](https://numpy.org/neps/nep-0029-deprecation_policy.html)
+[Scientific Python SPEC 0](https://scientific-python.org/specs/spec-0000/)
 proposal for Python version support. For example, this
-would mean that we would drop support for Python 3.8 on April 14,
+means that we dropped support for Python 3.9 on October 10,
 2023.
 
 ### Rust API

diff --git a/flake.nix b/flake.nix
@@ -93,9 +93,9 @@
 
             git
             stdenv.cc.cc.lib
+            (python312.withPackages (ps: with ps; [ virtualenv ]))
             (python311.withPackages (ps: with ps; [ virtualenv tox cffi ]))
             (python310.withPackages (ps: with ps; [ virtualenv ]))
-            (python39.withPackages (ps: with ps; [ virtualenv ]))
 
             rust-cbindgen
             maturin

diff --git a/pyproject.toml b/pyproject.toml
@@ -64,8 +64,8 @@ classifiers = [
   "Operating System :: POSIX :: Linux",
   "Operating System :: MacOS :: MacOS X",
   "Programming Language :: Rust",
-  "Programming Language :: Python :: 3.8",
-  "Programming Language :: Python :: 3.9",
+  "Programming Language :: Python :: 3.12",
+  "Programming Language :: Python :: 3.11",
   "Programming Language :: Python :: 3.10",
   "Topic :: Scientific/Engineering :: Bio-Informatics",
 ]
@@ -79,10 +79,9 @@ dependencies = [
   "deprecation>=2.0.6",
   "cachetools>=4,<6",
   "bitstring>=3.1.9,<5",
-  "importlib_metadata>=3.6;python_version<'3.10'"
 ]
 
-requires-python = ">=3.8"
+requires-python = ">=3.10"
 
 [metadata]
 license = { text = "BSD 3-Clause License" }
@@ -164,7 +163,7 @@ line_length = 88
 known_first_party = ["sourmash"]
 
 [tool.cibuildwheel]
-build = "cp39-*"
+build = "cp310-*"
 skip = "*-win32 *-manylinux_i686 *-musllinux_*"
 before-all = [
   "curl https://sh.rustup.rs -sSf | sh -s -- -y --default-toolchain=stable",

diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py
@@ -270,12 +270,13 @@ def _compute_merged(args, signatures_factory):
         notify('... reading sequences from {}', filename)
 
         n = None
-        for n, record in enumerate(screed.open(filename)):
-            if n % 10000 == 0 and n:
-                notify('\r... {} {}', filename, n, end='')
+        with screed.open(filename) as f:
+            for n, record in enumerate(f):
+                if n % 10000 == 0 and n:
+                    notify('\r... {} {}', filename, n, end='')
 
-            add_seq(sigs, record.sequence,
-                    args.input_is_protein, args.check_sequence)
+                add_seq(sigs, record.sequence,
+                        args.input_is_protein, args.check_sequence)
         if n is not None:
             notify('... {} {} sequences', filename, n + 1)
             total_seq += n + 1

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
@@ -248,7 +248,8 @@ def plot(args):
     D_filename = args.distances
 
     notify(f'loading comparison matrix from {D_filename}...')
-    D = numpy.load(open(D_filename, 'rb'))
+    with open(D_filename, 'rb') as f:
+        D = numpy.load(f)
     # not sure how to change this to use f-strings
     notify('...got {} x {} matrix.', *D.shape)
 
@@ -274,7 +275,8 @@ def plot(args):
             labelfilename = D_filename + '.labels.txt'
 
         notify(f'loading labels from {labelfilename}')
-        labeltext = [ x.strip() for x in open(labelfilename) ]
+        with open(labelfilename) as f:
+            labeltext = [ x.strip() for x in f ]
 
         if len(labeltext) != D.shape[0]:
             error('{} labels != matrix size, exiting', len(labeltext))
@@ -1204,24 +1206,24 @@ def do_search():
         return results
 
     notify('reading sequences from stdin')
-    screed_iter = screed.open(args.inp_file)
     watermark = WATERMARK_SIZE
 
     # iterate over input records
     n = 0
-    for n, record in enumerate(screed_iter):
-        # at each watermark, print status & check cardinality
-        if n >= watermark:
-            notify(f'\r... read {n} sequences', end='')
-            watermark += WATERMARK_SIZE
-
-            if do_search():
-                break
-
-        if args.input_is_protein:
-            E.add_protein(record.sequence)
-        else:
-            E.add_sequence(record.sequence, False)
+    with screed.open(args.inp_file) as screed_iter:
+        for n, record in enumerate(screed_iter):
+            # at each watermark, print status & check cardinality
+            if n >= watermark:
+                notify(f'\r... read {n} sequences', end='')
+                watermark += WATERMARK_SIZE
+
+                if do_search():
+                    break
+
+            if args.input_is_protein:
+                E.add_protein(record.sequence)
+            else:
+                E.add_sequence(record.sequence, False)
 
     results = do_search()
     if not results:

diff --git a/src/sourmash/save_load.py b/src/sourmash/save_load.py
@@ -229,7 +229,7 @@ def _error_on_fastaq(filename, **kwargs):
     success = False
     try:
         with screed.open(filename) as it:
-            _ = next(iter(it))
+            _ = next(it)
 
             success = True
     except:
@@ -288,7 +288,7 @@ def _get_signatures_from_rust(siglist):
     # Rust supports multiple. For now, go through serializing
     # and deserializing the signature! See issue #1167 for more.
     json_str = sourmash.save_signatures(siglist)
-    for ss in sourmash.load_signatures(json_str):
+    for ss in sourmash.signature.load_signatures(json_str):
         yield ss
 
 

diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py
@@ -1134,62 +1134,63 @@ def kmers(args):
         notify(f"opening sequence file '{filename}'")
         n_files_searched += 1
 
-        for record in screed.open(filename):
-            seq_mh = query_mh.copy_and_clear()
-
-            # protein? dna?
-            if is_protein:
-                seq_mh.add_protein(record.sequence)
-            else:
-                try:
-                    seq_mh.add_sequence(record.sequence,
-                                        not args.check_sequence)
-                except ValueError as exc:
-                    seqname = record.name
-                    if len(seqname) > 40:
-                        seqname = seqname[:37] + '...'
-                    notify(f"ERROR in sequence '{seqname}', file '{filename}'")
-                    notify(str(exc))
-                    if args.force:
-                        notify("(continuing)")
-                        continue
-                    else:
-                        sys.exit(-1)
-
-            if seq_mh.intersection(query_mh):
-                # match!
-
-                # output matching sequences:
-                if save_seqs:
-                    save_seqs.fp.write(f">{record.name}\n{record.sequence}\n")
-                    n_sequences_found += 1
-                    n_bp_saved += len(record.sequence)
-
-                # output matching k-mers:
-                if kmer_w:
-                    seq = record.sequence
-                    kh_iter = seq_mh.kmers_and_hashes(seq, force=False,
-                                                      is_protein=is_protein)
-                    for kmer, hashval in kh_iter:
-                        if hashval in query_mh.hashes:
-                            found_mh.add_hash(hashval)
-                            n_kmers_found += 1
-                            d = dict(sequence_file=filename,
-                                     sequence_name=record.name,
-                                     kmer=kmer, hashval=hashval)
-                            kmer_w.writerow(d)
-
-                # add seq_mh to found_mh
-                found_mh += seq_mh.intersection(query_mh)
-
-            # provide progress indicator based on bp...
-            n_sequences_searched += 1
-            n_bp_searched += len(record.sequence)
-
-            if n_bp_searched >= progress_threshold:
-                notify(f"... searched {n_bp_searched} from {n_files_searched} files so far")
-                while n_bp_searched >= progress_threshold:
-                    progress_threshold += progress_interval
+        with screed.open(filename) as f:
+            for record in f:
+                seq_mh = query_mh.copy_and_clear()
+
+                # protein? dna?
+                if is_protein:
+                    seq_mh.add_protein(record.sequence)
+                else:
+                    try:
+                        seq_mh.add_sequence(record.sequence,
+                                            not args.check_sequence)
+                    except ValueError as exc:
+                        seqname = record.name
+                        if len(seqname) > 40:
+                            seqname = seqname[:37] + '...'
+                        notify(f"ERROR in sequence '{seqname}', file '{filename}'")
+                        notify(str(exc))
+                        if args.force:
+                            notify("(continuing)")
+                            continue
+                        else:
+                            sys.exit(-1)
+
+                if seq_mh.intersection(query_mh):
+                    # match!
+
+                    # output matching sequences:
+                    if save_seqs:
+                        save_seqs.fp.write(f">{record.name}\n{record.sequence}\n")
+                        n_sequences_found += 1
+                        n_bp_saved += len(record.sequence)
+
+                    # output matching k-mers:
+                    if kmer_w:
+                        seq = record.sequence
+                        kh_iter = seq_mh.kmers_and_hashes(seq, force=False,
+                                                          is_protein=is_protein)
+                        for kmer, hashval in kh_iter:
+                            if hashval in query_mh.hashes:
+                                found_mh.add_hash(hashval)
+                                n_kmers_found += 1
+                                d = dict(sequence_file=filename,
+                                         sequence_name=record.name,
+                                         kmer=kmer, hashval=hashval)
+                                kmer_w.writerow(d)
+
+                    # add seq_mh to found_mh
+                    found_mh += seq_mh.intersection(query_mh)
+
+                # provide progress indicator based on bp...
+                n_sequences_searched += 1
+                n_bp_searched += len(record.sequence)
+
+                if n_bp_searched >= progress_threshold:
+                    notify(f"... searched {n_bp_searched} from {n_files_searched} files so far")
+                    while n_bp_searched >= progress_threshold:
+                        progress_threshold += progress_interval
 
     # END major for loop. Now, clean up!
     if save_kmers: