Add Stephenson datasets (#565)

* Add Stephenson datasets * Add changelog * Added preprocessing description regarding stephenson_5k dataset * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: Mario Kanetscheider <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
scverse · Oct 22, 2024 · e02a82b · e02a82b
1 parent 7c5726b
commit e02a82b
Show file tree

Hide file tree

Showing 3 changed files with 27 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -13,6 +13,7 @@ and this project adheres to [Semantic Versioning][].
 ### Additions
 
 -   Add a `mask_obs` argument to `tl.clonotype_network` that allows to compute the clonotype networks on a subset of the cells ([#557](https://github.com/scverse/scirpy/pull/557)).
+-   Add `datasets.stephenson2021_5k`, an example dataset for the upcoming BCR tutorial ([#565](https://github.com/scverse/scirpy/pull/565)).
 
 ### Fixes
 

diff --git a/docs/api.rst b/docs/api.rst
@@ -246,6 +246,7 @@ Example datasets
    datasets.wu2020
    datasets.wu2020_3k
    datasets.maynard2020
+   datasets.stephenson2021_5k
 
 Reference databases
 ^^^^^^^^^^^^^^^^^^^

diff --git a/src/scirpy/datasets/__init__.py b/src/scirpy/datasets/__init__.py
@@ -28,14 +28,15 @@
 
 _FIGSHARE = pooch.create(
     path=pooch.os_cache("scirpy"),
-    base_url="doi:10.6084/m9.figshare.22249894.v1",
+    base_url="doi:10.6084/m9.figshare.22249894.v2",
     version=version("scirpy"),
     version_dev="main",
     env="SCIRPY_DATA_DIR",
     registry={
         "wu2020.h5mu": "md5:ed30d9c1c44cae544f4c080a2451118b",
         "wu2020_3k.h5mu": "md5:12c57c790f8a403751304c9de5a18cbf",
         "maynard2020.h5mu": "md5:da64ac62e3e92c80eaf0e8eef6537ac7",
+        "stephenson2021_5k.h5mu": "md5:6ea26f9d95525371ff9028f8e99ed474",
     },
 )
 _POOCH_INFO = dedent(
@@ -124,6 +125,29 @@ def maynard2020() -> MuData:
     return mudata.read_h5mu(fname)
 
 
+@_doc_params(
+    processing_code=indent(_read_to_str(HERE / "_processing_scripts/maynard2020.py"), " " * 8),
+    pooch_info=_POOCH_INFO,
+)
+def stephenson2021_5k() -> MuData:
+    """\
+    Return the dataset from :cite:`Stephenson2021` as MuData object, downsampled
+    to 5000 BCR-containing cells.
+
+    The original study sequenced 1,141,860 cells from 143 PBMC samples collected from patients with varying severity of COVID-19 and from control groups.
+    Gene expression, TCR-enriched and BCR-enriched libraries were prepared for each sample according to the 10x Genomics protocol, and a NovaSeq 6000 was used for sequencing.
+
+    A preprocessed dataset for the transcriptome library was obtained from `ArrayExpress <https://www.ebi.ac.uk/biostudies/arrayexpress/studies/E-MTAB-10026>`__.
+    A preprocessed dataset for the BCR-enriched library was obtained from `clatworthylab's GitHub <https://github.com/clatworthylab/COVID_analysis>`__.
+    Both datasets have already passed quality control, and all cells that didn't express a BCR were discarded.
+
+    To reduce computation time, we included only 5 samples from each of the COVID-19-positive groups and randomly subsampled down to a total of 5k cells.
+
+    """
+    fname = cast(PathLike, _FIGSHARE.fetch("stephenson2021_5k.h5mu", progressbar=True))
+    return mudata.read_h5mu(fname)
+
+
 def vdjdb(cached: bool = True, *, cache_path="data/vdjdb.h5ad") -> AnnData:
     """\
     Download VDJdb and process it into an AnnData object.