Merge pull request #211 from icbi-lab/maynard-example-data3

Add Maynard2020 example dataset
scverse · Oct 20, 2020 · cce2a03 · cce2a03
2 parents c1cd6ff + 951680e
commit cce2a03
Show file tree

Hide file tree

Showing 7 changed files with 191 additions and 4 deletions.
diff --git a/docs/api.rst b/docs/api.rst
@@ -179,6 +179,7 @@ Datasets: `datasets`
 
    wu2020
    wu2020_3k
+   maynard2020
 
 
 

diff --git a/docs/references.bib b/docs/references.bib
@@ -1,3 +1,17 @@
+@article{Maynard2020,
+  doi       = {10.1016/j.cell.2020.07.017},
+  url       = {https://doi.org/10.1016/j.cell.2020.07.017},
+  year      = {2020},
+  month     = sep,
+  publisher = {Elsevier {BV}},
+  volume    = {182},
+  number    = {5},
+  pages     = {1232--1251.e22},
+  author    = {Ashley Maynard and Caroline E. McCoach and Julia K. Rotow and Lincoln Harris and Franziska Haderk and D. Lucas Kerr and Elizabeth A. Yu and Erin L. Schenk and Weilun Tan and Alexander Zee and Michelle Tan and Philippe Gui and Tasha Lea and Wei Wu and Anatoly Urisman and Kirk Jones and Rene Sit and Pallav K. Kolli and Eric Seeley and Yaron Gesthalter and Daniel D. Le and Kevin A. Yamauchi and David M. Naeger and Sourav Bandyopadhyay and Khyati Shah and Lauren Cech and Nicholas J. Thomas and Anshal Gupta and Mayra Gonzalez and Hien Do and Lisa Tan and Bianca Bacaltos and Rafael Gomez-Sjoberg and Matthew Gubens and Thierry Jahan and Johannes R. Kratz and David Jablons and Norma Neff and Robert C. Doebele and Jonathan Weissman and Collin M. Blakely and Spyros Darmanis and Trever G. Bivona},
+  title     = {Therapy-Induced Evolution of Human Lung Cancer Revealed by Single-Cell {RNA} Sequencing},
+  journal   = {Cell}
+}
+
 @article{Vettermann2010,
   doi       = {10.1111/j.1600-065x.2010.00935.x},
   url       = {https://doi.org/10.1111/j.1600-065x.2010.00935.x},

diff --git a/pyproject.toml b/pyproject.toml
@@ -79,3 +79,5 @@ minversion = 6.0
 testpaths = 'scirpy/tests'
 norecursedirs = [ '.*', 'build', 'dist', '*.egg', 'data', '__pycache__']
 
+[tool.setuptools_scm]
+git_describe_command = "git describe --dirty --tags --long --match v*.*.*"
diff --git a/scirpy/_metadata.py b/scirpy/_metadata.py
@@ -11,7 +11,13 @@
     proj = pytoml.loads((here.parent / "pyproject.toml").read_text())
     metadata = proj["tool"]["flit"]["metadata"]
 
-    __version__ = get_version(root="..", relative_to=__file__)
+    __version__ = get_version(
+        # Allegedly, the parameters from pyproject.toml should be passed automatically.
+        # However, this didn't work, so I pass them explicitly here.
+        root="..",
+        relative_to=__file__,
+        **proj["tool"]["setuptools_scm"]
+    )
     __author__ = metadata["author"]
     __email__ = metadata["author-email"]
 

diff --git a/scirpy/datasets/__init__.py b/scirpy/datasets/__init__.py
@@ -13,7 +13,7 @@
 )
 def wu2020() -> AnnData:
     """\
-    Return the dataset from [Wu2020]_ as AnnData object.
+    Return the dataset from :cite:`Wu2020` as AnnData object.
 
     200k cells, of which 100k have TCRs.
 
@@ -23,7 +23,7 @@ def wu2020() -> AnnData:
 
     {processing_code}
     """
-    url = "https://github.com/icbi-lab/scirpy/releases/download/v0.4.2/wu2020.h5ad"
+    url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020.h5ad"
     filename = settings.datasetdir / "wu2020.h5ad"
     adata = read(filename, backup_url=url)
     return adata
@@ -47,7 +47,35 @@ def wu2020_3k() -> AnnData:
     """
     # os.makedirs(settings.datasetdir, exist_ok=True)
     # TODO host it on github or similar
-    url = "https://github.com/icbi-lab/scirpy/releases/download/v0.4.2/wu2020_3k.h5ad"
+    url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020_3k.h5ad"
     filename = settings.datasetdir / "wu2020_3k.h5ad"
     adata = read(filename, backup_url=url)
     return adata
+
+
+@_doc_params(
+    processing_code=indent(
+        _read_to_str(HERE / "_processing_scripts/maynard2020.py"),
+        "   ",
+    )
+)
+def maynard2020() -> AnnData:
+    """\
+    Load the :cite:`Maynard2020` dataset as an AnnData object.
+
+    The dataset comprises 21k NSCLC cells profiled with Smart-seq2; 3,500 of them
+    have :term:`TCRs<TCR>` and 1,500 have :term:`BCRs<BCR>`.
+
+    Raw FASTQ files were obtained from `PRJNA591860 <https://www.ebi.ac.uk/ena/browser/view/PRJNA591860>`__
+    and processed with the nf-core `Smart-seq2 pipeline <https://github.com/nf-core/smartseq2/>`__.
+
+    The pipeline output was then imported and converted into an :class:`anndata.AnnData`
+    object with the following script:
+
+    .. code-block:: python
+
+    {processing_code}
+    """
+    # The local copy lives in the configured dataset directory; the GitHub
+    # release asset is handed to `read` as the backup URL.
+    local_copy = settings.datasetdir / "maynard2020.h5ad"
+    return read(
+        local_copy,
+        backup_url="https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/maynard2020.h5ad",
+    )
diff --git a/scirpy/datasets/_processing_scripts/.jupytext b/scirpy/datasets/_processing_scripts/.jupytext
@@ -0,0 +1,3 @@
+# Always pair ipynb notebooks to py files (light format)
+default_jupytext_formats = "py:light"
+default_notebook_metadata_filter = "-kernelspec"
diff --git a/scirpy/datasets/_processing_scripts/maynard2020.py b/scirpy/datasets/_processing_scripts/maynard2020.py
@@ -0,0 +1,133 @@
+# %env OPENBLAS_NUM_THREADS=16
+# %env OMP_NUM_THREADS=16
+# %env MKL_NUM_THREADS=16
+# %env OMP_NUM_cpus=16
+# %env MKL_NUM_cpus=16
+# %env OPENBLAS_NUM_cpus=16
+import sys
+
+# Put the directory three levels above the working directory first on the
+# import path, so that `import scirpy` resolves there before any installed copy.
+sys.path.insert(0, "../../..")
+
+import scirpy as ir
+import scanpy as sc
+import pandas as pd
+import numpy as np
+from scipy.sparse import csr_matrix
+from pathlib import Path
+
+# Builds the `maynard2020.h5ad` example dataset: TPM values in `X`, raw counts in
+# the `raw_counts` layer, cell metadata in `obs`, merged with TCR and BCR data.
+#
+# The dataset has been downloaded from ENA and then processed using the Smart-seq2 Pipeline:
+# https://github.com/nf-core/smartseq2/
+
+# Root directory of the pipeline output; every input below is read relative to it.
+DATASET_DIR = Path("/data/datasets/Maynard_Bivona_2020_NSCLC/")
+
+# ### Read counts and TPMs
+
+count_mat = pd.read_csv(
+    DATASET_DIR / "smartseq2_pipeline/resultCOUNT.txt",
+    sep="\t",
+    low_memory=False,
+    index_col="Geneid",
+)
+
+tpm_mat = pd.read_csv(
+    DATASET_DIR / "smartseq2_pipeline/resultTPM.txt", sep="\t", low_memory=False
+)
+
+# summarize to gene symbol for the ~300 duplicated symbols.
+tpm_mat_symbol = tpm_mat.drop("gene_id", axis="columns").groupby("gene_symbol").sum()
+
+# ### Read and sanitize metadata
+
+# +
+sample_info = pd.read_csv(DATASET_DIR / "scripts/sra_sample_info.csv", low_memory=False)
+cell_metadata = pd.read_csv(
+    DATASET_DIR / "scripts/cell_metadata.csv", low_memory=False, index_col=0
+)
+
+# combine metadata
+# (inner join on the cell identifier; the result is indexed by the `Run` column)
+meta = sample_info.merge(
+    cell_metadata, left_on="cell_ID", right_on="cell_id"
+).set_index("Run")
+# -
+
+# Drop the columns that are not kept in the final dataset and
+# harmonize the names of the remaining clinical columns.
+meta = meta.drop(
+    [
+        "Assay Type",
+        "AvgSpotLen",
+        "SRA Study",
+        "ReleaseDate",
+        "Bases",
+        "disease",
+        "Biomaterial_provider",
+        "BioProject",
+        "Isolate",
+        "Sample Name",
+        "BioSample",
+        "BioSampleModel",
+        "Bytes",
+        "Center Name",
+        "Consent",
+        "DATASTORE filetype",
+        "DATASTORE provider",
+        "DATASTORE region",
+        "Experiment",
+        "Instrument",
+        "LibraryLayout",
+        "Library Name",
+        "LibrarySelection",
+        "cell_ID",
+        "LibrarySource",
+        "Organism",
+        "Platform",
+        "gender",
+        "SAMPLE_TYPE",
+        "TISSUE",
+    ],
+    axis="columns",
+).rename(
+    {
+        "Age": "age",
+        "smokingHx": "smoking_status",
+        "stage.at.dx": "stage_at_diagnosis",
+    },
+    axis="columns",
+)
+
+meta.tail()
+
+# ### Find all cells for which we have counts, TPM and annotation
+
+has_counts = set(count_mat.columns)
+# `tpm_mat` still carries the `gene_id` and `gene_symbol` columns; they are not
+# cell ids and are expected to drop out in the intersection below.
+has_tpm = set(tpm_mat.columns)
+has_meta = set(meta.index.values)
+
+# Sort the ids: the iteration order of a set of strings differs between
+# interpreter runs (hash randomization), which would make the order of the
+# cells in the written dataset non-reproducible.
+cell_ids = np.array(sorted(has_counts & has_tpm & has_meta))
+
+# ### Build adata
+
+# Gene annotation table, indexed by the (sorted) row labels of the count matrix.
+# NOTE(review): assumes the `Geneid` values of the count matrix are gene symbols
+# that are all present in `tpm_mat_symbol` - verify against the pipeline output.
+var = (
+    pd.DataFrame(count_mat.index)
+    .rename({"Geneid": "gene_symbol"}, axis="columns")
+    .set_index("gene_symbol")
+    .sort_index()
+)
+
+# Both matrices are subset to the same genes and cells and transposed
+# to cells x genes before being stored as sparse matrices.
+adata = sc.AnnData(
+    X=csr_matrix(tpm_mat_symbol.loc[var.index, cell_ids].values.T),
+    layers={"raw_counts": csr_matrix(count_mat.loc[var.index, cell_ids].values.T)},
+    var=var,
+    obs=meta.loc[cell_ids, :],
+)
+
+# Receptor data is read from the same pipeline output directory as the
+# expression data; the paths are passed as plain strings.
+adata_tcr = ir.io.read_tracer(str(DATASET_DIR / "smartseq2_pipeline/TraCeR"))
+adata_bcr = ir.io.read_bracer(
+    str(
+        DATASET_DIR
+        / "smartseq2_pipeline/BraCeR/filtered_BCR_summary/changeodb.tab"
+    )
+)
+
+ir.pp.merge_with_ir(adata, adata_tcr)
+ir.pp.merge_with_ir(adata, adata_bcr)
+
+# Write out the dataset
+adata.write_h5ad("maynard2020.h5ad", compression="lzf")