Skip to content

Commit

Permalink
Merge pull request #211 from icbi-lab/maynard-example-data3
Browse files Browse the repository at this point in the history
Add Maynard2020 example dataset
  • Loading branch information
grst authored Oct 20, 2020
2 parents c1cd6ff + 951680e commit cce2a03
Show file tree
Hide file tree
Showing 7 changed files with 191 additions and 4 deletions.
1 change: 1 addition & 0 deletions docs/api.rst
Original file line number Diff line number Diff line change
Expand Up @@ -179,6 +179,7 @@ Datasets: `datasets`

wu2020
wu2020_3k
maynard2020



Expand Down
14 changes: 14 additions & 0 deletions docs/references.bib
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
@article{Maynard2020,
doi = {10.1016/j.cell.2020.07.017},
url = {https://doi.org/10.1016/j.cell.2020.07.017},
year = {2020},
month = sep,
publisher = {Elsevier {BV}},
volume = {182},
number = {5},
pages = {1232--1251.e22},
author = {Ashley Maynard and Caroline E. McCoach and Julia K. Rotow and Lincoln Harris and Franziska Haderk and D. Lucas Kerr and Elizabeth A. Yu and Erin L. Schenk and Weilun Tan and Alexander Zee and Michelle Tan and Philippe Gui and Tasha Lea and Wei Wu and Anatoly Urisman and Kirk Jones and Rene Sit and Pallav K. Kolli and Eric Seeley and Yaron Gesthalter and Daniel D. Le and Kevin A. Yamauchi and David M. Naeger and Sourav Bandyopadhyay and Khyati Shah and Lauren Cech and Nicholas J. Thomas and Anshal Gupta and Mayra Gonzalez and Hien Do and Lisa Tan and Bianca Bacaltos and Rafael Gomez-Sjoberg and Matthew Gubens and Thierry Jahan and Johannes R. Kratz and David Jablons and Norma Neff and Robert C. Doebele and Jonathan Weissman and Collin M. Blakely and Spyros Darmanis and Trever G. Bivona},
title = {Therapy-Induced Evolution of Human Lung Cancer Revealed by Single-Cell {RNA} Sequencing},
journal = {Cell}
}

@article{Vettermann2010,
doi = {10.1111/j.1600-065x.2010.00935.x},
url = {https://doi.org/10.1111/j.1600-065x.2010.00935.x},
Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -79,3 +79,5 @@ minversion = 6.0
testpaths = 'scirpy/tests'
norecursedirs = [ '.*', 'build', 'dist', '*.egg', 'data', '__pycache__']

[tool.setuptools_scm]
git_describe_command = "git describe --dirty --tags --long --match v*.*.*"
8 changes: 7 additions & 1 deletion scirpy/_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,13 @@
proj = pytoml.loads((here.parent / "pyproject.toml").read_text())
metadata = proj["tool"]["flit"]["metadata"]

__version__ = get_version(root="..", relative_to=__file__)
__version__ = get_version(
# Allegedly, the parameters from pyproject.toml should be passed automatically.
# However, this didn't work, so I pass them explicitly here.
root="..",
relative_to=__file__,
**proj["tool"]["setuptools_scm"]
)
__author__ = metadata["author"]
__email__ = metadata["author-email"]

Expand Down
34 changes: 31 additions & 3 deletions scirpy/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
)
def wu2020() -> AnnData:
"""\
Return the dataset from [Wu2020]_ as AnnData object.
Return the dataset from :cite:`Wu2020` as AnnData object.
200k cells, of which 100k have TCRs.
Expand All @@ -23,7 +23,7 @@ def wu2020() -> AnnData:
{processing_code}
"""
url = "https://github.com/icbi-lab/scirpy/releases/download/v0.4.2/wu2020.h5ad"
url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020.h5ad"
filename = settings.datasetdir / "wu2020.h5ad"
adata = read(filename, backup_url=url)
return adata
Expand All @@ -47,7 +47,35 @@ def wu2020_3k() -> AnnData:
"""
# os.makedirs(settings.datasetdir, exist_ok=True)
# TODO host it on github or similar
url = "https://github.com/icbi-lab/scirpy/releases/download/v0.4.2/wu2020_3k.h5ad"
url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020_3k.h5ad"
filename = settings.datasetdir / "wu2020_3k.h5ad"
adata = read(filename, backup_url=url)
return adata


@_doc_params(
    processing_code=indent(
        _read_to_str(HERE / "_processing_scripts/maynard2020.py"), " "
    )
)
def maynard2020() -> AnnData:
    """\
    Return the dataset from :cite:`Maynard2020` as AnnData object.

    21k cells from NSCLC profiled with Smart-seq2, of which 3,500 have
    :term:`TCRs<TCR>` and 1,500 have :term:`BCRs<BCR>`.

    The raw FASTQ files have been obtained from
    `PRJNA591860 <https://www.ebi.ac.uk/ena/browser/view/PRJNA591860>`__
    and processed using the nf-core
    `Smart-seq2 pipeline <https://github.com/nf-core/smartseq2/>`__.

    The processed files have been imported and transformed into an
    :class:`anndata.AnnData` object using the following script:

    .. code-block:: python

    {processing_code}
    """
    # The file is looked up in the configured dataset directory; the GitHub
    # release asset is handed to `read` as the backup location.
    local_copy = settings.datasetdir / "maynard2020.h5ad"
    release_asset = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/maynard2020.h5ad"
    return read(local_copy, backup_url=release_asset)
3 changes: 3 additions & 0 deletions scirpy/datasets/_processing_scripts/.jupytext
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Always pair ipynb notebooks with py files (light format)
default_jupytext_formats = "py:light"
default_notebook_metadata_filter = "-kernelspec"
133 changes: 133 additions & 0 deletions scirpy/datasets/_processing_scripts/maynard2020.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
# %env OPENBLAS_NUM_THREADS=16
# %env OMP_NUM_THREADS=16
# %env MKL_NUM_THREADS=16
# %env OMP_NUM_cpus=16
# %env MKL_NUM_cpus=16
# %env OPENBLAS_NUM_cpus=16
import sys

sys.path.insert(0, "../../..")

import scirpy as ir
import scanpy as sc
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from pathlib import Path

# The dataset has been downloaded from ENA and then processed using the Smart-seq2 Pipeline:
# https://github.com/nf-core/smartseq2/

# Base directory of the dataset; the input tables below are read from it.
DATASET_DIR = Path("/data/datasets/Maynard_Bivona_2020_NSCLC/")

# ### Read counts and TPMs

# Count table, indexed by its `Geneid` column.
count_mat = pd.read_csv(
    DATASET_DIR / "smartseq2_pipeline/resultCOUNT.txt",
    index_col="Geneid",
    low_memory=False,
    sep="\t",
)

# TPM table; `gene_id` and `gene_symbol` are kept as ordinary columns here.
tpm_mat = pd.read_csv(
    DATASET_DIR / "smartseq2_pipeline/resultTPM.txt",
    low_memory=False,
    sep="\t",
)

# summarize to gene symbol for the ~300 duplicated symbols.
tpm_mat_symbol = tpm_mat.drop(columns="gene_id").groupby("gene_symbol").sum()

# ### Read and sanitize metadata

# +
# Per-run sample sheet and per-cell annotation table.
sample_info = pd.read_csv(
    DATASET_DIR / "scripts/sra_sample_info.csv",
    low_memory=False,
)
cell_metadata = pd.read_csv(
    DATASET_DIR / "scripts/cell_metadata.csv",
    index_col=0,
    low_memory=False,
)

# combine metadata: join both tables on the cell identifier (pandas' default
# inner join) and index the result by the `Run` column.
meta = pd.merge(
    sample_info,
    cell_metadata,
    left_on="cell_ID",
    right_on="cell_id",
).set_index("Run")
# -

# Remove the columns that are not kept in the final annotation table.
meta = meta.drop(
    columns=[
        "Assay Type",
        "AvgSpotLen",
        "SRA Study",
        "ReleaseDate",
        "Bases",
        "disease",
        "Biomaterial_provider",
        "BioProject",
        "Isolate",
        "Sample Name",
        "BioSample",
        "BioSampleModel",
        "Bytes",
        "Center Name",
        "Consent",
        "DATASTORE filetype",
        "DATASTORE provider",
        "DATASTORE region",
        "Experiment",
        "Instrument",
        "LibraryLayout",
        "Library Name",
        "LibrarySelection",
        "cell_ID",
        "LibrarySource",
        "Organism",
        "Platform",
        "gender",
        "SAMPLE_TYPE",
        "TISSUE",
    ]
)
# Give the remaining irregularly named columns snake_case names.
meta = meta.rename(
    columns={
        "Age": "age",
        "smokingHx": "smoking_status",
        "stage.at.dx": "stage_at_diagnosis",
    }
)

# Preview of the last rows; the result is discarded when run as a plain script.
meta.tail()

# ### Find all cells for which we have counts, TPM and annotation

# Identifiers present in each of the three inputs. `tpm_mat` still carries the
# `gene_id`/`gene_symbol` columns, but they drop out in the intersection unless
# they also occur in both other sets.
has_counts = set(count_mat.columns)
has_tpm = set(tpm_mat.columns)
has_meta = set(meta.index.values)

# Sort the intersection: iteration order of a set of strings changes between
# interpreter runs (hash randomisation), which would otherwise make the row
# order of the resulting dataset differ from run to run.
cell_ids = np.array(sorted(has_counts & has_tpm & has_meta))

# ### Build adata

# Gene annotation: an otherwise empty frame whose sorted index holds the
# entries of the count table's `Geneid` index, renamed to `gene_symbol`.
var = (
    pd.DataFrame(count_mat.index)
    .rename({"Geneid": "gene_symbol"}, axis="columns")
    .set_index("gene_symbol")
    .sort_index()
)

# Both matrices are subset to the same genes and cells and transposed, so that
# rows follow `cell_ids` and columns follow `var.index`.
tpm_sparse = csr_matrix(tpm_mat_symbol.loc[var.index, cell_ids].values.T)
counts_sparse = csr_matrix(count_mat.loc[var.index, cell_ids].values.T)

adata = sc.AnnData(
    X=tpm_sparse,
    layers={"raw_counts": counts_sparse},
    var=var,
    obs=meta.loc[cell_ids, :],
)

# Derive the TraCeR/BraCeR locations from DATASET_DIR instead of repeating the
# absolute path, so that changing DATASET_DIR relocates every input at once.
# The paths are converted to `str`, i.e. the readers receive exactly the same
# arguments as with the previously hard-coded strings.
adata_tcr = ir.io.read_tracer(str(DATASET_DIR / "smartseq2_pipeline/TraCeR"))
adata_bcr = ir.io.read_bracer(
    str(
        DATASET_DIR
        / "smartseq2_pipeline/BraCeR/filtered_BCR_summary/changeodb.tab"
    )
)

# The return values are not used; presumably `merge_with_ir` adds the receptor
# annotation to `adata` in place (confirm against the scirpy API).
ir.pp.merge_with_ir(adata, adata_tcr)
ir.pp.merge_with_ir(adata, adata_bcr)

# Write out the dataset
adata.write_h5ad("maynard2020.h5ad", compression="lzf")

0 comments on commit cce2a03

Please sign in to comment.