From 174c6f4b3a7e00a2f78a51d2791a47fe2f851cba Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 20 Oct 2020 15:22:34 +0200 Subject: [PATCH 1/6] Add preprocessing script for maynard2020 --- scirpy/datasets/_processing_scripts/.jupytext | 3 + .../_processing_scripts/maynard2020.py | 218 ++++++++++++++++++ 2 files changed, 221 insertions(+) create mode 100644 scirpy/datasets/_processing_scripts/.jupytext create mode 100644 scirpy/datasets/_processing_scripts/maynard2020.py diff --git a/scirpy/datasets/_processing_scripts/.jupytext b/scirpy/datasets/_processing_scripts/.jupytext new file mode 100644 index 000000000..d08f6a76b --- /dev/null +++ b/scirpy/datasets/_processing_scripts/.jupytext @@ -0,0 +1,3 @@ +# Always pair ipynb notebooks to md files +default_jupytext_formats = "py:light" +default_notebook_metadata_filter = "-kernelspec" diff --git a/scirpy/datasets/_processing_scripts/maynard2020.py b/scirpy/datasets/_processing_scripts/maynard2020.py new file mode 100644 index 000000000..f689e8a86 --- /dev/null +++ b/scirpy/datasets/_processing_scripts/maynard2020.py @@ -0,0 +1,218 @@ +# --- +# jupyter: +# jupytext: +# formats: ipynb,py:light +# notebook_metadata_filter: -kernelspec +# text_representation: +# extension: .py +# format_name: light +# format_version: '1.5' +# jupytext_version: 1.6.0 +# --- + +# %load_ext autoreload +# %autoreload 2 + +# + +# %env OPENBLAS_NUM_THREADS=16 +# %env OMP_NUM_THREADS=16 +# %env MKL_NUM_THREADS=16 +# %env OMP_NUM_cpus=16 +# %env MKL_NUM_cpus=16 +# %env OPENBLAS_NUM_cpus=16 +import sys + +sys.path.insert(0, "../../..") + +import scirpy as ir +import scanpy as sc +import pandas as pd +import numpy as np +from scipy.sparse import csr_matrix +from pathlib import Path + +# - + +# The dataset has been downloaded from ENA and then processed using the Smart-seq2 Pipeline: +# https://github.com/nf-core/smartseq2/ + +DATASET_DIR = Path("/data/datasets/Maynard_Bivona_2020_NSCLC/") + +# ### Read counts and TPMs + +count_mat = pd.read_csv( + DATASET_DIR / "smartseq2_pipeline/resultCOUNT.txt", + sep="\t", + low_memory=False, + index_col="Geneid", +) + +tpm_mat = pd.read_csv( + DATASET_DIR / "smartseq2_pipeline/resultTPM.txt", sep="\t", low_memory=False +) + +# summarize to gene symbol for the ~300 duplicated symbols. +tpm_mat_symbol = tpm_mat.drop("gene_id", axis="columns").groupby("gene_symbol").sum() + +# ### Read and sanitize metadata + +# + +sample_info = pd.read_csv(DATASET_DIR / "scripts/sra_sample_info.csv", low_memory=False) +cell_metadata = pd.read_csv( + DATASET_DIR / "scripts/cell_metadata.csv", low_memory=False, index_col=0 +) + +# combine metadata +meta = sample_info.merge( + cell_metadata, left_on="cell_ID", right_on="cell_id" +).set_index("Run") +# - + +meta = meta.drop( + [ + "Assay Type", + "AvgSpotLen", + "SRA Study", + "ReleaseDate", + "Bases", + "disease", + "Biomaterial_provider", + "BioProject", + "Isolate", + "Sample Name", + "BioSample", + "BioSampleModel", + "Bytes", + "Center Name", + "Consent", + "DATASTORE filetype", + "DATASTORE provider", + "DATASTORE region", + "Experiment", + "Instrument", + "LibraryLayout", + "Library Name", + "LibrarySelection", + "cell_ID", + "LibrarySource", + "Organism", + "Platform", + "gender", + "SAMPLE_TYPE", + "TISSUE", + ], + axis="columns", +).rename( + { + "Age": "age", + "smokingHx": "smoking_status", + "stage.at.dx": "stage_at_diagnosis", + }, + axis="columns", +) + +meta.tail() + +# ### Find all cells for which we have both counts, TPM and annotation + +has_counts = set(count_mat.columns) +has_tpm = set(tpm_mat.columns) +has_meta = set(meta.index.values) + +cell_ids = np.array(list(has_counts & has_tpm & has_meta)) + +# ### Build adata + +var = ( + pd.DataFrame(count_mat.index) + .rename({"Geneid": "gene_symbol"}, axis="columns") + .set_index("gene_symbol") + .sort_index() +) + +adata = sc.AnnData( + X=csr_matrix(tpm_mat_symbol.loc[var.index, cell_ids].values.T), + layers={"raw_counts": csr_matrix(count_mat.loc[var.index, cell_ids].values.T)}, + var=var, + obs=meta.loc[cell_ids, :], +) + +adata_tcr = ir.io.read_tracer( + "/data/datasets/Maynard_Bivona_2020_NSCLC/smartseq2_pipeline/TraCeR" +) + +adata_bcr = ir.io.read_bracer( + "/data/datasets/Maynard_Bivona_2020_NSCLC/smartseq2_pipeline/BraCeR/filtered_BCR_summary/changeodb.tab" +) + +ir.pp.merge_with_ir(adata, adata_tcr) + +ir.pp.merge_with_ir(adata, adata_bcr) + +# Write out the dataset +adata.write_h5ad("maynard2020.h5ad", compression="lzf") + +# ### Some quick peeks at the data + +ir.tl.chain_qc(adata) +ir.pl.group_abundance(adata, groupby="receptor_type") + +# Quality control - calculate QC covariates +adata.obs["n_counts"] = adata.layers["raw_counts"].sum(axis=1) +adata.obs["log_counts"] = np.log(adata.obs["n_counts"]) +adata.obs["n_genes"] = (adata.layers["raw_counts"] > 0).sum(1) +mt_gene_mask = np.array([gene.lower().startswith("mt-") for gene in adata.var_names]) +adata.layers["raw_counts"][:, mt_gene_mask] +adata.obs["mt_counts"] = adata.layers["raw_counts"][:, mt_gene_mask].sum(axis=1) +adata.obs["mt_frac"] = np.divide(adata.obs["mt_counts"], adata.obs["n_counts"]) + +import seaborn as sns + +# Quality control - plot QC metrics +# Sample quality plots +t1 = sc.pl.violin(adata, "n_counts", groupby="sample_name", size=2, log=True, cut=0) +t2 = sc.pl.violin(adata, "mt_frac", groupby="sample_name") + +adata_pp = adata[adata.obs["mt_frac"] < 0.2, :].copy() + +adata_pp.shape + +# don't need this als already TPM +# sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6) +sc.pp.log1p(adata_pp) + +sc.pp.pca(adata_pp, n_comps=50, svd_solver="arpack") + +sc.pp.neighbors(adata_pp) + +sc.tl.umap(adata_pp) + +sc.pl.umap(adata_pp, color=["sample_name", "patient_id"]) + +sc.pl.umap(adata_pp, color=["CD8A", "CD3E", "PDCD1"]) + +sc.pl.umap(adata_pp, color=["receptor_type"]) + +sc.pl.umap(adata_pp, color="receptor_type", groups=["ambiguous", "multichain"]) + +from IPython.display import display + +with pd.option_context("display.max_rows", 30, "display.max_columns", 9999): + display(adata_pp.obs.loc[adata_pp.obs["receptor_type"] == "ambiguous", :]) + +ir.pp.ir_neighbors(adata_pp, receptor_arms="all", dual_ir="primary_only") + +ir.pp.ir_neighbors( + adata_pp, + metric="alignment", + sequence="aa", + receptor_arms="all", + dual_ir="primary_only", +) + +ir.tl.define_clonotypes(adata_pp) + +ir.tl.clonotype_network(adata_pp, min_size=2) +ir.pl.clonotype_network(adata_pp, color="clonotype", legend_loc="none") + +ir.pl.clonotype_network(adata_pp, color=["receptor_subtype"]) From 94e5a4c07572d9bb114992dbaee8eef394aac6ab Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 20 Oct 2020 16:53:30 +0200 Subject: [PATCH 2/6] Add maynard2020 dataset --- docs/references.bib | 14 +++ scirpy/datasets/__init__.py | 34 +++++++- .../_processing_scripts/maynard2020.py | 85 ------------------- 3 files changed, 45 insertions(+), 88 deletions(-) diff --git a/docs/references.bib b/docs/references.bib index 6eaf93002..509b0f99c 100644 --- a/docs/references.bib +++ b/docs/references.bib @@ -1,3 +1,17 @@ +@article{Maynard2020, + doi = {10.1016/j.cell.2020.07.017}, + url = {https://doi.org/10.1016/j.cell.2020.07.017}, + year = {2020}, + month = sep, + publisher = {Elsevier {BV}}, + volume = {182}, + number = {5}, + pages = {1232--1251.e22}, + author = {Ashley Maynard and Caroline E. McCoach and Julia K. Rotow and Lincoln Harris and Franziska Haderk and D. Lucas Kerr and Elizabeth A. Yu and Erin L. Schenk and Weilun Tan and Alexander Zee and Michelle Tan and Philippe Gui and Tasha Lea and Wei Wu and Anatoly Urisman and Kirk Jones and Rene Sit and Pallav K. Kolli and Eric Seeley and Yaron Gesthalter and Daniel D. Le and Kevin A. Yamauchi and David M. Naeger and Sourav Bandyopadhyay and Khyati Shah and Lauren Cech and Nicholas J. Thomas and Anshal Gupta and Mayra Gonzalez and Hien Do and Lisa Tan and Bianca Bacaltos and Rafael Gomez-Sjoberg and Matthew Gubens and Thierry Jahan and Johannes R. Kratz and David Jablons and Norma Neff and Robert C. Doebele and Jonathan Weissman and Collin M. Blakely and Spyros Darmanis and Trever G. Bivona}, + title = {Therapy-Induced Evolution of Human Lung Cancer Revealed by Single-Cell {RNA} Sequencing}, + journal = {Cell} +} + @article{Vettermann2010, doi = {10.1111/j.1600-065x.2010.00935.x}, url = {https://doi.org/10.1111/j.1600-065x.2010.00935.x}, diff --git a/scirpy/datasets/__init__.py b/scirpy/datasets/__init__.py index dbafec4ac..9e42ee2bf 100644 --- a/scirpy/datasets/__init__.py +++ b/scirpy/datasets/__init__.py @@ -13,7 +13,7 @@ ) def wu2020() -> AnnData: """\ - Return the dataset from [Wu2020]_ as AnnData object. + Return the dataset from :cite:`Wu2020` as AnnData object. 200k cells, of which 100k have TCRs. @@ -23,7 +23,7 @@ def wu2020() -> AnnData: {processing_code} """ - url = "https://github.com/icbi-lab/scirpy/releases/download/v0.4.2/wu2020.h5ad" + url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020.h5ad" filename = settings.datasetdir / "wu2020.h5ad" adata = read(filename, backup_url=url) return adata @@ -47,7 +47,35 @@ def wu2020_3k() -> AnnData: """ # os.makedirs(settings.datasetdir, exist_ok=True) # TODO host it on github or similar - url = "https://github.com/icbi-lab/scirpy/releases/download/v0.4.2/wu2020_3k.h5ad" + url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/wu2020_3k.h5ad" filename = settings.datasetdir / "wu2020_3k.h5ad" adata = read(filename, backup_url=url) return adata + + +@_doc_params( + processing_code=indent( + _read_to_str(HERE / "_processing_scripts/maynard2020.py"), " " + ) +) +def maynard2020() -> AnnData: + """\ + Return the dataset from :cite:`Maynard2020` as AnnData object. + + 21k cells profiled with Smart-seq2, of which 3,500 have :term:`TCRs`: and + 1,500 have :term:`BCRs`. + + The raw FASTQ files have been obtained from `PRJNA591860 `__ + and processed using the nf-core `Smart-seq2 pipeline `__. + + The processed files have been imported and transformed into an :class:`anndata.AnnData` + object using the following script: + + .. code-block:: python + + {processing_code} + """ + url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/maynard2020.h5ad" + filename = settings.datasetdir / "wu2020.h5ad" + adata = read(filename, backup_url=url) + return adata diff --git a/scirpy/datasets/_processing_scripts/maynard2020.py b/scirpy/datasets/_processing_scripts/maynard2020.py index f689e8a86..ddb4cb8b8 100644 --- a/scirpy/datasets/_processing_scripts/maynard2020.py +++ b/scirpy/datasets/_processing_scripts/maynard2020.py @@ -1,19 +1,3 @@ -# --- -# jupyter: -# jupytext: -# formats: ipynb,py:light -# notebook_metadata_filter: -kernelspec -# text_representation: -# extension: .py -# format_name: light -# format_version: '1.5' -# jupytext_version: 1.6.0 -# --- - -# %load_ext autoreload -# %autoreload 2 - -# + # %env OPENBLAS_NUM_THREADS=16 # %env OMP_NUM_THREADS=16 # %env MKL_NUM_THREADS=16 @@ -31,8 +15,6 @@ from scipy.sparse import csr_matrix from pathlib import Path -# - - # The dataset has been downloaded from ENA and then processed using the Smart-seq2 Pipeline: # https://github.com/nf-core/smartseq2/ @@ -140,79 +122,12 @@ adata_tcr = ir.io.read_tracer( "/data/datasets/Maynard_Bivona_2020_NSCLC/smartseq2_pipeline/TraCeR" ) - adata_bcr = ir.io.read_bracer( "/data/datasets/Maynard_Bivona_2020_NSCLC/smartseq2_pipeline/BraCeR/filtered_BCR_summary/changeodb.tab" ) ir.pp.merge_with_ir(adata, adata_tcr) - ir.pp.merge_with_ir(adata, adata_bcr) # Write out the dataset adata.write_h5ad("maynard2020.h5ad", compression="lzf") - -# ### Some quick peeks at the data - -ir.tl.chain_qc(adata) -ir.pl.group_abundance(adata, groupby="receptor_type") - -# Quality control - calculate QC covariates -adata.obs["n_counts"] = adata.layers["raw_counts"].sum(axis=1) -adata.obs["log_counts"] = np.log(adata.obs["n_counts"]) -adata.obs["n_genes"] = (adata.layers["raw_counts"] > 0).sum(1) -mt_gene_mask = np.array([gene.lower().startswith("mt-") for gene in adata.var_names]) -adata.layers["raw_counts"][:, mt_gene_mask] -adata.obs["mt_counts"] = adata.layers["raw_counts"][:, mt_gene_mask].sum(axis=1) -adata.obs["mt_frac"] = np.divide(adata.obs["mt_counts"], adata.obs["n_counts"]) - -import seaborn as sns - -# Quality control - plot QC metrics -# Sample quality plots -t1 = sc.pl.violin(adata, "n_counts", groupby="sample_name", size=2, log=True, cut=0) -t2 = sc.pl.violin(adata, "mt_frac", groupby="sample_name") - -adata_pp = adata[adata.obs["mt_frac"] < 0.2, :].copy() - -adata_pp.shape - -# don't need this als already TPM -# sc.pp.normalize_per_cell(adata_pp, counts_per_cell_after=1e6) -sc.pp.log1p(adata_pp) - -sc.pp.pca(adata_pp, n_comps=50, svd_solver="arpack") - -sc.pp.neighbors(adata_pp) - -sc.tl.umap(adata_pp) - -sc.pl.umap(adata_pp, color=["sample_name", "patient_id"]) - -sc.pl.umap(adata_pp, color=["CD8A", "CD3E", "PDCD1"]) - -sc.pl.umap(adata_pp, color=["receptor_type"]) - -sc.pl.umap(adata_pp, color="receptor_type", groups=["ambiguous", "multichain"]) - -from IPython.display import display - -with pd.option_context("display.max_rows", 30, "display.max_columns", 9999): - display(adata_pp.obs.loc[adata_pp.obs["receptor_type"] == "ambiguous", :]) - -ir.pp.ir_neighbors(adata_pp, receptor_arms="all", dual_ir="primary_only") - -ir.pp.ir_neighbors( - adata_pp, - metric="alignment", - sequence="aa", - receptor_arms="all", - dual_ir="primary_only", -) - -ir.tl.define_clonotypes(adata_pp) - -ir.tl.clonotype_network(adata_pp, min_size=2) -ir.pl.clonotype_network(adata_pp, color="clonotype", legend_loc="none") - -ir.pl.clonotype_network(adata_pp, color=["receptor_subtype"]) From 9af72b91236189e1172df5fe2d229003fad56a8b Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 20 Oct 2020 16:58:20 +0200 Subject: [PATCH 3/6] Add maynard2020 to APIdoc --- docs/api.rst | 1 + scirpy/datasets/__init__.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index f280c0be0..71a7a2a81 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -179,6 +179,7 @@ Datasets: `datasets` wu2020 wu2020_3k + maynard2020 diff --git a/scirpy/datasets/__init__.py b/scirpy/datasets/__init__.py index 9e42ee2bf..4e66fd62b 100644 --- a/scirpy/datasets/__init__.py +++ b/scirpy/datasets/__init__.py @@ -62,8 +62,8 @@ def maynard2020() -> AnnData: """\ Return the dataset from :cite:`Maynard2020` as AnnData object. - 21k cells profiled with Smart-seq2, of which 3,500 have :term:`TCRs`: and - 1,500 have :term:`BCRs`. + 21k cells from NSCLC profiled with Smart-seq2, of which 3,500 have :term:`TCRs` + and 1,500 have :term:`BCRs`. The raw FASTQ files have been obtained from `PRJNA591860 `__ and processed using the nf-core `Smart-seq2 pipeline `__. From 5372f11541fc124bee78a16113c4de077c74bfd7 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 20 Oct 2020 17:32:16 +0200 Subject: [PATCH 4/6] Fix git describe command for setuptools_scm Adding the data-releases broke setuptools_scm. By changing the `git describe` command in pyproject.toml, I can make sure that only vX.X.X tags are considered. --- pyproject.toml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index cef749de9..db4df58e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -79,3 +79,5 @@ minversion = 6.0 testpaths = 'scirpy/tests' norecursedirs = [ '.*', 'build', 'dist', '*.egg', 'data', '__pycache__'] +[tool.setuptools_scm] +git_describe_command = "git describe --dirty --tags --long --match v*.*.*" From 54fc69f9f6a7c6f61bf5b6d0f89e749d7893ecc4 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 20 Oct 2020 17:44:39 +0200 Subject: [PATCH 5/6] Fix filename --- scirpy/datasets/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scirpy/datasets/__init__.py b/scirpy/datasets/__init__.py index 4e66fd62b..660fd8cde 100644 --- a/scirpy/datasets/__init__.py +++ b/scirpy/datasets/__init__.py @@ -76,6 +76,6 @@ def maynard2020() -> AnnData: {processing_code} """ url = "https://github.com/icbi-lab/scirpy/releases/download/d0.1.0/maynard2020.h5ad" - filename = settings.datasetdir / "wu2020.h5ad" + filename = settings.datasetdir / "maynard2020.h5ad" adata = read(filename, backup_url=url) return adata From 951680ed1def800ae84cbb01d7c8559f2ba7cd2a Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 20 Oct 2020 17:53:40 +0200 Subject: [PATCH 6/6] Explicitly pass pyproject.toml parameters to get_version --- scirpy/_metadata.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scirpy/_metadata.py b/scirpy/_metadata.py index cbee97fc3..9ef210071 100644 --- a/scirpy/_metadata.py +++ b/scirpy/_metadata.py @@ -11,7 +11,13 @@ proj = pytoml.loads((here.parent / "pyproject.toml").read_text()) metadata = proj["tool"]["flit"]["metadata"] - __version__ = get_version(root="..", relative_to=__file__) + __version__ = get_version( + # Allegedly, the parameters from pyproject.toml should be passed automatically. + # However, this didn't work, so I pass them explicitly here. + root="..", + relative_to=__file__, + **proj["tool"]["setuptools_scm"] + ) __author__ = metadata["author"] __email__ = metadata["author-email"]