From f7f556cdcaeef3824e81d32d43f9fee261a403e9 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 7 Apr 2021 19:14:16 +0200 Subject: [PATCH 1/6] WIP update docs --- docs/tutorials/tutorial_io.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/tutorials/tutorial_io.md b/docs/tutorials/tutorial_io.md index 381001065..6d8952402 100644 --- a/docs/tutorials/tutorial_io.md +++ b/docs/tutorials/tutorial_io.md @@ -46,6 +46,7 @@ AnnData and how Scirpy makes use of it, check out the :ref:`data structure `__. +.. TODO update!! .. important:: **The Scirpy data model** From 1943e53e60c2f9790e25eff827ada15a69706644 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 8 Apr 2021 09:48:56 +0200 Subject: [PATCH 2/6] Properly rename clonotype in upgrade_schema --- scirpy/io/_io.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scirpy/io/_io.py b/scirpy/io/_io.py index 4066661f2..00e1a1d24 100644 --- a/scirpy/io/_io.py +++ b/scirpy/io/_io.py @@ -618,10 +618,10 @@ def upgrade_schema(adata) -> None: "j_gene": "j_call", "c_gene": "c_call", "cdr3_nt": "junction", - "clonotype": "clone_id", }.items(), ) } + rename_dict["clonotype"] = "clone_id" adata.obs.rename(columns=rename_dict, inplace=True) adata.obs["extra_chains"] = None adata.uns["scirpy_version"] = __version__ From 66c845f1f9d75465088b1674d03da2b820ea4066 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 12 Apr 2021 10:09:27 +0200 Subject: [PATCH 3/6] Constrain CI to py 3.8 for win and macos --- .github/workflows/test.yml | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml index 9faada178..fee4afba8 100644 --- a/.github/workflows/test.yml +++ b/.github/workflows/test.yml @@ -9,12 +9,18 @@ on: jobs: test: if: "!contains(github.event.head_commit.message, 'skip ci')" - runs-on: ${{ matrix.os }} + runs-on: ${{ matrix.config.os }} strategy: fail-fast: false matrix: - python-version: [3.7, 3.8, 3.9] - os: 
[ubuntu-latest, macos-latest, windows-latest] + config: + - { python-version: 3.7, os: ubuntu-latest } + - { python-version: 3.8, os: ubuntu-latest } + - { python-version: 3.9, os: ubuntu-latest } + # 3.8 is enough for macos and windows. For 3.9 essential wheels are still missing and building + # from source is very painful, especially on windows. + - { python-version: 3.8, os: macos-latest } + - { python-version: 3.8, os: windows-latest } steps: - uses: actions/checkout@v2 @@ -26,18 +32,19 @@ jobs: key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }} restore-keys: | ${{ runner.os }}-pip- + # The HDF5 system requirements are necessary until pytables provides a wheel for python 3.9 - name: Install Ubuntu system dependencies - if: matrix.os == 'ubuntu-latest' + if: matrix.config.os == 'ubuntu-latest' run: | sudo apt-get install libhdf5-serial-dev - name: Install macOS system dependencies - if: matrix.os == 'macos-latest' + if: matrix.config.os == 'macos-latest' run: | brew install cairo pkg-config autoconf automake libtool - - name: Set up Python ${{ matrix.python-version }} + - name: Set up Python ${{ matrix.config.python-version }} uses: actions/setup-python@v1 with: - python-version: ${{ matrix.python-version }} + python-version: ${{ matrix.config.python-version }} - name: Install dependencies # TODO the separate numpy installation is to workaround an issue with dandelion's skbio dependency run: | From 45184103520110939500c21c2fce34243fa26a48 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 12 Apr 2021 13:03:34 +0200 Subject: [PATCH 4/6] Update IO tutorial --- docs/glossary.rst | 8 ++++--- docs/tutorials/tutorial_3k_tcr.md | 5 ++-- docs/tutorials/tutorial_io.md | 39 +++++++++++++++++++------------ scirpy/io/_io.py | 2 +- scirpy/io/_util.py | 13 +++++++---- 5 files changed, 40 insertions(+), 27 deletions(-) diff --git a/docs/glossary.rst b/docs/glossary.rst index 27144d926..8afb0e401 100644 --- a/docs/glossary.rst +++ b/docs/glossary.rst @@ -206,10 
+206,12 @@ Glossary page about our :ref:`IR model`. AIRR - Adaptive Immune Receptor Repertoire. - See also the `AIRR community `_. + Adaptive Immune Receptor Repertoire. Within the Scirpy documentation, we simply + speak of :term:`immune receptors (IR)`. - Within the Scirpy documentation, we simply speak of :term:`immune receptors (IR)`. + The `AIRR community `_ + defines standards around AIRR data. Scirpy supports the `AIRR Rearrangement `_ + schema and complies with the `AIRR Software Guidelines `_. Chain locus Scirpy supports all valid `IGMT locus names `_: diff --git a/docs/tutorials/tutorial_3k_tcr.md b/docs/tutorials/tutorial_3k_tcr.md index 3e0051317..fde72de2d 100644 --- a/docs/tutorials/tutorial_3k_tcr.md +++ b/docs/tutorials/tutorial_3k_tcr.md @@ -231,9 +231,6 @@ ax = ir.pl.group_abundance(adata, groupby="chain_pairing", target_col="source") ## Define clonotypes and clonotype clusters - - - .. warning:: @@ -277,6 +274,8 @@ The function :func:`scirpy.tl.define_clonotypes` matches cells based on the dist detects connected modules in the graph and annotates them as clonotypes. This will add a `clone_id` and `clone_id_size` column to `adata.obs`. +The `dual_ir` parameter defines how scirpy handles cells with :term:`more than one pair of receptors `. The default value is `any` which implies that cells with any of their primary or secondary receptor chain matching will be considered to be of the same clonotype. + Here, we define :term:`clonotypes ` based on nt-sequence identity. In a later step, we will define :term:`clonotype clusters ` based on amino-acid similarity. 
diff --git a/docs/tutorials/tutorial_io.md b/docs/tutorials/tutorial_io.md index 6d8952402..24bcc8138 100644 --- a/docs/tutorials/tutorial_io.md +++ b/docs/tutorials/tutorial_io.md @@ -12,7 +12,22 @@ jupyter: ```python %load_ext autoreload %autoreload 2 +import anndata +import logging + + +class NoCategoricalWarningFilter(logging.Filter): + """suppress "storing XXX as categorical" warnings.""" + + def filter(self, record): + m = record.getMessage() + return not m.startswith("storing") and m.endswith("as categorical.") + + +anndata.logging.anndata_logger.addFilter(NoCategoricalWarningFilter) +``` +```python import scirpy as ir import scanpy as sc from glob import glob @@ -21,14 +36,6 @@ import tarfile import anndata import warnings -# from numba import NumbaPerformanceWarning - -# # ignore numba performance warnings -# warnings.filterwarnings("ignore", category=NumbaPerformanceWarning) - -# suppress "storing XXX as categorical" warnings. -anndata.logging.anndata_logger.setLevel("ERROR") - sc.set_figure_params(figsize=(4, 4)) sc.settings.verbosity = 2 # verbosity: errors (0), warnings (1), info (2), hints (3) ``` @@ -46,21 +53,23 @@ AnnData and how Scirpy makes use of it, check out the :ref:`data structure `__. -.. TODO update!! - .. important:: **The Scirpy data model** Currently, the Scirpy data model has the following constraints: * BCR and TCR chains are supported. Chain loci must be valid :term:`Chain locus`, i.e. one of `TRA`, `TRG`, `IGK`, or `IGL` (chains with a :term:`VJ` junction) or - `TRB`, `TRD`, or `IGH` (chains with a :term:`VDJ` junction). Other chains are discarded. - * Non-productive chains are removed. *CellRanger*, *TraCeR*, and the *AIRR rearrangment format* + `TRB`, `TRD`, or `IGH` (chains with a :term:`VDJ` junction). + * Each cell can contain up to two `VJ` and two `VDJ` chains (:term:`Dual IR`). + Excess chains are ignored (those with lowest read count/:term:`UMI` count) + and cells flagged as :term:`Multichain-cell`. 
+ * Non-productive chains are ignored. *CellRanger*, *TraCeR*, and the *AIRR rearrangement format* flag these cells appropriately. When reading :ref:`custom formats `, you need to pass the flag explicitly or filter the chains beforehand. - * Each chain can contain up to two `VJ` and two `VDJ` chains (:term:`Dual IR`). - Excess chains are removed (those with lowest read count/:term:`UMI` count) - and cells flagged as :term:`Multichain-cell`. + * Excess chains, non-productive chains, or chains with invalid loci + are serialized to JSON and stored in the `extra_chains` column. They are not + used by scirpy except when exporting the `AnnData` object to :term:`AIRR` format. + For more information, see :ref:`receptor-model`. diff --git a/scirpy/io/_io.py b/scirpy/io/_io.py index 00e1a1d24..db271a8bb 100644 --- a/scirpy/io/_io.py +++ b/scirpy/io/_io.py @@ -665,7 +665,7 @@ def to_dandelion(adata: AnnData): def from_dandelion(dandelion, transfer=False) -> AnnData: - """Import data from dandelion (:cite:`Stephenson2021`). + """Import data from `Dandelion `_ (:cite:`Stephenson2021`). Parameters ---------- diff --git a/scirpy/io/_util.py b/scirpy/io/_util.py index 407f6d4dd..249074555 100644 --- a/scirpy/io/_util.py +++ b/scirpy/io/_util.py @@ -7,12 +7,15 @@ .. note:: Reading data into *Scirpy* has the following constraints: - * each cell can have up to four chains (:term:`Dual IR`): - two :term:`VJ` and two :term:`VDJ` chains. - * Excess chains are removed (those with lowest read count/:term:`UMI` count) + * each cell can have up to four productive chains chains (:term:`Dual IR`): + two :term:`VJ` and two :term:`VDJ` chains. + * Excess chains are ignored (those with lowest read count/:term:`UMI` count) and cells flagged as :term:`Multichain-cell`. - * non-productive chains are removed - * chain loci must be :term:`IGMT locus names`. + * non-productive chains are ignored. + * chain loci must be valid :term:`IGMT locus names`. 
+ * excess chains, non-productive chains, or chains with invalid loci + are serialized to JSON and stored in the `extra_chains` column. They are not + used by scirpy except when exporting the `AnnData` object to AIRR format. For more information, see :ref:`receptor-model`. """ From 4e922df4b4bd5497b54599649b4bcc21754563ab Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 12 Apr 2021 13:27:14 +0200 Subject: [PATCH 5/6] Update API page --- docs/api.rst | 18 ++++++++++++++---- docs/usage-principles.rst | 2 +- scirpy/io/_io.py | 16 ++++++++-------- scirpy/io/_util.py | 8 ++++---- 4 files changed, 27 insertions(+), 17 deletions(-) diff --git a/docs/api.rst b/docs/api.rst index fa2d42c6f..a84e102cd 100644 --- a/docs/api.rst +++ b/docs/api.rst @@ -16,18 +16,23 @@ as closely as possible. Input/Output: `io` ------------------ +.. module:: scirpy.io + .. note:: In scirpy v0.7.0 the way VDJ data is stored in `adata.obs` has changed to be fully compliant with the `AIRR Rearrangement `__ schema. Please use :func:`~scirpy.io.upgrade_schema` to make `AnnData` objects from previous scirpy versions compatible with the most recent scirpy workflow. + .. autosummary:: + :toctree: ./generated + + upgrade_schema + The following functions allow to import :term:`V(D)J` information from various formats. -.. module:: scirpy.io - .. autosummary:: :toctree: ./generated @@ -36,8 +41,14 @@ formats. read_tracer read_bracer read_airr - write_airr from_dandelion + +Scirpy can export data to the following formats: + +.. autosummary:: + :toctree: ./generated + + write_airr to_dandelion To convert own formats into the scirpy :ref:`data-structure`, we recommend building @@ -51,7 +62,6 @@ For more details, check the :ref:`Data loading tutorial `. 
AirrCell from_airr_cells to_airr_cells - upgrade_schema Preprocessing: `pp` diff --git a/docs/usage-principles.rst b/docs/usage-principles.rst index c17ffd234..366aef9c0 100644 --- a/docs/usage-principles.rst +++ b/docs/usage-principles.rst @@ -22,7 +22,7 @@ Scirpy is an extension to `Scanpy `_ and adheres * The :class:`~anndata.AnnData` instance is modified inplace, unless the functions is called with the keyword argument `inplace=False`. -We decided to handle a few minor points differenlty to Scanpy: +We decided to handle a few minor points differently to Scanpy: * Plotting functions with inexpensive computations (e.g. :func:`scirpy.pl.clonal_expansion`) call the corresponding tool (:func:`scirpy.tl.clonal_expansion`) on-the-fly and diff --git a/scirpy/io/_io.py b/scirpy/io/_io.py index db271a8bb..1f3060568 100644 --- a/scirpy/io/_io.py +++ b/scirpy/io/_io.py @@ -339,9 +339,7 @@ def read_airr( include_fields: Optional[Collection[str]] = DEFAULT_AIRR_FIELDS, ) -> AnnData: """\ - Read AIRR-compliant data. - - Reads data organized in the `AIRR rearrangement schema `_. + Read data from `AIRR rearrangement `_ format. The following columns are required by scirpy: * `cell_id` @@ -558,7 +556,7 @@ def read_bracer(path: Union[str, Path]) -> AnnData: @_check_upgrade_schema() def write_airr(adata: AnnData, filename: Union[str, Path]) -> None: - """Write immune receptor fields from `adata.obs` in AIRR Rearrangement TSV format. + """Export :term:`IR` data to :term:`AIRR` Rearrangement `tsv` format. Parameters ---------- @@ -629,8 +627,7 @@ def upgrade_schema(adata) -> None: @_check_upgrade_schema() def to_dandelion(adata: AnnData): - """ - Convert a scirpy-initialized AnnData object to Dandelion format using `to_ir_objs`. + """Export data to `Dandelion `_ (:cite:`Stephenson2021`). Parameters ---------- @@ -640,7 +637,6 @@ def to_dandelion(adata: AnnData): Returns ------- `Dandelion` object. 
- """ try: import dandelion as ddl @@ -664,8 +660,12 @@ def to_dandelion(adata: AnnData): return ddl.Dandelion(ddl.load_data(data)) +@_doc_params(doc_working_model=doc_working_model) def from_dandelion(dandelion, transfer=False) -> AnnData: - """Import data from `Dandelion `_ (:cite:`Stephenson2021`). + """\ + Import data from `Dandelion `_ (:cite:`Stephenson2021`). + + {doc_working_model} Parameters ---------- diff --git a/scirpy/io/_util.py b/scirpy/io/_util.py index 249074555..31c9af294 100644 --- a/scirpy/io/_util.py +++ b/scirpy/io/_util.py @@ -7,13 +7,13 @@ .. note:: Reading data into *Scirpy* has the following constraints: - * each cell can have up to four productive chains chains (:term:`Dual IR`): + * Each cell can have up to four productive chains (:term:`Dual IR`): two :term:`VJ` and two :term:`VDJ` chains. * Excess chains are ignored (those with lowest read count/:term:`UMI` count) and cells flagged as :term:`Multichain-cell`. - * non-productive chains are ignored. - * chain loci must be valid :term:`IGMT locus names`. - * excess chains, non-productive chains, or chains with invalid loci + * Non-productive chains are ignored. + * Chain loci must be valid :term:`IGMT locus names`. + * Excess chains, non-productive chains, or chains with invalid loci are serialized to JSON and stored in the `extra_chains` column. They are not used by scirpy except when exporting the `AnnData` object to AIRR format. 
From 8187b26a40674964deb9a42c42fea57e968f5a8f Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 12 Apr 2021 14:06:05 +0200 Subject: [PATCH 6/6] Fix log filtering --- docs/tutorials/tutorial_io.md | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/docs/tutorials/tutorial_io.md b/docs/tutorials/tutorial_io.md index 24bcc8138..8994896b3 100644 --- a/docs/tutorials/tutorial_io.md +++ b/docs/tutorials/tutorial_io.md @@ -13,18 +13,11 @@ jupyter: %load_ext autoreload %autoreload 2 import anndata -import logging - -class NoCategoricalWarningFilter(logging.Filter): - """suppress "storing XXX as categorical" warnings.""" - - def filter(self, record): - m = record.getMessage() - return not m.startswith("storing") and m.endswith("as categorical.") - - -anndata.logging.anndata_logger.addFilter(NoCategoricalWarningFilter) +anndata.logging.anndata_logger.addFilter( + lambda r: not r.getMessage().startswith("storing") + and r.getMessage().endswith("as categorical.") +) ``` ```python