Merge pull request #259 from icbi-lab/update-docs

Update documentation to reflect changes to IO module
scverse · Apr 12, 2021 · 6099f0c · 6099f0c
2 parents fa5f602 + 8187b26
commit 6099f0c
Show file tree

Hide file tree

Showing 8 changed files with 70 additions and 46 deletions.
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
@@ -9,12 +9,18 @@ on:
 jobs:
   test:
     if: "!contains(github.event.head_commit.message, 'skip ci')"
-    runs-on: ${{ matrix.os }}
+    runs-on: ${{ matrix.config.os }}
     strategy:
       fail-fast: false
       matrix:
-        python-version: [3.7, 3.8, 3.9]
-        os: [ubuntu-latest, macos-latest, windows-latest]
+        config:
+          - { python-version: 3.7, os: ubuntu-latest }
+          - { python-version: 3.8, os: ubuntu-latest }
+          - { python-version: 3.9, os: ubuntu-latest }
+          # 3.8 is enough for macos and windows. For 3.9 essential wheels are still missing and building
+          # from source is very painful, especially on windows.
+          - { python-version: 3.8, os: macos-latest }
+          - { python-version: 3.8, os: windows-latest }
 
     steps:
       - uses: actions/checkout@v2
@@ -26,18 +32,19 @@ jobs:
           key: ${{ runner.os }}-pip-${{ hashFiles('pyproject.toml') }}
           restore-keys: |
             ${{ runner.os }}-pip-
+      # The HDF5 system requirements are necessary until pytables provides a wheel for python 3.9
       - name: Install Ubuntu system dependencies
-        if: matrix.os == 'ubuntu-latest'
+        if: matrix.config.os == 'ubuntu-latest'
         run: |
           sudo apt-get install libhdf5-serial-dev
       - name: Install macOS system dependencies
-        if: matrix.os == 'macos-latest'
+        if: matrix.config.os == 'macos-latest'
         run: |
           brew install cairo pkg-config autoconf automake libtool
-      - name: Set up Python ${{ matrix.python-version }}
+      - name: Set up Python ${{ matrix.config.python-version }}
         uses: actions/setup-python@v1
         with:
-          python-version: ${{ matrix.python-version }}
+          python-version: ${{ matrix.config.python-version }}
       - name: Install dependencies
         # TODO the separate numpy installation is to workaround an issue with dandelion's skbio dependency
         run: |

diff --git a/docs/api.rst b/docs/api.rst
@@ -16,18 +16,23 @@ as closely as possible.
 Input/Output: `io`
 ------------------
 
+.. module:: scirpy.io
+
 .. note::
    In scirpy v0.7.0 the way VDJ data is stored in `adata.obs` has changed to 
    be fully compliant with the `AIRR Rearrangement <https://docs.airr-community.org/en/latest/datarep/rearrangements.html#productive>`__ 
    schema. Please use :func:`~scirpy.io.upgrade_schema` to make `AnnData` objects
    from previous scirpy versions compatible with the most recent scirpy workflow. 
 
+   .. autosummary::
+      :toctree: ./generated
+
+      upgrade_schema
+
 
 The following functions allow importing :term:`V(D)J` information from various
 formats.
 
-.. module:: scirpy.io
-
 .. autosummary::
    :toctree: ./generated
 
@@ -36,8 +41,14 @@ formats.
    read_tracer
    read_bracer
    read_airr
-   write_airr
    from_dandelion
+
+Scirpy can export data to the following formats:
+
+.. autosummary::
+   :toctree: ./generated
+
+   write_airr
    to_dandelion
 
 To convert your own formats into the scirpy :ref:`data-structure`, we recommend building
@@ -51,7 +62,6 @@ For more details, check the :ref:`Data loading tutorial <importing-data>`.
    AirrCell
    from_airr_cells
    to_airr_cells
-   upgrade_schema
 
 
 Preprocessing: `pp`

diff --git a/docs/glossary.rst b/docs/glossary.rst
@@ -206,10 +206,12 @@ Glossary
         page about our :ref:`IR model<receptor-model>`.
 
     AIRR
-        Adaptive Immune Receptor Repertoire.
-        See also the `AIRR community <https://www.antibodysociety.org/the-airr-community/>`_.
+        Adaptive Immune Receptor Repertoire. Within the Scirpy documentation, we simply 
+        speak of :term:`immune receptors (IR)<IR>`.
 
-        Within the Scirpy documentation, we simply speak of :term:`immune receptors (IR)<IR>`.
+        The `AIRR community <https://www.antibodysociety.org/the-airr-community/>`_ 
+        defines standards around AIRR data. Scirpy supports the `AIRR Rearrangement <https://docs.airr-community.org/en/latest/datarep/rearrangements.html>`_
+        schema and complies with the `AIRR Software Guidelines <https://docs.airr-community.org/en/latest/swtools/airr_swtools_standard.html>`_.
 
     Chain locus
         Scirpy supports all valid `IMGT locus names <http://www.imgt.org/IMGTScientificChart/Nomenclature/IMGTnomenclature.html>`_:

diff --git a/docs/tutorials/tutorial_3k_tcr.md b/docs/tutorials/tutorial_3k_tcr.md
@@ -231,9 +231,6 @@ ax = ir.pl.group_abundance(adata, groupby="chain_pairing", target_col="source")
 
 ## Define clonotypes and clonotype clusters
 
-<!-- TODO explain that there are different values for dual_ir -->
-
-
 <!-- #raw raw_mimetype="text/restructuredtext" -->
 .. warning::
 
@@ -277,6 +274,8 @@ The function :func:`scirpy.tl.define_clonotypes` matches cells based on the dist
 detects connected modules in the graph and annotates them as clonotypes. This will add a `clone_id` and
 `clone_id_size` column to `adata.obs`.
 
+The `dual_ir` parameter defines how scirpy handles cells with :term:`more than one pair of receptors <Dual IR>`. The default value is `any`, which implies that cells in which any of the primary or secondary receptor chains match are considered to be of the same clonotype.
+
 Here, we define :term:`clonotypes <Clonotype>` based on nt-sequence identity.
 In a later step, we will define :term:`clonotype clusters <Clonotype cluster>` based on
 amino-acid similarity.

diff --git a/docs/tutorials/tutorial_io.md b/docs/tutorials/tutorial_io.md
@@ -12,7 +12,15 @@ jupyter:
 ```python
 %load_ext autoreload
 %autoreload 2
+import anndata
+
+anndata.logging.anndata_logger.addFilter(
+    lambda r: not r.getMessage().startswith("storing")
+    and r.getMessage().endswith("as categorical.")
+)
+```
 
+```python
 import scirpy as ir
 import scanpy as sc
 from glob import glob
@@ -21,14 +29,6 @@ import tarfile
 import anndata
 import warnings
 
-# from numba import NumbaPerformanceWarning
-
-# # ignore numba performance warnings
-# warnings.filterwarnings("ignore", category=NumbaPerformanceWarning)
-
-# suppress "storing XXX as categorical" warnings.
-anndata.logging.anndata_logger.setLevel("ERROR")
-
 sc.set_figure_params(figsize=(4, 4))
 sc.settings.verbosity = 2  # verbosity: errors (0), warnings (1), info (2), hints (3)
 ```
@@ -46,20 +46,23 @@ AnnData and how Scirpy makes use of it, check out the :ref:`data structure <data
 The example data used in this notebook are available from the
 `Scirpy repository <https://github.com/icbi-lab/scirpy/tree/master/docs/tutorials/example_data>`__.
 
-
 .. important:: **The Scirpy data model**
 
     Currently, the Scirpy data model has the following constraints:
 
      * BCR and TCR chains are supported. Chain loci must be valid :term:`Chain locus`,
        i.e. one of `TRA`, `TRG`, `IGK`, or `IGL` (chains with a :term:`VJ<V(D)J>` junction) or
-       `TRB`, `TRD`, or `IGH` (chains with a :term:`VDJ<V(D)J>` junction). Other chains are discarded.
-     * Non-productive chains are removed. *CellRanger*, *TraCeR*, and the *AIRR rearrangment format*
+       `TRB`, `TRD`, or `IGH` (chains with a :term:`VDJ<V(D)J>` junction). 
+     * Each cell can contain up to two `VJ` and two `VDJ` chains (:term:`Dual IR`).
+       Excess chains are ignored (those with lowest read count/:term:`UMI` count)
+       and cells flagged as :term:`Multichain-cell`.
+     * Non-productive chains are ignored. *CellRanger*, *TraCeR*, and the *AIRR rearrangement format*
        flag these cells appropriately. When reading :ref:`custom formats <importing-custom-formats>`,
        you need to pass the flag explicitly or filter the chains beforehand.
-     * Each chain can contain up to two `VJ` and two `VDJ` chains (:term:`Dual IR`).
-       Excess chains are removed (those with lowest read count/:term:`UMI` count)
-       and cells flagged as :term:`Multichain-cell`.
+     * Excess chains, non-productive chains, or chains with invalid loci
+       are serialized to JSON and stored in the `extra_chains` column. They are not 
+       used by scirpy except when exporting the `AnnData` object to :term:`AIRR` format. 
+     
 
     For more information, see :ref:`receptor-model`.
 

diff --git a/docs/usage-principles.rst b/docs/usage-principles.rst
@@ -22,7 +22,7 @@ Scirpy is an extension to `Scanpy <https://scanpy.readthedocs.io>`_ and adheres
  * The :class:`~anndata.AnnData` instance is modified inplace, unless the function
    is called with the keyword argument `inplace=False`.
 
-We decided to handle a few minor points differenlty to Scanpy:
+We decided to handle a few minor points differently to Scanpy:
 
  * Plotting functions with inexpensive computations (e.g. :func:`scirpy.pl.clonal_expansion`)
    call the corresponding tool (:func:`scirpy.tl.clonal_expansion`) on-the-fly and

diff --git a/scirpy/io/_io.py b/scirpy/io/_io.py
@@ -339,9 +339,7 @@ def read_airr(
     include_fields: Optional[Collection[str]] = DEFAULT_AIRR_FIELDS,
 ) -> AnnData:
     """\
-    Read AIRR-compliant data.
-
-    Reads data organized in the `AIRR rearrangement schema <https://docs.airr-community.org/en/latest/datarep/rearrangements.html>`_.
+    Read data from `AIRR rearrangement <https://docs.airr-community.org/en/latest/datarep/rearrangements.html>`_ format.
 
     The following columns are required by scirpy: 
      * `cell_id`
@@ -558,7 +556,7 @@ def read_bracer(path: Union[str, Path]) -> AnnData:
 
 @_check_upgrade_schema()
 def write_airr(adata: AnnData, filename: Union[str, Path]) -> None:
-    """Write immune receptor fields from `adata.obs` in AIRR Rearrangement TSV format.
+    """Export :term:`IR` data to :term:`AIRR` Rearrangement `tsv` format.
 
     Parameters
     ----------
@@ -618,19 +616,18 @@ def upgrade_schema(adata) -> None:
                 "j_gene": "j_call",
                 "c_gene": "c_call",
                 "cdr3_nt": "junction",
-                "clonotype": "clone_id",
             }.items(),
         )
     }
+    rename_dict["clonotype"] = "clone_id"
     adata.obs.rename(columns=rename_dict, inplace=True)
     adata.obs["extra_chains"] = None
     adata.uns["scirpy_version"] = __version__
 
 
 @_check_upgrade_schema()
 def to_dandelion(adata: AnnData):
-    """
-    Convert a scirpy-initialized AnnData object to Dandelion format using `to_ir_objs`.
+    """Export data to `Dandelion <https://github.com/zktuong/dandelion>`_ (:cite:`Stephenson2021`).
 
     Parameters
     ----------
@@ -640,7 +637,6 @@ def to_dandelion(adata: AnnData):
     Returns
     -------
     `Dandelion` object.
-
     """
     try:
         import dandelion as ddl
@@ -664,8 +660,12 @@ def to_dandelion(adata: AnnData):
     return ddl.Dandelion(ddl.load_data(data))
 
 
+@_doc_params(doc_working_model=doc_working_model)
 def from_dandelion(dandelion, transfer=False) -> AnnData:
-    """Import data from dandelion (:cite:`Stephenson2021`).
+    """\
+    Import data from `Dandelion <https://github.com/zktuong/dandelion>`_ (:cite:`Stephenson2021`).
+
+    {doc_working_model}
 
     Parameters
     ----------

diff --git a/scirpy/io/_util.py b/scirpy/io/_util.py
@@ -7,12 +7,15 @@
 
 .. note::
     Reading data into *Scirpy* has the following constraints:
-     * each cell can have up to four chains (:term:`Dual IR`):
-       two :term:`VJ<V(D)J>` and two :term:`VDJ<V(D)J>` chains.
-     * Excess chains are removed (those with lowest read count/:term:`UMI` count)
+     * Each cell can have up to four productive chains (:term:`Dual IR`):
+       two :term:`VJ<V(D)J>` and two :term:`VDJ<V(D)J>` chains. 
+     * Excess chains are ignored (those with lowest read count/:term:`UMI` count)
        and cells flagged as :term:`Multichain-cell`.
-     * non-productive chains are removed
-     * chain loci must be :term:`IGMT locus names<Chain locus>`.
+     * Non-productive chains are ignored. 
+     * Chain loci must be valid :term:`IMGT locus names<Chain locus>`.
+     * Excess chains, non-productive chains, or chains with invalid loci
+       are serialized to JSON and stored in the `extra_chains` column. They are not 
+       used by scirpy except when exporting the `AnnData` object to AIRR format. 
 
     For more information, see :ref:`receptor-model`.
 """