Merge pull request #261 from icbi-lab/pin-anndata

Pin anndata v0.7.6 and fix issue with saving adata object
scverse · Apr 13, 2021 · ff6b048 · ff6b048
2 parents 6099f0c + e836cdc
commit ff6b048
Show file tree

Hide file tree

Showing 4 changed files with 17 additions and 9 deletions.
diff --git a/pyproject.toml b/pyproject.toml
@@ -24,7 +24,7 @@ classifiers = [
 ]
 requires-python = '>= 3.7'
 requires = [
-    'anndata>=0.7.3',
+    'anndata>=0.7.6',
     'scanpy>=1.6.0',
     'pandas>=1.0',
     'numpy>=1.17.0',

diff --git a/scirpy/io/_convert_anndata.py b/scirpy/io/_convert_anndata.py
@@ -8,6 +8,7 @@
 from typing import Collection, Iterable, List, Optional
 from .. import __version__
 import numpy as np
+from pandas.api.types import is_object_dtype
 
 
 def _sanitize_anndata(adata: AnnData) -> None:
@@ -17,12 +18,14 @@ def _sanitize_anndata(adata: AnnData) -> None:
         len(adata.X.shape) == 2
     ), "X needs to have dimensions, otherwise concat doesn't work. "
 
-    CATEGORICAL_COLS = ("locus", "v_call", "d_call", "j_call", "c_call")
-
     # Pending updates to anndata to properly handle boolean columns.
     # For now, let's turn them into a categorical with "True/False"
     BOOLEAN_COLS = ("has_ir", "is_cell", "multi_chain", "high_confidence", "productive")
 
+    # explicitly convert those to categoricals. All IR_ columns that are strings
+    # will be converted to categoricals, too
+    CATEGORICAL_COLS = ("extra_chains",)
+
     # Sanitize has_ir column into categorical
     # This should always be a categorical with True / False
     for col in adata.obs.columns:
@@ -34,13 +37,14 @@ def _sanitize_anndata(adata: AnnData) -> None:
                 ],
                 categories=["True", "False", "None"],
             )
-
-    # Turn other columns into categorical
-    for col in adata.obs.columns:
-        if col.endswith(CATEGORICAL_COLS):
+        elif col.endswith(CATEGORICAL_COLS) or (
+            col.startswith("IR_") and is_object_dtype(adata.obs[col])
+        ):
+            # Turn the explicitly listed CATEGORICAL_COLS and all IR_ columns of
+            # object dtype (e.g. strings) into categoricals; otherwise saving the
+            # AnnData object doesn't work.
             adata.obs[col] = pd.Categorical(adata.obs[col])
 
-    adata._sanitize()
+    adata.strings_to_categoricals()
 
 
 @_doc_params(doc_working_model=doc_working_model)

diff --git a/scirpy/io/_io.py b/scirpy/io/_io.py
@@ -20,7 +20,7 @@
 from pathlib import Path
 import airr
 from ..util import _doc_params, _is_true, _is_true2, _translate_dna_to_protein
-from ._convert_anndata import from_airr_cells, to_airr_cells
+from ._convert_anndata import from_airr_cells, to_airr_cells, _sanitize_anndata
 from ._util import doc_working_model, _IOLogger, _check_upgrade_schema
 from .._compat import Literal
 from airr import RearrangementSchema
@@ -623,6 +623,7 @@ def upgrade_schema(adata) -> None:
     adata.obs.rename(columns=rename_dict, inplace=True)
     adata.obs["extra_chains"] = None
     adata.uns["scirpy_version"] = __version__
+    _sanitize_anndata(adata)
 
 
 @_check_upgrade_schema()

diff --git a/scirpy/tests/test_io.py b/scirpy/tests/test_io.py
@@ -375,16 +375,19 @@ def test_read_airr():
         anndata.obs.loc[anndata.obs["IR_VJ_1_locus"] == "TRA", tra_cols],
         anndata_tra.obs.loc[:, tra_cols],
         check_categorical=False,  # categories differ, obviously
+        check_dtype=False,
     )
     pdt.assert_frame_equal(
         anndata.obs.loc[anndata.obs["IR_VDJ_1_locus"] == "TRB", trb_cols],
         anndata_trb.obs.loc[:, trb_cols],
         check_categorical=False,  # categories differ, obviously
+        check_dtype=False,
     )
     pdt.assert_frame_equal(
         anndata.obs.loc[anndata.obs["IR_VDJ_1_locus"] == "IGH", ig_cols],
         anndata_ig.obs.loc[:, ig_cols],
         check_categorical=False,  # categories differ, obviously
+        check_dtype=False,
     )
 
     # test some fundamental values