diff --git a/pyproject.toml b/pyproject.toml index 817011654..efede3d32 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -24,7 +24,7 @@ classifiers = [ ] requires-python = '>= 3.7' requires = [ - 'anndata>=0.7.3', + 'anndata>=0.7.6', 'scanpy>=1.6.0', 'pandas>=1.0', 'numpy>=1.17.0', diff --git a/scirpy/io/_convert_anndata.py b/scirpy/io/_convert_anndata.py index 0121bc576..0274fc8e0 100644 --- a/scirpy/io/_convert_anndata.py +++ b/scirpy/io/_convert_anndata.py @@ -8,6 +8,7 @@ from typing import Collection, Iterable, List, Optional from .. import __version__ import numpy as np +from pandas.api.types import is_object_dtype def _sanitize_anndata(adata: AnnData) -> None: @@ -17,12 +18,14 @@ def _sanitize_anndata(adata: AnnData) -> None: len(adata.X.shape) == 2 ), "X needs to have dimensions, otherwise concat doesn't work. " - CATEGORICAL_COLS = ("locus", "v_call", "d_call", "j_call", "c_call") - # Pending updates to anndata to properly handle boolean columns. # For now, let's turn them into a categorical with "True/False" BOOLEAN_COLS = ("has_ir", "is_cell", "multi_chain", "high_confidence", "productive") + # explicitly convert those to categoricals. All IR_ columns that are strings + # will be converted to categoricals, too + CATEGORICAL_COLS = ("extra_chains",) + # Sanitize has_ir column into categorical # This should always be a categorical with True / False for col in adata.obs.columns: @@ -34,13 +37,14 @@ def _sanitize_anndata(adata: AnnData) -> None: ], categories=["True", "False", "None"], ) - - # Turn other columns into categorical - for col in adata.obs.columns: - if col.endswith(CATEGORICAL_COLS): + elif col.endswith(CATEGORICAL_COLS) or ( + col.startswith("IR_") and is_object_dtype(adata.obs[col]) + ): + # Turn all IR_ columns that are of type string or object to categoricals + # otherwise saving anndata doesn't work. 
adata.obs[col] = pd.Categorical(adata.obs[col]) - adata._sanitize() + adata.strings_to_categoricals() @_doc_params(doc_working_model=doc_working_model) diff --git a/scirpy/io/_io.py b/scirpy/io/_io.py index 1f3060568..f23e0cbc9 100644 --- a/scirpy/io/_io.py +++ b/scirpy/io/_io.py @@ -20,7 +20,7 @@ from pathlib import Path import airr from ..util import _doc_params, _is_true, _is_true2, _translate_dna_to_protein -from ._convert_anndata import from_airr_cells, to_airr_cells +from ._convert_anndata import from_airr_cells, to_airr_cells, _sanitize_anndata from ._util import doc_working_model, _IOLogger, _check_upgrade_schema from .._compat import Literal from airr import RearrangementSchema @@ -623,6 +623,7 @@ def upgrade_schema(adata) -> None: adata.obs.rename(columns=rename_dict, inplace=True) adata.obs["extra_chains"] = None adata.uns["scirpy_version"] = __version__ + _sanitize_anndata(adata) @_check_upgrade_schema() diff --git a/scirpy/tests/test_io.py b/scirpy/tests/test_io.py index 01bf9a181..2434522ed 100644 --- a/scirpy/tests/test_io.py +++ b/scirpy/tests/test_io.py @@ -375,16 +375,19 @@ def test_read_airr(): anndata.obs.loc[anndata.obs["IR_VJ_1_locus"] == "TRA", tra_cols], anndata_tra.obs.loc[:, tra_cols], check_categorical=False, # categories differ, obviously + check_dtype=False, ) pdt.assert_frame_equal( anndata.obs.loc[anndata.obs["IR_VDJ_1_locus"] == "TRB", trb_cols], anndata_trb.obs.loc[:, trb_cols], check_categorical=False, # categories differ, obviously + check_dtype=False, ) pdt.assert_frame_equal( anndata.obs.loc[anndata.obs["IR_VDJ_1_locus"] == "IGH", ig_cols], anndata_ig.obs.loc[:, ig_cols], check_categorical=False, # categories differ, obviously + check_dtype=False, ) # test some fundamental values