From 5cd69a4445b323f6ede36960eae96c07bdcdf165 Mon Sep 17 00:00:00 2001
From: "Lumberbot (aka Jack)"
 <39504233+meeseeksmachine@users.noreply.github.com>
Date: Thu, 4 Jan 2024 14:35:06 +0100
Subject: [PATCH] Backport PR #1273 on branch 0.10.x (Fix IO error reporting)
 (#1281)

Co-authored-by: Philipp A <flying-sheep@web.de>
---
 .pre-commit-config.yaml              |  2 +
 anndata/_core/anndata.py             |  2 +-
 anndata/_io/h5ad.py                  | 19 ++++---
 anndata/_io/specs/registry.py        | 81 +++++++++++++---------------
 anndata/_io/utils.py                 | 63 ++++++++++++----------
 anndata/_io/zarr.py                  |  2 +-
 anndata/compat/__init__.py           | 17 +++++-
 anndata/experimental/_dispatch_io.py |  4 +-
 anndata/tests/test_awkward.py        | 13 ++---
 anndata/tests/test_io_utils.py       | 37 +++++++++----
 anndata/tests/test_readwrite.py      |  2 +-
 docs/release-notes/0.10.4.md         |  1 +
 pyproject.toml                       |  2 +-
 13 files changed, 140 insertions(+), 105 deletions(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f9626c3d4..2ce52a6d7 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -11,6 +11,8 @@ repos:
     rev: v4.0.0-alpha.8
     hooks:
       - id: prettier
+        exclude_types:
+          - markdown
   - repo: https://github.com/pre-commit/pre-commit-hooks
     rev: v4.5.0
     hooks:
diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py
index b37e33177..a2b010bdf 100644
--- a/anndata/_core/anndata.py
+++ b/anndata/_core/anndata.py
@@ -74,7 +74,7 @@ class StorageType(Enum):
     DaskArray = DaskArray
     CupyArray = CupyArray
     CupySparseMatrix = CupySparseMatrix
-    BackedSparseMAtrix = BaseCompressedSparseDataset
+    BackedSparseMatrix = BaseCompressedSparseDataset
 
     @classmethod
     def classes(cls):
diff --git a/anndata/_io/h5ad.py b/anndata/_io/h5ad.py
index 337f13f92..5f31da04a 100644
--- a/anndata/_io/h5ad.py
+++ b/anndata/_io/h5ad.py
@@ -6,6 +6,7 @@
 from types import MappingProxyType
 from typing import (
     TYPE_CHECKING,
+    Any,
     Callable,
     Literal,
     TypeVar,
@@ -112,7 +113,13 @@ def write_h5ad(
 
 @report_write_key_on_error
 @write_spec(IOSpec("array", "0.2.0"))
-def write_sparse_as_dense(f, key, value, dataset_kwargs=MappingProxyType({})):
+def write_sparse_as_dense(
+    f: h5py.Group,
+    key: str,
+    value: sparse.spmatrix | BaseCompressedSparseDataset,
+    *,
+    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
+):
     real_key = None  # Flag for if temporary key was used
     if key in f:
         if isinstance(value, BaseCompressedSparseDataset) and (
@@ -269,7 +276,7 @@ def callback(func, elem_name: str, elem, iospec):
 def _read_raw(
     f: h5py.File | AnnDataFileManager,
     as_sparse: Collection[str] = (),
-    rdasp: Callable[[h5py.Dataset], sparse.spmatrix] = None,
+    rdasp: Callable[[h5py.Dataset], sparse.spmatrix] | None = None,
     *,
     attrs: Collection[str] = ("X", "var", "varm"),
 ) -> dict:
@@ -286,7 +293,7 @@ def _read_raw(
 
 
 @report_read_key_on_error
-def read_dataframe_legacy(dataset) -> pd.DataFrame:
+def read_dataframe_legacy(dataset: h5py.Dataset) -> pd.DataFrame:
     """Read pre-anndata 0.7 dataframes."""
     warn(
         f"'{dataset.name}' was written with a very old version of AnnData. "
@@ -305,7 +312,7 @@ def read_dataframe_legacy(dataset) -> pd.DataFrame:
     return df
 
 
-def read_dataframe(group) -> pd.DataFrame:
+def read_dataframe(group: h5py.Group | h5py.Dataset) -> pd.DataFrame:
     """Backwards compat function"""
     if not isinstance(group, h5py.Group):
         return read_dataframe_legacy(group)
@@ -352,7 +359,7 @@ def read_dense_as_sparse(
         raise ValueError(f"Cannot read dense array as type: {sparse_format}")
 
 
-def read_dense_as_csr(dataset, axis_chunk=6000):
+def read_dense_as_csr(dataset: h5py.Dataset, axis_chunk: int = 6000):
     sub_matrices = []
     for idx in idx_chunks_along_axis(dataset.shape, 0, axis_chunk):
         dense_chunk = dataset[idx]
@@ -361,7 +368,7 @@ def read_dense_as_csr(dataset, axis_chunk=6000):
     return sparse.vstack(sub_matrices, format="csr")
 
 
-def read_dense_as_csc(dataset, axis_chunk=6000):
+def read_dense_as_csc(dataset: h5py.Dataset, axis_chunk: int = 6000):
     sub_matrices = []
     for idx in idx_chunks_along_axis(dataset.shape, 1, axis_chunk):
         sub_matrix = sparse.csc_matrix(dataset[idx])
diff --git a/anndata/_io/specs/registry.py b/anndata/_io/specs/registry.py
index 1f0b137f4..a8357295d 100644
--- a/anndata/_io/specs/registry.py
+++ b/anndata/_io/specs/registry.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from collections.abc import Callable, Iterable, Mapping
+from collections.abc import Mapping
 from dataclasses import dataclass
 from functools import singledispatch, wraps
 from types import MappingProxyType
@@ -10,12 +10,13 @@
 from anndata.compat import _read_attr
 
 if TYPE_CHECKING:
+    from collections.abc import Callable, Generator, Iterable
+
     from anndata._types import GroupStorageType, StorageType
 
+
 # TODO: This probably should be replaced by a hashable Mapping due to conversion b/w "_" and "-"
 # TODO: Should filetype be included in the IOSpec if it changes the encoding? Or does the intent that these things be "the same" overrule that?
-
-
 @dataclass(frozen=True)
 class IOSpec:
     encoding_type: str
@@ -25,7 +26,9 @@ class IOSpec:
 # TODO: Should this subclass from LookupError?
 class IORegistryError(Exception):
     @classmethod
-    def _from_write_parts(cls, dest_type, typ, modifiers) -> IORegistryError:
+    def _from_write_parts(
+        cls, dest_type: type, typ: type, modifiers: frozenset[str]
+    ) -> IORegistryError:
         msg = f"No method registered for writing {typ} into {dest_type}"
         if modifiers:
             msg += f" with {modifiers}"
@@ -36,7 +39,7 @@ def _from_read_parts(
         cls,
         method: str,
         registry: Mapping,
-        src_typ: StorageType,
+        src_typ: type[StorageType],
         spec: IOSpec,
     ) -> IORegistryError:
         # TODO: Improve error message if type exists, but version does not
@@ -50,7 +53,7 @@ def _from_read_parts(
 def write_spec(spec: IOSpec):
     def decorator(func: Callable):
         @wraps(func)
-        def wrapper(g, k, *args, **kwargs):
+        def wrapper(g: GroupStorageType, k: str, *args, **kwargs):
             result = func(g, k, *args, **kwargs)
             g[k].attrs.setdefault("encoding-type", spec.encoding_type)
             g[k].attrs.setdefault("encoding-version", spec.encoding_version)
@@ -193,12 +196,12 @@ def proc_spec(spec) -> IOSpec:
 
 
 @proc_spec.register(IOSpec)
-def proc_spec_spec(spec) -> IOSpec:
+def proc_spec_spec(spec: IOSpec) -> IOSpec:
     return spec
 
 
 @proc_spec.register(Mapping)
-def proc_spec_mapping(spec) -> IOSpec:
+def proc_spec_mapping(spec: Mapping[str, str]) -> IOSpec:
     return IOSpec(**{k.replace("-", "_"): v for k, v in spec.items()})
 
 
@@ -213,7 +216,9 @@ def get_spec(
     )
 
 
-def _iter_patterns(elem):
+def _iter_patterns(
+    elem,
+) -> Generator[tuple[type, type | str] | tuple[type, type, str], None, None]:
     """Iterates over possible patterns for an element in order of precedence."""
     from anndata.compat import DaskArray
 
@@ -236,40 +241,27 @@ def __init__(self, registry: IORegistry, callback: Callable | None = None) -> No
     def read_elem(
         self,
         elem: StorageType,
-        modifiers: frozenset(str) = frozenset(),
+        modifiers: frozenset[str] = frozenset(),
     ) -> Any:
         """Read an element from a store. See exported function for more details."""
         from functools import partial
 
-        read_func = self.registry.get_reader(
-            type(elem), get_spec(elem), frozenset(modifiers)
+        iospec = get_spec(elem)
+        read_func = partial(
+            self.registry.get_reader(type(elem), iospec, modifiers),
+            _reader=self,
         )
-        read_func = partial(read_func, _reader=self)
-        if self.callback is not None:
-            return self.callback(read_func, elem.name, elem, iospec=get_spec(elem))
-        else:
+        if self.callback is None:
             return read_func(elem)
+        return self.callback(read_func, elem.name, elem, iospec=iospec)
 
 
 class Writer:
-    def __init__(
-        self,
-        registry: IORegistry,
-        callback: Callable[
-            [
-                GroupStorageType,
-                str,
-                StorageType,
-                dict,
-            ],
-            None,
-        ]
-        | None = None,
-    ):
+    def __init__(self, registry: IORegistry, callback: Callable | None = None):
         self.registry = registry
         self.callback = callback
 
-    def find_writer(self, dest_type, elem, modifiers):
+    def find_writer(self, dest_type: type, elem, modifiers: frozenset[str]):
         for pattern in _iter_patterns(elem):
             if self.registry.has_writer(dest_type, pattern, modifiers):
                 return self.registry.get_writer(dest_type, pattern, modifiers)
@@ -281,10 +273,10 @@ def write_elem(
         self,
         store: GroupStorageType,
         k: str,
-        elem,
+        elem: Any,
         *,
-        dataset_kwargs=MappingProxyType({}),
-        modifiers=frozenset(),
+        dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
+        modifiers: frozenset[str] = frozenset(),
     ):
         from functools import partial
         from pathlib import PurePosixPath
@@ -313,17 +305,16 @@ def write_elem(
             _writer=self,
         )
 
-        if self.callback is not None:
-            return self.callback(
-                write_func,
-                store,
-                k,
-                elem,
-                dataset_kwargs=dataset_kwargs,
-                iospec=self.registry.get_spec(elem),
-            )
-        else:
+        if self.callback is None:
             return write_func(store, k, elem, dataset_kwargs=dataset_kwargs)
+        return self.callback(
+            write_func,
+            store,
+            k,
+            elem,
+            dataset_kwargs=dataset_kwargs,
+            iospec=self.registry.get_spec(elem),
+        )
 
 
 def read_elem(elem: StorageType) -> Any:
@@ -346,7 +337,7 @@ def write_elem(
     k: str,
     elem: Any,
     *,
-    dataset_kwargs: Mapping = MappingProxyType({}),
+    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
 ) -> None:
     """
     Write an element to a storage group using anndata encoding.
diff --git a/anndata/_io/utils.py b/anndata/_io/utils.py
index cd90be473..6f46ad7eb 100644
--- a/anndata/_io/utils.py
+++ b/anndata/_io/utils.py
@@ -1,15 +1,19 @@
 from __future__ import annotations
 
 from functools import wraps
-from typing import Callable, Literal
+from typing import TYPE_CHECKING, Callable, Literal, Union, cast
 from warnings import warn
 
 import h5py
 from packaging.version import Version
 
-from anndata.compat import H5Group, ZarrGroup, add_note
-
 from .._core.sparse_dataset import BaseCompressedSparseDataset
+from ..compat import H5Group, ZarrGroup, add_note, pairwise
+
+if TYPE_CHECKING:
+    from .._types import StorageType
+
+    Storage = Union[StorageType, BaseCompressedSparseDataset]
 
 # For allowing h5py v3
 # https://github.com/scverse/anndata/issues/442
@@ -151,31 +155,26 @@ class AnnDataReadError(OSError):
     pass
 
 
-def _get_parent(elem):
-    try:
-        import zarr
-    except ImportError:
-        zarr = None
-    if zarr and isinstance(elem, (zarr.Group, zarr.Array)):
-        parent = elem.store  # Not sure how to always get a name out of this
-    elif isinstance(elem, BaseCompressedSparseDataset):
-        parent = elem.group.file.name
-    else:
-        parent = elem.file.name
-    return parent
+def _get_display_path(store: Storage) -> str:
+    """Return an absolute path of an element (always starts with “/”)."""
+    if isinstance(store, BaseCompressedSparseDataset):
+        store = store.group
+    path = store.name or "??"  # can be None
+    return f'/{path.removeprefix("/")}'
 
 
-def add_key_note(e: BaseException, elem, key, op=Literal["read", "writ"]) -> None:
+def add_key_note(
+    e: BaseException, store: Storage, path: str, key: str, op: Literal["read", "writ"]
+) -> None:
     if any(
         f"Error raised while {op}ing key" in note
         for note in getattr(e, "__notes__", [])
     ):
         return
-    parent = _get_parent(elem)
-    add_note(
-        e,
-        f"Error raised while {op}ing key {key!r} of {type(elem)} to " f"{parent}",
-    )
+
+    dir = "to" if op == "writ" else "from"
+    msg = f"Error raised while {op}ing key {key!r} of {type(store)} {dir} {path}"
+    add_note(e, msg)
 
 
 def report_read_key_on_error(func):
@@ -198,13 +197,17 @@ def func_wrapper(*args, **kwargs):
         from anndata._io.specs import Reader
 
         # Figure out signature (method vs function) by going through args
-        for elem in args:
-            if not isinstance(elem, Reader):
+        for arg in args:
+            if not isinstance(arg, Reader):
+                store = cast("Storage", arg)
                 break
+        else:
+            raise ValueError("No element found in args.")
         try:
             return func(*args, **kwargs)
         except Exception as e:
-            add_key_note(e, elem, elem.name, "read")
+            path, key = _get_display_path(store).rsplit("/", 1)
+            add_key_note(e, store, path or "/", key, "read")
             raise
 
     return func_wrapper
@@ -230,15 +233,17 @@ def func_wrapper(*args, **kwargs):
         from anndata._io.specs import Writer
 
         # Figure out signature (method vs function) by going through args
-        for i in range(len(args)):
-            elem = args[i]
-            key = args[i + 1]
-            if not isinstance(elem, Writer):
+        for arg, key in pairwise(args):
+            if not isinstance(arg, Writer):
+                store = cast("Storage", arg)
                 break
+        else:
+            raise ValueError("No element found in args.")
         try:
             return func(*args, **kwargs)
         except Exception as e:
-            add_key_note(e, elem, key, "writ")
+            path = _get_display_path(store)
+            add_key_note(e, store, path, key, "writ")
             raise
 
     return func_wrapper
diff --git a/anndata/_io/zarr.py b/anndata/_io/zarr.py
index 00f9766f0..864475848 100644
--- a/anndata/_io/zarr.py
+++ b/anndata/_io/zarr.py
@@ -133,7 +133,7 @@ def read_dataframe_legacy(dataset: zarr.Array) -> pd.DataFrame:
 
 
 @report_read_key_on_error
-def read_dataframe(group) -> pd.DataFrame:
+def read_dataframe(group: zarr.Group | zarr.Array) -> pd.DataFrame:
     # Fast paths
     if isinstance(group, zarr.Array):
         return read_dataframe_legacy(group)
diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py
index 24f3875b3..39323d73a 100644
--- a/anndata/compat/__init__.py
+++ b/anndata/compat/__init__.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import os
+import sys
 from codecs import decode
 from collections.abc import Mapping
 from contextlib import AbstractContextManager
@@ -35,9 +36,9 @@ class Empty:
 #############################
 
 
-try:
+if sys.version_info >= (3, 11):
     from contextlib import chdir
-except ImportError:  # Python < 3.11
+else:
 
     @dataclass
     class chdir(AbstractContextManager):
@@ -52,6 +53,18 @@ def __exit__(self, *_exc_info) -> None:
             os.chdir(self._old_cwd.pop())
 
 
+if sys.version_info >= (3, 10):
+    from itertools import pairwise
+else:
+
+    def pairwise(iterable):
+        from itertools import tee
+
+        a, b = tee(iterable)
+        next(b, None)
+        return zip(a, b)
+
+
 #############################
 # Optional deps
 #############################
diff --git a/anndata/experimental/_dispatch_io.py b/anndata/experimental/_dispatch_io.py
index 4df4d417a..2a399d540 100644
--- a/anndata/experimental/_dispatch_io.py
+++ b/anndata/experimental/_dispatch_io.py
@@ -4,6 +4,8 @@
 from typing import TYPE_CHECKING, Any, Callable
 
 if TYPE_CHECKING:
+    from collections.abc import Mapping
+
     from anndata._io.specs import IOSpec
     from anndata._types import GroupStorageType, StorageType
 
@@ -55,7 +57,7 @@ def write_dispatched(
         None,
     ],
     *,
-    dataset_kwargs=MappingProxyType({}),
+    dataset_kwargs: Mapping[str, Any] = MappingProxyType({}),
 ) -> None:
     """
     Write elem to store, recursively calling callback at each sub-element.
diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py
index 9e780c8a8..2e996bfc6 100644
--- a/anndata/tests/test_awkward.py
+++ b/anndata/tests/test_awkward.py
@@ -1,7 +1,7 @@
 """Tests related to awkward arrays"""
 from __future__ import annotations
 
-from contextlib import nullcontext
+import warnings
 
 import numpy as np
 import numpy.testing as npt
@@ -381,15 +381,12 @@ def test_concat_mixed_types(key, arrays, expected, join):
         to_concat.append(tmp_adata)
 
     if isinstance(expected, type) and issubclass(expected, Exception):
-        ctx = (
-            pytest.warns(
+        with pytest.raises(expected), warnings.catch_warnings():
+            warnings.filterwarnings(
+                "ignore",
+                r"The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
                 FutureWarning,
-                match=r"The behavior of DataFrame concatenation with empty or all-NA entries is deprecated",
             )
-            if any(df.empty for df in arrays if isinstance(df, pd.DataFrame))
-            else nullcontext()
-        )
-        with pytest.raises(expected), ctx:
             anndata.concat(to_concat, axis=axis, join=join)
     else:
         result_adata = anndata.concat(to_concat, axis=axis, join=join)
diff --git a/anndata/tests/test_io_utils.py b/anndata/tests/test_io_utils.py
index c70091474..803a4ad72 100644
--- a/anndata/tests/test_io_utils.py
+++ b/anndata/tests/test_io_utils.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
-from contextlib import suppress
+from contextlib import AbstractContextManager, suppress
+from typing import TYPE_CHECKING
 
 import h5py
 import pandas as pd
@@ -9,13 +10,15 @@
 
 import anndata as ad
 from anndata._io.specs.registry import IORegistryError
-from anndata._io.utils import (
-    report_read_key_on_error,
-)
+from anndata._io.utils import report_read_key_on_error
 from anndata.compat import _clean_uns
 from anndata.experimental import read_elem, write_elem
 from anndata.tests.helpers import pytest_8_raises
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from pathlib import Path
+
 
 @pytest.fixture(params=["h5ad", "zarr"])
 def diskfmt(request):
@@ -29,20 +32,32 @@ def diskfmt(request):
         pytest.param(lambda p: h5py.File(p / "test.h5", mode="a"), id="h5py"),
     ],
 )
-def test_key_error(tmp_path, group_fn):
+@pytest.mark.parametrize("nested", [True, False], ids=["nested", "root"])
+def test_key_error(
+    tmp_path, group_fn: Callable[[Path], zarr.Group | h5py.Group], nested: bool
+):
     @report_read_key_on_error
     def read_attr(_):
         raise NotImplementedError()
 
     group = group_fn(tmp_path)
-    with group if hasattr(group, "__enter__") else suppress():
+    with group if isinstance(group, AbstractContextManager) else suppress():
+        if nested:
+            group = group.create_group("nested")
+            path = "/nested"
+        else:
+            path = "/"
         group["X"] = [1, 2, 3]
         group.create_group("group")
 
-        with pytest_8_raises(NotImplementedError, match=r"/X"):
+        with pytest_8_raises(
+            NotImplementedError, match=rf"reading key 'X'.*from {path}$"
+        ):
             read_attr(group["X"])
 
-        with pytest_8_raises(NotImplementedError, match=r"/group"):
+        with pytest_8_raises(
+            NotImplementedError, match=rf"reading key 'group'.*from {path}$"
+        ):
             read_attr(group["group"])
 
 
@@ -53,7 +68,9 @@ def test_write_error_info(diskfmt, tmp_path):
     # Assuming we don't define a writer for tuples
     a = ad.AnnData(uns={"a": {"b": {"c": (1, 2, 3)}}})
 
-    with pytest_8_raises(IORegistryError, match=r"Error raised while writing key 'c'"):
+    with pytest_8_raises(
+        IORegistryError, match=r"Error raised while writing key 'c'.*to /uns/a/b"
+    ):
         write(a)
 
 
@@ -89,7 +106,7 @@ class Foo:
     # (?!...) is a negative lookahead
     # (?s) enables the dot to match newlines
     # https://stackoverflow.com/a/406408/130164 <- copilot suggested lol
-    pattern = r"(?s)((?!Error raised while writing key '/?a').)*$"
+    pattern = r"(?s)^((?!Error raised while writing key '/?a').)*$"
 
     with pytest_8_raises(IORegistryError, match=pattern):
         write_elem(group, "/", {"a": {"b": Foo()}})
diff --git a/anndata/tests/test_readwrite.py b/anndata/tests/test_readwrite.py
index 22b1aaffc..67341fe57 100644
--- a/anndata/tests/test_readwrite.py
+++ b/anndata/tests/test_readwrite.py
@@ -277,7 +277,7 @@ def test_read_full_io_error(tmp_path, name, read, write):
         store["obs"].attrs["encoding-type"] = "invalid"
     with pytest_8_raises(
         IORegistryError,
-        match=r"raised while reading key '/obs'",
+        match=r"raised while reading key 'obs'.*from /$",
     ) as exc_info:
         read(path)
     assert re.search(
diff --git a/docs/release-notes/0.10.4.md b/docs/release-notes/0.10.4.md
index 2739f5c7f..ee1560ebe 100644
--- a/docs/release-notes/0.10.4.md
+++ b/docs/release-notes/0.10.4.md
@@ -7,6 +7,7 @@
 * `adata[:, []]` now returns an `AnnData` object empty on the appropriate dimensions instead of erroring {pr}`1243` {user}`ilan-gold`
 * `adata.X[mask]` works in newer `numpy` versions when `X` is `backed` {pr}`1255` {user}`ilan-gold`
 * `adata.X[...]` fixed for `X` as a `BaseCompressedSparseDataset` with `zarr` backend {pr}`1265` {user}`ilan-gold`
+* Improve read/write error reporting {pr}`1273` {user}`flying-sheep`
 
 ```{rubric} Documentation
 ```
diff --git a/pyproject.toml b/pyproject.toml
index 7f4f1e6de..e8a6bac71 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -80,7 +80,7 @@ doc = [
 ]
 test = [
     "loompy>=3.0.5",
-    "pytest>=6.0",
+    "pytest >=6.0, !=8.0.0rc1", # https://github.com/pytest-dev/pytest/issues/11759
     "pytest-cov>=2.10",
     "zarr",
     "matplotlib",