diff --git a/docs/release-notes/1.10.2.md b/docs/release-notes/1.10.2.md index 9ea0e0bdc1..9cc5547d63 100644 --- a/docs/release-notes/1.10.2.md +++ b/docs/release-notes/1.10.2.md @@ -16,6 +16,7 @@ ``` * Compatibility with `matplotlib` 3.9 {pr}`2999` {smaller}`I Virshup` +* Add clear errors where `backed` mode-like matrices (i.e., from `sparse_dataset`) are not supported {pr}`3048` {smaller}`I gold` ```{rubric} Performance ``` diff --git a/scanpy/_utils/__init__.py b/scanpy/_utils/__init__.py index 20282a9819..f28acf2e89 100644 --- a/scanpy/_utils/__init__.py +++ b/scanpy/_utils/__init__.py @@ -22,6 +22,7 @@ from typing import TYPE_CHECKING, overload from weakref import WeakSet +import h5py import numpy as np from anndata import __version__ as anndata_version from packaging.version import Version @@ -33,6 +34,13 @@ from .._settings import settings from .compute.is_constant import is_constant # noqa: F401 +if Version(anndata_version) >= Version("0.10.0"): + from anndata._core.sparse_dataset import ( + BaseCompressedSparseDataset as SparseDataset, + ) +else: + from anndata._core.sparse_dataset import SparseDataset + if TYPE_CHECKING: from collections.abc import Mapping from pathlib import Path @@ -1084,3 +1092,14 @@ def _resolve_axis( if axis in {1, "var"}: return (1, "var") raise ValueError(f"`axis` must be either 0, 1, 'obs', or 'var', was {axis!r}") + + +def is_backed_type(X: object) -> bool: + return isinstance(X, (SparseDataset, h5py.File, h5py.Dataset)) + + +def raise_not_implemented_error_if_backed_type(X: object, method_name: str) -> None: + if is_backed_type(X): + raise NotImplementedError( + f"{method_name} is not implemented for matrices of type {type(X)}" + ) diff --git a/scanpy/preprocessing/_pca.py b/scanpy/preprocessing/_pca.py index a38aeaed76..e0b09b7c87 100644 --- a/scanpy/preprocessing/_pca.py +++ b/scanpy/preprocessing/_pca.py @@ -16,7 +16,7 @@ from .. import logging as logg from .._compat import DaskArray, pkg_version from .._settings import settings -from .._utils import _doc_params, _empty +from .._utils import _doc_params, _empty, is_backed_type from ..get import _check_mask, _get_obs_rep from ._docs import doc_mask_var_hvg from ._utils import _get_mean_var @@ -173,6 +173,10 @@ def pca( ) data_is_AnnData = isinstance(data, AnnData) if data_is_AnnData: + if layer is None and not chunked and is_backed_type(data.X): + raise NotImplementedError( + f"PCA is not implemented for matrices of type {type(data.X)} with chunked as False" + ) adata = data.copy() if copy else data else: if pkg_version("anndata") < Version("0.8.0rc1"): @@ -195,7 +199,10 @@ def pca( logg.info(f" with n_comps={n_comps}") X = _get_obs_rep(adata_comp, layer=layer) - + if is_backed_type(X) and layer is not None: + raise NotImplementedError( + f"PCA is not implemented for matrices of type {type(X)} from layers" + ) # See: https://github.com/scverse/scanpy/pull/2816#issuecomment-1932650529 if ( Version(ad.__version__) < Version("0.9") diff --git a/scanpy/preprocessing/_scale.py b/scanpy/preprocessing/_scale.py index 5f7f526aea..edd9843c59 100644 --- a/scanpy/preprocessing/_scale.py +++ b/scanpy/preprocessing/_scale.py @@ -15,6 +15,7 @@ from .._utils import ( _check_array_function_arguments, axis_mul_or_truediv, + raise_not_implemented_error_if_backed_type, renamed_arg, view_to_actual, ) @@ -298,6 +299,7 @@ def scale_anndata( mask_obs = _check_mask(adata, mask_obs, "obs") view_to_actual(adata) X = _get_obs_rep(adata, layer=layer, obsm=obsm) + raise_not_implemented_error_if_backed_type(X, "scale") X, adata.var[str_mean_std[0]], adata.var[str_mean_std[1]] = scale( X, zero_center=zero_center, diff --git a/scanpy/preprocessing/_simple.py b/scanpy/preprocessing/_simple.py index d4d00003ef..1e61b60ace 100644 --- a/scanpy/preprocessing/_simple.py +++ b/scanpy/preprocessing/_simple.py @@ -23,6 +23,8 @@ from .._utils import ( _check_array_function_arguments, axis_sum, + is_backed_type, + raise_not_implemented_error_if_backed_type, renamed_arg, sanitize_anndata, view_to_actual, @@ -145,6 +147,7 @@ def filter_cells( "`min_genes`, `max_counts`, `max_genes` per call." ) if isinstance(data, AnnData): + raise_not_implemented_error_if_backed_type(data.X, "filter_cells") adata = data.copy() if copy else data cell_subset, number = materialize_as_ndarray( filter_cells( @@ -260,6 +263,7 @@ def filter_genes( ) if isinstance(data, AnnData): + raise_not_implemented_error_if_backed_type(data.X, "filter_genes") adata = data.copy() if copy else data gene_subset, number = materialize_as_ndarray( filter_genes( @@ -405,10 +409,19 @@ def log1p_anndata( raise NotImplementedError( "Currently cannot perform chunked operations on arrays not stored in X." ) + if adata.isbacked and adata.file._filemode != "r+": + raise NotImplementedError( + "log1p is not implemented for backed AnnData with backed mode not r+" + ) for chunk, start, end in adata.chunked_X(chunk_size): adata.X[start:end] = log1p(chunk, base=base, copy=False) else: X = _get_obs_rep(adata, layer=layer, obsm=obsm) + if is_backed_type(X): + msg = f"log1p is not implemented for matrices of type {type(X)}" + if layer is not None: + raise NotImplementedError(f"{msg} from layers") + raise NotImplementedError(f"{msg} without `chunked=True`") X = log1p(X, copy=False, base=base) _set_obs_rep(adata, X, layer=layer, obsm=obsm) @@ -647,6 +660,7 @@ def regress_out( keys = [keys] X = _get_obs_rep(adata, layer=layer) + raise_not_implemented_error_if_backed_type(X, "regress_out") if issparse(X): logg.info(" sparse input is densified and may " "lead to high memory use") @@ -855,6 +869,7 @@ def downsample_counts( `adata.X` : :class:`numpy.ndarray` | :class:`scipy.sparse.spmatrix` (dtype `float`) Downsampled counts matrix. """ + raise_not_implemented_error_if_backed_type(adata.X, "downsample_counts") # This logic is all dispatch total_counts_call = total_counts is not None counts_per_cell_call = counts_per_cell is not None diff --git a/scanpy/tests/test_backed.py b/scanpy/tests/test_backed.py new file mode 100644 index 0000000000..787edf9c21 --- /dev/null +++ b/scanpy/tests/test_backed.py @@ -0,0 +1,98 @@ +from __future__ import annotations + +from functools import partial + +import pytest +from anndata import read_h5ad + +import scanpy as sc + + +@pytest.mark.parametrize( + ("name", "func", "msg"), + [ + pytest.param("PCA", sc.pp.pca, " with chunked as False", id="pca"), + pytest.param( + "PCA", partial(sc.pp.pca, layer="X_copy"), " from layers", id="pca_layer" + ), + pytest.param( + "regress_out", + partial(sc.pp.regress_out, keys=["n_counts", "percent_mito"]), + "", + id="regress_out", + ), + pytest.param( + "dendrogram", partial(sc.tl.dendrogram, groupby="cat"), "", id="dendrogram" + ), + pytest.param("tsne", sc.tl.tsne, "", id="tsne"), + pytest.param("scale", sc.pp.scale, "", id="scale"), + pytest.param( + "downsample_counts", + partial(sc.pp.downsample_counts, counts_per_cell=1000), + "", + id="downsample_counts", + ), + pytest.param( + "filter_genes", + partial(sc.pp.filter_genes, max_cells=1000), + "", + id="filter_genes", + ), + pytest.param( + "filter_cells", + partial(sc.pp.filter_cells, max_genes=1000), + "", + id="filter_cells", + ), + pytest.param( + "rank_genes_groups", + partial(sc.tl.rank_genes_groups, groupby="cat"), + "", + id="rank_genes_groups", + ), + pytest.param( + "score_genes", + partial(sc.tl.score_genes, gene_list=map(str, range(100))), + "", + id="score_genes", + ), + ], +) +def test_backed_error(backed_adata, name, func, msg): + with pytest.raises( + NotImplementedError, + match=f"{name} is not implemented for matrices of type {type(backed_adata.X)}{msg}", + ): + func(backed_adata) + + +def test_log1p_backed_errors(backed_adata): + with pytest.raises( + NotImplementedError, + match="log1p is not implemented for backed AnnData with backed mode not r+", + ): + sc.pp.log1p(backed_adata, chunked=True) + backed_adata.file.close() + backed_adata = read_h5ad(backed_adata.filename, backed="r+") + with pytest.raises( + NotImplementedError, + match=f"log1p is not implemented for matrices of type {type(backed_adata.X)} without `chunked=True`", + ): + sc.pp.log1p(backed_adata) + backed_adata.layers["X_copy"] = backed_adata.X + layer_type = type(backed_adata.layers["X_copy"]) + with pytest.raises( + NotImplementedError, + match=f"log1p is not implemented for matrices of type {layer_type} from layers", + ): + sc.pp.log1p(backed_adata, layer="X_copy") + backed_adata.file.close() + + +def test_scatter_backed(backed_adata): + sc.pp.pca(backed_adata, chunked=True) + sc.pl.scatter(backed_adata, color="0", basis="pca") + + +def test_dotplot_backed(backed_adata): + sc.pl.dotplot(backed_adata, ["0", "1", "2", "3"], groupby="cat") diff --git a/scanpy/tests/test_ingest.py b/scanpy/tests/test_ingest.py index 1a307a9602..2c57342de1 100644 --- a/scanpy/tests/test_ingest.py +++ b/scanpy/tests/test_ingest.py @@ -1,5 +1,6 @@ from __future__ import annotations +import anndata import numpy as np import pytest from sklearn.neighbors import KDTree @@ -153,3 +154,19 @@ def test_ingest_map_embedding_umap(): umap_transformed_t = reducer.transform(T) assert np.allclose(ing._obsm["X_umap"], umap_transformed_t) + + +def test_ingest_backed(adatas, tmp_path): + adata_ref = adatas[0].copy() + adata_new = adatas[1].copy() + + adata_new.write_h5ad(f"{tmp_path}/new.h5ad") + + adata_new = anndata.read_h5ad(f"{tmp_path}/new.h5ad", backed="r") + + ing = sc.tl.Ingest(adata_ref) + with pytest.raises( + NotImplementedError, + match=f"Ingest.fit is not implemented for matrices of type {type(adata_new.X)}", + ): + ing.fit(adata_new) diff --git a/scanpy/tools/_dendrogram.py b/scanpy/tools/_dendrogram.py index 4ffdb553e7..f60f0ae2e9 100644 --- a/scanpy/tools/_dendrogram.py +++ b/scanpy/tools/_dendrogram.py @@ -11,7 +11,7 @@ from .. import logging as logg from .._compat import old_positionals -from .._utils import _doc_params +from .._utils import _doc_params, raise_not_implemented_error_if_backed_type from ..neighbors._doc import doc_n_pcs, doc_use_rep from ._utils import _choose_representation @@ -117,6 +117,8 @@ def dendrogram( >>> markers = ['C1QA', 'PSAP', 'CD79A', 'CD79B', 'CST3', 'LYZ'] >>> sc.pl.dotplot(adata, markers, groupby='bulk_labels', dendrogram=True) """ + + raise_not_implemented_error_if_backed_type(adata.X, "dendrogram") if isinstance(groupby, str): # if not a list, turn into a list groupby = [groupby] diff --git a/scanpy/tools/_ingest.py b/scanpy/tools/_ingest.py index 0066cdef8a..3d05937d81 100644 --- a/scanpy/tools/_ingest.py +++ b/scanpy/tools/_ingest.py @@ -12,7 +12,7 @@ from .. import logging as logg from .._compat import old_positionals, pkg_version from .._settings import settings -from .._utils import NeighborsView +from .._utils import NeighborsView, raise_not_implemented_error_if_backed_type from .._utils._doctests import doctest_skip from ..neighbors import FlatTree @@ -392,6 +392,7 @@ def fit(self, adata_new): `adata` refers to the :class:`~anndata.AnnData` object that is passed during the initialization of an Ingest instance. """ + raise_not_implemented_error_if_backed_type(adata_new.X, "Ingest.fit") ref_var_names = self._adata_ref.var_names.str.upper() new_var_names = adata_new.var_names.str.upper() diff --git a/scanpy/tools/_rank_genes_groups.py b/scanpy/tools/_rank_genes_groups.py index 0180faedf2..1fc2727091 100644 --- a/scanpy/tools/_rank_genes_groups.py +++ b/scanpy/tools/_rank_genes_groups.py @@ -12,7 +12,10 @@ from .. import _utils from .. import logging as logg from .._compat import old_positionals -from .._utils import check_nonnegative_integers +from .._utils import ( + check_nonnegative_integers, + raise_not_implemented_error_if_backed_type, +) from ..get import _check_mask from ..preprocessing._utils import _get_mean_var @@ -134,6 +137,7 @@ def __init__( if use_raw and adata.raw is not None: adata_comp = adata.raw X = adata_comp.X + raise_not_implemented_error_if_backed_type(X, "rank_genes_groups") # for correct getnnz calculation if issparse(X): @@ -594,7 +598,6 @@ def rank_genes_groups( >>> # to visualize the results >>> sc.pl.rank_genes_groups(adata) """ - if mask_var is not None: mask_var = _check_mask(adata, mask_var, "var") diff --git a/scanpy/tools/_score_genes.py b/scanpy/tools/_score_genes.py index b27269c8aa..c5be974a99 100644 --- a/scanpy/tools/_score_genes.py +++ b/scanpy/tools/_score_genes.py @@ -8,7 +8,7 @@ import pandas as pd from scipy.sparse import issparse -from scanpy._utils import _check_use_raw +from scanpy._utils import _check_use_raw, is_backed_type from .. import logging as logg from .._compat import old_positionals @@ -110,6 +110,10 @@ def score_genes( start = logg.info(f"computing score {score_name!r}") adata = adata.copy() if copy else adata use_raw = _check_use_raw(adata, use_raw) + if is_backed_type(adata.X) and not use_raw: + raise NotImplementedError( + f"score_genes is not implemented for matrices of type {type(adata.X)}" + ) if random_state is not None: np.random.seed(random_state) diff --git a/scanpy/tools/_tsne.py b/scanpy/tools/_tsne.py index ff8346c3f7..1e3ade92e6 100644 --- a/scanpy/tools/_tsne.py +++ b/scanpy/tools/_tsne.py @@ -8,7 +8,7 @@ from .. import logging as logg from .._compat import old_positionals from .._settings import settings -from .._utils import _doc_params +from .._utils import _doc_params, raise_not_implemented_error_if_backed_type from ..neighbors._doc import doc_n_pcs, doc_use_rep from ._utils import _choose_representation @@ -106,6 +106,7 @@ def tsne( start = logg.info("computing tSNE") adata = adata.copy() if copy else adata X = _choose_representation(adata, use_rep=use_rep, n_pcs=n_pcs) + raise_not_implemented_error_if_backed_type(X, "tsne") # params for sklearn n_jobs = settings.n_jobs if n_jobs is None else n_jobs params_sklearn = dict( diff --git a/src/testing/scanpy/_pytest/fixtures/__init__.py b/src/testing/scanpy/_pytest/fixtures/__init__.py index e225ace290..16c3f8a4df 100644 --- a/src/testing/scanpy/_pytest/fixtures/__init__.py +++ b/src/testing/scanpy/_pytest/fixtures/__init__.py @@ -13,6 +13,7 @@ from .data import ( _pbmc3ks_parametrized_session, + backed_adata, pbmc3k_parametrized, pbmc3k_parametrized_small, ) @@ -27,6 +28,7 @@ "_pbmc3ks_parametrized_session", "pbmc3k_parametrized", "pbmc3k_parametrized_small", + "backed_adata", ] diff --git a/src/testing/scanpy/_pytest/fixtures/data.py b/src/testing/scanpy/_pytest/fixtures/data.py index 24919aea65..3d907c9fec 100644 --- a/src/testing/scanpy/_pytest/fixtures/data.py +++ b/src/testing/scanpy/_pytest/fixtures/data.py @@ -7,12 +7,29 @@ import numpy as np import pytest +from anndata import AnnData, read_h5ad +from anndata import __version__ as anndata_version +from packaging.version import Version from scipy import sparse +if Version(anndata_version) >= Version("0.10.0"): + from anndata._core.sparse_dataset import ( + BaseCompressedSparseDataset as SparseDataset, + ) + from anndata.experimental import sparse_dataset + + def make_sparse(x): + return sparse_dataset(x) +else: + from anndata._core.sparse_dataset import SparseDataset + + def make_sparse(x): + return SparseDataset(x) + + if TYPE_CHECKING: from collections.abc import Callable - from anndata import AnnData from numpy.typing import DTypeLike @@ -43,6 +60,37 @@ def pbmc3k_parametrized_small(_pbmc3ks_parametrized_session) -> Callable[[], Ann return _pbmc3ks_parametrized_session[True].copy +@pytest.fixture( + scope="session", + params=[np.random.randn, lambda *x: sparse.random(*x, format="csr")], + ids=["sparse", "dense"], +) +# worker_id for xdist since we don't want to override open files +def backed_adata( + request: pytest.FixtureRequest, + tmp_path_factory: pytest.TempPathFactory, + worker_id: str = "serial", +) -> AnnData: + tmp_path = tmp_path_factory.mktemp("backed_adata") + rand_func = request.param + tmp_path = tmp_path / f"test_{rand_func.__name__}_{worker_id}.h5ad" + X = rand_func(200, 10).astype(np.float32) + cat = np.random.randint(0, 3, (X.shape[0],)).ravel() + adata = AnnData(X, obs={"cat": cat}) + adata.obs["percent_mito"] = np.random.rand(X.shape[0]) + adata.obs["n_counts"] = X.sum(axis=1) + adata.obs["cat"] = adata.obs["cat"].astype("category") + adata.layers["X_copy"] = adata.X[...] + adata.write_h5ad(tmp_path) + adata = read_h5ad(tmp_path, backed="r") + adata.layers["X_copy"] = ( + make_sparse(adata.file["X"]) + if isinstance(adata.X, SparseDataset) + else adata.file["X"] + ) + return adata + + def _prepare_pbmc_testdata( adata: AnnData, sparsity_func: Callable[ @@ -63,7 +111,6 @@ def _prepare_pbmc_testdata( small False (default) returns full data, True returns small subset of the data. """ - import scanpy as sc if small: