diff --git a/cellxgene_schema_cli/cellxgene_schema/utils.py b/cellxgene_schema_cli/cellxgene_schema/utils.py index 5b0e8b0f..9de83d6c 100644 --- a/cellxgene_schema_cli/cellxgene_schema/utils.py +++ b/cellxgene_schema_cli/cellxgene_schema/utils.py @@ -115,10 +115,12 @@ def getattr_anndata(adata: ad.AnnData, attr: str = None): return None else: return getattr(adata, attr) - + + import anndata as ad from anndata.experimental import read_elem, read_dispatched, sparse_dataset + def read_backed(f): def callback(func, elem_name: str, elem, iospec): if "layers" in elem_name or ("X" in elem_name and "X_" not in elem_name): @@ -131,7 +133,8 @@ def callback(func, elem_name: str, elem, iospec): return read_elem(elem) elif iospec.encoding_type == "array" and len(elem.shape) > 1: return elem - else: func(elem) + else: + func(elem) else: return func(elem) diff --git a/cellxgene_schema_cli/cellxgene_schema/validate.py b/cellxgene_schema_cli/cellxgene_schema/validate.py index 4aa81adc..367949b0 100644 --- a/cellxgene_schema_cli/cellxgene_schema/validate.py +++ b/cellxgene_schema_cli/cellxgene_schema/validate.py @@ -394,7 +394,9 @@ def _chunk_matrix( if start < n: yield (matrix[start:n], start, n) - def _count_matrix_nonzero(self, matrix_name: str, matrix: Union[np.ndarray, sparse.spmatrix]) -> int: + def _count_matrix_nonzero( + self, matrix_name: str, matrix: Union[np.ndarray, sparse.spmatrix], filter_by_column: pd.Series = None + ) -> int: if matrix_name in self.number_non_zero: return self.number_non_zero[matrix_name] @@ -403,6 +405,8 @@ def _count_matrix_nonzero(self, matrix_name: str, matrix: Union[np.ndarray, spar nnz = 0 matrix_format = get_matrix_format(self.adata, matrix) for matrix_chunk, _, _ in self._chunk_matrix(matrix): + if filter_by_column is not None: + matrix_chunk = matrix_chunk[:, filter_by_column] nnz += matrix_chunk.count_nonzero() if matrix_format != "dense" else np.count_nonzero(matrix_chunk) self.number_non_zero[matrix_name] = nnz @@ -424,21 +428,7 @@ def _validate_column_feature_is_filtered(self, column: pd.Series, column_name: s return if sum(column) > 0: - n_nonzero = 0 - - X_format = get_matrix_format(self.adata, self.adata.X) - if X_format in SPARSE_MATRIX_TYPES: - n_nonzero = self.adata.X[:, column].count_nonzero() - - elif X_format == "dense": - n_nonzero = np.count_nonzero(self.adata.X[:, column]) - - else: - self.errors.append( - f"X matrix is of type {type(self.adata.X)}, validation of 'feature_is_filtered' " - f"cannot be completed." - ) - + n_nonzero = self._count_matrix_nonzero("feature_is_filtered", self.adata.X, column) if n_nonzero > 0: self.errors.append( f"Some features are 'True' in '{column_name}' of dataframe '{df_name}', but there are " diff --git a/cellxgene_schema_cli/tests/test_utils.py b/cellxgene_schema_cli/tests/test_utils.py index 69d8808e..4a88a515 100644 --- a/cellxgene_schema_cli/tests/test_utils.py +++ b/cellxgene_schema_cli/tests/test_utils.py @@ -155,4 +155,3 @@ def test_read_h5ad(self): h5ad_path = h5ad_valid adata = read_h5ad(h5ad_path) assert isinstance(adata, AnnData) - assert adata.isbacked