From 4578d63d7e6c7962e8203d50e3be6eb2a5a43612 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 19 Feb 2021 15:59:56 +1100 Subject: [PATCH 1/6] Cleanup normalize_total --- scanpy/preprocessing/_normalization.py | 91 ++++++++++++++------------ 1 file changed, 50 insertions(+), 41 deletions(-) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index 2d745634f6..f5d13c2297 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -1,4 +1,5 @@ from typing import Optional, Union, Iterable, Dict +from warnings import warn import numpy as np from anndata import AnnData @@ -8,6 +9,7 @@ from .. import logging as logg from .._compat import Literal from .._utils import view_to_actual +from scanpy.get import _get_obs_rep, _set_obs_rep def _normalize_data(X, counts, after=None, copy=False): @@ -31,6 +33,7 @@ def normalize_total( exclude_highly_expressed: bool = False, max_fraction: float = 0.05, key_added: Optional[str] = None, + layer: Optional[str] = None, layers: Union[Literal['all'], Iterable[str]] = None, layer_norm: Optional[str] = None, inplace: bool = True, @@ -72,20 +75,8 @@ def normalize_total( key_added Name of the field in `adata.obs` where the normalization factor is stored. - layers - List of layers to normalize. Set to `'all'` to normalize all layers. - layer_norm - Specifies how to normalize layers: - - * If `None`, after normalization, for each layer in *layers* each cell - has a total count equal to the median of the *counts_per_cell* before - normalization of the layer. - * If `'after'`, for each layer in *layers* each cell has - a total count equal to `target_sum`. - * If `'X'`, for each layer in *layers* each cell has a total count - equal to the median of total counts for observations (cells) of - `adata.X` before normalization. - + layer + Layer to normalize instead of `X`. If `None`, `X` is normalized. inplace Whether to update `adata` or return dictionary with normalized copies of `adata.X` and `adata.layers`. @@ -130,6 +121,22 @@ def normalize_total( if max_fraction < 0 or max_fraction > 1: raise ValueError('Choose max_fraction between 0 and 1.') + # Deprecated features + if layers is not None: + warn( + FutureWarning( + "The `layers` argument is deprecated. Instead, specify individual " + "layers to normalize with `layer`." + ) + ) + if layer_norm is not None: + warn( + FutureWarning( + "The `layer_norm` argument is deprecated. Specify the target size " + "factor directly with `target_sum`." + ) + ) + if layers == 'all': layers = adata.layers.keys() elif isinstance(layers, str): @@ -139,32 +146,47 @@ def normalize_total( view_to_actual(adata) + X = _get_obs_rep(adata, layer=layer) + gene_subset = None msg = 'normalizing counts per cell' if exclude_highly_expressed: - counts_per_cell = adata.X.sum(1) # original counts per cell + counts_per_cell = X.sum(1) # original counts per cell counts_per_cell = np.ravel(counts_per_cell) # at least one cell as more than max_fraction of counts per cell - gene_subset = (adata.X > counts_per_cell[:, None] * max_fraction).sum(0) + + gene_subset = (X > counts_per_cell[:, None] * max_fraction).sum(0) gene_subset = np.ravel(gene_subset) == 0 msg += ( ' The following highly-expressed genes are not considered during ' f'normalization factor computation:\n{adata.var_names[~gene_subset].tolist()}' ) + counts_per_cell = X[:, gene_subset].sum(1) + else: + counts_per_cell = X.sum(1) start = logg.info(msg) - - # counts per cell for subset, if max_fraction!=1 - X = adata.X if gene_subset is None else adata[:, gene_subset].X - counts_per_cell = X.sum(1) - # get rid of adata view - counts_per_cell = np.ravel(counts_per_cell).copy() + counts_per_cell = np.ravel(counts_per_cell) cell_subset = counts_per_cell > 0 if not np.all(cell_subset): - logg.warning('Some cells have total count of genes equal to zero') + warn(UserWarning('Some cells have zero counts')) + + if inplace: + if key_added is not None: + adata.obs[key_added] = counts_per_cell + _set_obs_rep( + adata, _normalize_data(X, counts_per_cell, target_sum), layer=layer + ) + else: + # not recarray because need to support sparse + dat = dict( + X=_normalize_data(X, counts_per_cell, target_sum, copy=True), + norm_factor=counts_per_cell, + ) + # Deprecated features if layer_norm == 'after': after = target_sum elif layer_norm == 'X': @@ -173,26 +195,13 @@ def normalize_total( after = None else: raise ValueError('layer_norm should be "after", "X" or None') - del cell_subset - if inplace: - if key_added is not None: - adata.obs[key_added] = counts_per_cell - adata.X = _normalize_data(adata.X, counts_per_cell, target_sum) - else: - # not recarray because need to support sparse - dat = dict( - X=_normalize_data(adata.X, counts_per_cell, target_sum, copy=True), - norm_factor=counts_per_cell, + for layer_to_norm in layers if layers is not None else (): + res = normalize_total( + adata, layer=layer_to_norm, target_sum=after, inplace=inplace ) - - for layer_name in layers or (): - layer = adata.layers[layer_name] - counts = np.ravel(layer.sum(1)) - if inplace: - adata.layers[layer_name] = _normalize_data(layer, counts, after) - else: - dat[layer_name] = _normalize_data(layer, counts, after, copy=True) + if not inplace: + dat[layer_to_norm] = res["X"] logg.info( ' finished ({time_passed})', From 4d34bf0d3980675aff57284237dd4c2d4d6363f8 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 2 Mar 2021 13:43:40 +1100 Subject: [PATCH 2/6] Add modification tests and copy kwarg for normalize_total --- scanpy/preprocessing/_normalization.py | 13 +++++++- scanpy/tests/helpers.py | 44 ++++++++++++++++---------- scanpy/tests/test_normalization.py | 12 ++++++- 3 files changed, 51 insertions(+), 18 deletions(-) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index f5d13c2297..badd3d835c 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -37,6 +37,7 @@ def normalize_total( layers: Union[Literal['all'], Iterable[str]] = None, layer_norm: Optional[str] = None, inplace: bool = True, + copy: bool = False, ) -> Optional[Dict[str, np.ndarray]]: """\ Normalize counts per cell. @@ -80,6 +81,8 @@ def normalize_total( inplace Whether to update `adata` or return dictionary with normalized copies of `adata.X` and `adata.layers`. + copy + Whether to modify copied input object. Not compatible with inplace=False. Returns ------- @@ -118,6 +121,11 @@ def normalize_total( [ 0.5, 0.5, 0.5, 1. , 1. ], [ 0.5, 11. , 0.5, 1. , 1. ]], dtype=float32) """ + if copy: + if not inplace: + raise ValueError() + adata = adata.copy() + if max_fraction < 0 or max_fraction > 1: raise ValueError('Choose max_fraction between 0 and 1.') @@ -212,4 +220,7 @@ def normalize_total( f'and added {key_added!r}, counts per cell before normalization (adata.obs)' ) - return dat if not inplace else None + if copy: + return adata + elif not inplace: + return dat diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py index 35253bad69..a8bf6ec088 100644 --- a/scanpy/tests/helpers.py +++ b/scanpy/tests/helpers.py @@ -2,6 +2,8 @@ This file contains helper functions for the scanpy test suite. """ +from itertools import permutations + import scanpy as sc import numpy as np @@ -13,26 +15,36 @@ # These functions can be used to check that functions are correctly using arugments like `layers`, `obsm`, etc. -def check_rep_mutation(func, X, **kwargs): +def check_rep_mutation(func, X, fields=["layer", "obsm"], **kwargs): """Check that only the array meant to be modified is modified.""" - adata = sc.AnnData( - X=X.copy(), - layers={"layer": X.copy()}, - obsm={"obsm": X.copy()}, - dtype=X.dtype, - ) + adata = sc.AnnData(X=X.copy(), dtype=X.dtype) + for field in fields: + sc.get._set_obs_rep(adata, X, **{field: field}) + X_array = asarray(X) + adata_X = func(adata, copy=True, **kwargs) - adata_layer = func(adata, layer="layer", copy=True, **kwargs) - adata_obsm = func(adata, obsm="obsm", copy=True, **kwargs) + adatas_proc = { + field: func(adata, copy=True, **{field: field}, **kwargs) for field in fields + } - assert np.array_equal(asarray(adata_X.X), asarray(adata_layer.layers["layer"])) - assert np.array_equal(asarray(adata_X.X), asarray(adata_obsm.obsm["obsm"])) + # Modified fields + for field in fields: + result_array = asarray( + sc.get._get_obs_rep(adatas_proc[field], **{field: field}) + ) + np.testing.assert_array_equal(asarray(adata_X.X), result_array) - assert np.array_equal(asarray(adata_layer.X), asarray(adata_layer.obsm["obsm"])) - assert np.array_equal(asarray(adata_obsm.X), asarray(adata_obsm.layers["layer"])) - assert np.array_equal( - asarray(adata_X.layers["layer"]), asarray(adata_X.obsm["obsm"]) - ) + # Unmodified fields + for field in fields: + np.testing.assert_array_equal(X_array, asarray(adatas_proc[field].X)) + np.testing.assert_array_equal( + X_array, asarray(sc.get._get_obs_rep(adata_X, **{field: field})) + ) + for field_a, field_b in permutations(fields, 2): + result_array = asarray( + sc.get._get_obs_rep(adatas_proc[field_a], **{field_b: field_b}) + ) + np.testing.assert_array_equal(X_array, result_array) def check_rep_results(func, X, **kwargs): diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index 9b84699d9a..a815025449 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -2,9 +2,11 @@ import numpy as np from anndata import AnnData from scipy.sparse import csr_matrix +from scipy import sparse import scanpy as sc -from anndata.tests.helpers import assert_equal +from scanpy.tests.helpers import check_rep_mutation +from anndata.tests.helpers import assert_equal, asarray X_total = [[1, 0], [3, 0], [5, 6]] X_frac = [[1, 0, 1], [3, 0, 1], [5, 6, 1]] @@ -24,6 +26,14 @@ def test_normalize_total(typ, dtype): assert np.allclose(np.ravel(adata.X[:, 1:3].sum(axis=1)), [1.0, 1.0, 1.0]) +@pytest.mark.parametrize('typ', [asarray, csr_matrix], ids=lambda x: x.__name__) +@pytest.mark.parametrize('dtype', ['float32', 'int64']) +def test_normalize_total_rep(typ, dtype): + # Test that layer kwarg works + X = typ(sparse.random(100, 50, format="csr", density=0.2, dtype=dtype)) + check_rep_mutation(sc.pp.normalize_total, X, fields=["layer"]) + + @pytest.mark.parametrize('typ', [np.array, csr_matrix], ids=lambda x: x.__name__) @pytest.mark.parametrize('dtype', ['float32', 'int64']) def test_normalize_total_layers(typ, dtype): From 01288605eda5c77885be4aa5c644ea9ae03dd611 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 2 Mar 2021 14:04:26 +1100 Subject: [PATCH 3/6] Test that 'layers' argument is deprecated --- scanpy/tests/test_normalization.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index a815025449..ce38f703ba 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -39,7 +39,8 @@ def test_normalize_total_rep(typ, dtype): def test_normalize_total_layers(typ, dtype): adata = AnnData(typ(X_total), dtype=dtype) adata.layers["layer"] = adata.X.copy() - sc.pp.normalize_total(adata, layers=["layer"]) + with pytest.warns(FutureWarning, match=r".*layers.*deprecated"): + sc.pp.normalize_total(adata, layers=["layer"]) assert np.allclose(adata.layers["layer"].sum(axis=1), [3.0, 3.0, 3.0]) From 8c5fd98f5ac3d3b2c493013a78591d6bf6f76512 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 3 Mar 2021 15:11:13 +1100 Subject: [PATCH 4/6] Added more mutation checks for normalize_total --- scanpy/tests/helpers.py | 52 ++++++++++++++++-------------- scanpy/tests/test_normalization.py | 3 +- 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/scanpy/tests/helpers.py b/scanpy/tests/helpers.py index a8bf6ec088..61fc35e23e 100644 --- a/scanpy/tests/helpers.py +++ b/scanpy/tests/helpers.py @@ -9,13 +9,16 @@ from anndata.tests.helpers import asarray, assert_equal +# TODO: Report more context on the fields being compared on error +# TODO: Allow specifying paths to ignore on comparison + ########################### # Representation choice ########################### # These functions can be used to check that functions are correctly using arugments like `layers`, `obsm`, etc. -def check_rep_mutation(func, X, fields=["layer", "obsm"], **kwargs): +def check_rep_mutation(func, X, *, fields=["layer", "obsm"], **kwargs): """Check that only the array meant to be modified is modified.""" adata = sc.AnnData(X=X.copy(), dtype=X.dtype) for field in fields: @@ -47,35 +50,36 @@ def check_rep_mutation(func, X, fields=["layer", "obsm"], **kwargs): np.testing.assert_array_equal(X_array, result_array) -def check_rep_results(func, X, **kwargs): +def check_rep_results(func, X, *, fields=["layer", "obsm"], **kwargs): """Checks that the results of a computation add values/ mutate the anndata object in a consistent way.""" # Gen data - adata_X = sc.AnnData( - X=X.copy(), - layers={"layer": np.zeros(shape=X.shape, dtype=X.dtype)}, - obsm={"obsm": np.zeros(shape=X.shape, dtype=X.dtype)}, - ) - adata_layer = sc.AnnData( - X=np.zeros(shape=X.shape, dtype=X.dtype), - layers={"layer": X.copy()}, - obsm={"obsm": np.zeros(shape=X.shape, dtype=X.dtype)}, - ) - adata_obsm = sc.AnnData( - X=np.zeros(shape=X.shape, dtype=X.dtype), - layers={"layer": np.zeros(shape=X.shape, dtype=X.dtype)}, - obsm={"obsm": X.copy()}, + empty_X = np.zeros(shape=X.shape, dtype=X.dtype) + adata = sc.AnnData( + X=empty_X.copy(), + layers={"layer": empty_X.copy()}, + obsm={"obsm": empty_X.copy()}, ) + adata_X = adata.copy() + adata_X.X = X.copy() + + adatas_proc = {} + for field in fields: + cur = adata.copy() + sc.get._set_obs_rep(cur, X.copy(), **{field: field}) + adatas_proc[field] = cur + # Apply function func(adata_X, **kwargs) - func(adata_layer, layer="layer", **kwargs) - func(adata_obsm, obsm="obsm", **kwargs) + for field in fields: + func(adatas_proc[field], **{field: field}, **kwargs) # Reset X - adata_X.X = np.zeros(shape=X.shape, dtype=X.dtype) - adata_layer.layers["layer"] = np.zeros(shape=X.shape, dtype=X.dtype) - adata_obsm.obsm["obsm"] = np.zeros(shape=X.shape, dtype=X.dtype) + adata_X.X = empty_X.copy() + for field in fields: + sc.get._set_obs_rep(adatas_proc[field], empty_X.copy(), **{field: field}) - # Check equality - assert_equal(adata_X, adata_layer) - assert_equal(adata_X, adata_obsm) + for field_a, field_b in permutations(fields, 2): + assert_equal(adatas_proc[field_a], adatas_proc[field_b]) + for field in fields: + assert_equal(adata_X, adatas_proc[field]) diff --git a/scanpy/tests/test_normalization.py b/scanpy/tests/test_normalization.py index ce38f703ba..0f5dbb102d 100644 --- a/scanpy/tests/test_normalization.py +++ b/scanpy/tests/test_normalization.py @@ -5,7 +5,7 @@ from scipy import sparse import scanpy as sc -from scanpy.tests.helpers import check_rep_mutation +from scanpy.tests.helpers import check_rep_mutation, check_rep_results from anndata.tests.helpers import assert_equal, asarray X_total = [[1, 0], [3, 0], [5, 6]] @@ -32,6 +32,7 @@ def test_normalize_total_rep(typ, dtype): # Test that layer kwarg works X = typ(sparse.random(100, 50, format="csr", density=0.2, dtype=dtype)) check_rep_mutation(sc.pp.normalize_total, X, fields=["layer"]) + check_rep_results(sc.pp.normalize_total, X, fields=["layer"]) @pytest.mark.parametrize('typ', [np.array, csr_matrix], ids=lambda x: x.__name__) From e3ed02b192dd8f1ff30c055c92c4fa70be50d54a Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 3 Mar 2021 15:18:17 +1100 Subject: [PATCH 5/6] release note --- docs/release-notes/1.8.0.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/release-notes/1.8.0.rst b/docs/release-notes/1.8.0.rst index b98676558a..d7d709ecd0 100644 --- a/docs/release-notes/1.8.0.rst +++ b/docs/release-notes/1.8.0.rst @@ -5,6 +5,7 @@ - Switched to flit_ for building and deploying the package, a simple tool with an easy to understand command line interface and metadata. +- Added `layer` and `copy` kwargs to :func:`~scanpy.pp.normalize_total` :pr:`1667` :smaller:`I Virshup` .. _flit: https://flit.readthedocs.io/en/latest/ @@ -15,3 +16,5 @@ .. rubric:: Bug fixes .. rubric:: Deprecations + +- Deprecated `layers` and `layers_norm` kwargs to :func:`~scanpy.pp.normalize_total` :pr:`1667` :smaller:`I Virshup` From 6a9d3fc7643f6b728d37f5d6fc23dd8ad8707176 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 3 Mar 2021 15:35:11 +1100 Subject: [PATCH 6/6] Error message --- scanpy/preprocessing/_normalization.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpy/preprocessing/_normalization.py b/scanpy/preprocessing/_normalization.py index badd3d835c..542afd38ca 100644 --- a/scanpy/preprocessing/_normalization.py +++ b/scanpy/preprocessing/_normalization.py @@ -123,7 +123,7 @@ def normalize_total( """ if copy: if not inplace: - raise ValueError() + raise ValueError("`copy=True` cannot be used with `inplace=False`.") adata = adata.copy() if max_fraction < 0 or max_fraction > 1: