From 4d344dc11edd3d792681b2b863d03d8c9ec11996 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 13 Nov 2019 16:22:17 +1100 Subject: [PATCH] Workaround umap<0.4 and increase numerical stability of gearys_c * Work around https://github.com/lmcinnes/umap/issues/306 by not calling out to kernel function. That code has been kept, but commented out. * Increase numerical stability by casting data to system width. Tests were failing due to instability. --- scanpy/metrics/_gearys_c.py | 124 ++++++++++++++++++++++++++++------- scanpy/tests/test_metrics.py | 15 +++++ 2 files changed, 116 insertions(+), 23 deletions(-) diff --git a/scanpy/metrics/_gearys_c.py b/scanpy/metrics/_gearys_c.py index 09b0be538e..e2606f8cf1 100644 --- a/scanpy/metrics/_gearys_c.py +++ b/scanpy/metrics/_gearys_c.py @@ -1,4 +1,3 @@ -# TODO: Calling this after UMAP has imported will hang, pending https://github.com/lmcinnes/umap/issues/306 from typing import Optional, Union from anndata import AnnData @@ -36,6 +35,16 @@ def _choose_obs_rep(adata, *, use_raw=False, layer=None, obsm=None, obsp=None): ############################################################################### # Calculation ############################################################################### +# Some notes on the implementation: +# * This could be phrased as tensor multiplication. However that does not get +# parallelized, which boosts performance almost linearly with cores. +# * Due to the umap setting the default threading backend, a parallel numba +# function that calls another parallel numba function can get stuck. This +# ends up meaning code re-use will be limited until umap 0.4. +# See: https://github.com/lmcinnes/umap/issues/306 +# * There can be a fair amount of numerical instability here (big reductions), +# so data is cast to float64. Removing these casts/ conversion will cause the +# tests to fail. @numba.njit(cache=True, parallel=True) @@ -48,6 +57,7 @@ def _gearys_c_vec(data, indices, indptr, x): def _gearys_c_vec_W(data, indices, indptr, x, W): N = len(indptr) - 1 x_bar = x.mean() + x = x.astype(np.float_) total = 0.0 for i in numba.prange(N): @@ -69,27 +79,90 @@ def _gearys_c_mtx_csr( ): M, N = x_shape W = g_data.sum() - out = np.zeros(M, dtype=np.float64) + out = np.zeros(M, dtype=np.float_) for k in numba.prange(M): - x_arr = np.zeros(N, dtype=x_data.dtype) + x_k = np.zeros(N, dtype=np.float_) sk = slice(x_indptr[k], x_indptr[k + 1]) - x_arr[x_indices[sk]] = x_data[sk] - outval = _gearys_c_vec_W(g_data, g_indices, g_indptr, x_arr, W) - out[k] = outval + x_k_data = x_data[sk] + x_k[x_indices[sk]] = x_k_data + x_k_bar = np.sum(x_k_data) / N + total = 0.0 + for i in numba.prange(N): + s = slice(g_indptr[i], g_indptr[i + 1]) + i_indices = g_indices[s] + i_data = g_data[s] + total += np.sum(i_data * ((x_k[i] - x_k[i_indices]) ** 2)) + numer = (N - 1) * total + # Expanded from 2 * W * ((x_k - x_k_bar) ** 2).sum(), but uses sparsity + # to skip some calculations + denom = ( + 2 + * W + * ( + np.sum(x_k_data ** 2) + - np.sum(x_k_data * x_k_bar * 2) + + (x_k_bar ** 2) * N + ) + ) + C = numer / denom + out[k] = C return out +# Simplified implementation, hits race condition after umap import due to numba +# parallel backend +# @numba.njit(cache=True, parallel=True) +# def _gearys_c_mtx_csr( +# g_data, g_indices, g_indptr, x_data, x_indices, x_indptr, x_shape +# ): +# M, N = x_shape +# W = g_data.sum() +# out = np.zeros(M, dtype=np.float64) +# for k in numba.prange(M): +# x_arr = np.zeros(N, dtype=x_data.dtype) +# sk = slice(x_indptr[k], x_indptr[k + 1]) +# x_arr[x_indices[sk]] = x_data[sk] +# outval = _gearys_c_vec_W(g_data, g_indices, g_indptr, x_arr, W) +# out[k] = outval +# return out + + @numba.njit(cache=True, parallel=True) def _gearys_c_mtx(g_data, g_indices, g_indptr, X): M, N = X.shape W = g_data.sum() - out = np.zeros(M, dtype=np.float64) + out = np.zeros(M, dtype=np.float_) for k in numba.prange(M): - outval = _gearys_c_vec_W(g_data, g_indices, g_indptr, X[k, :], W) - out[k] = outval + x = X[k, :].astype(np.float_) + x_bar = x.mean() + + total = 0.0 + for i in numba.prange(N): + s = slice(g_indptr[i], g_indptr[i + 1]) + i_indices = g_indices[s] + i_data = g_data[s] + total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2)) + + numer = (N - 1) * total + denom = 2 * W * ((x - x_bar) ** 2).sum() + C = numer / denom + + out[k] = C return out +# Similar to above, simplified version umaps choice of parallel backend breaks: +# @numba.njit(cache=True, parallel=True) +# def _gearys_c_mtx(g_data, g_indices, g_indptr, X): +# M, N = X.shape +# W = g_data.sum() +# out = np.zeros(M, dtype=np.float64) +# for k in numba.prange(M): +# outval = _gearys_c_vec_W(g_data, g_indices, g_indptr, X[k, :], W) +# out[k] = outval +# return out + + ############################################################################### # Interface ############################################################################### @@ -97,50 +170,52 @@ def _gearys_c_mtx(g_data, g_indices, g_indptr, X): @dispatch(sparse.csr_matrix, sparse.csr_matrix) def gearys_c(g, vals) -> np.ndarray: - assert g.shape[0] == g.shape[1] + assert g.shape[0] == g.shape[1], "`g` should be a square adjacency matrix" assert g.shape[0] == vals.shape[1] return _gearys_c_mtx_csr( - g.data, + g.data.astype(np.float_, copy=False), g.indices, g.indptr, - vals.data, + vals.data.astype(np.float_, copy=False), vals.indices, vals.indptr, vals.shape, ) -@dispatch(sparse.spmatrix, np.ndarray) +@dispatch(sparse.spmatrix, np.ndarray) # noqa def gearys_c(g, vals): assert g.shape[0] == g.shape[1], "`g` should be a square matrix." if not isinstance(g, sparse.csr_matrix): g = g.tocsr() + g_data = g.data.astype(np.float_, copy=False) if vals.ndim == 1: assert g.shape[0] == vals.shape[0] - return _gearys_c_vec(g.data, g.indices, g.indptr, vals) + return _gearys_c_vec(g_data, g.indices, g.indptr, vals) elif vals.ndim == 2: assert g.shape[0] == vals.shape[1] - return _gearys_c_mtx(g.data, g.indices, g.indptr, vals) + return _gearys_c_mtx(g_data, g.indices, g.indptr, vals) else: raise ValueError() -@dispatch(sparse.spmatrix, (pd.DataFrame, pd.Series)) +@dispatch(sparse.spmatrix, (pd.DataFrame, pd.Series)) # noqa def gearys_c(g, vals): return gearys_c(g, vals.values) -@dispatch(sparse.spmatrix, sparse.spmatrix) + +@dispatch(sparse.spmatrix, sparse.spmatrix) # noqa def gearys_c(g, vals) -> np.ndarray: if not isinstance(g, sparse.csr_matrix): g = g.tocsr() - if not isinstance(vals, sparse.csc_matrix): + if not isinstance(vals, sparse.csr_matrix): vals = vals.tocsr() return gearys_c(g, vals) # TODO: Document better # TODO: Have scanpydoc work with multipledispatch -@dispatch(AnnData) +@dispatch(AnnData) # noqa def gearys_c( adata: AnnData, *, @@ -156,12 +231,12 @@ def gearys_c( by `VISION `_. Geary's C is a measure of autocorrelation for some measure on a graph. This - can be to whether measures are correlated between neighboring cells. Lower + can be to whether measures are correlated between neighboring cells. Lower values indicate greater correlation. ..math - C = + C = \frac{ (N - 1)\sum_{i,j} w_{i,j} (x_i - x_j)^2 }{ @@ -191,7 +266,8 @@ def gearys_c( Returns ------- - If vals is two dimensional, returns a 1 dimensional ndarray array. Returns a scalar if `vals` is 1d. + If vals is two dimensional, returns a 1 dimensional ndarray array. Returns + a scalar if `vals` is 1d. """ if use_graph is None: if "connectivities" in adata.obsp: @@ -203,5 +279,7 @@ def gearys_c( else: raise NotImplementedError() if vals is None: - vals = _choose_obs_rep(adata, use_raw=use_raw, layer=layer, obsm=obsm, obsp=obsp).T + vals = _choose_obs_rep( + adata, use_raw=use_raw, layer=layer, obsm=obsm, obsp=obsp + ).T return gearys_c(g, vals) diff --git a/scanpy/tests/test_metrics.py b/scanpy/tests/test_metrics.py index 6324cfee8e..fae15a636c 100644 --- a/scanpy/tests/test_metrics.py +++ b/scanpy/tests/test_metrics.py @@ -27,6 +27,19 @@ def test_gearys_c(): sc.metrics.gearys_c(g, pbmc.obsm["X_pca"].T), ) + all_genes = sc.metrics.gearys_c(pbmc, layer="raw") + first_gene = sc.metrics.gearys_c( + pbmc, + vals=pbmc.obs_vector(pbmc.var_names[0], layer="raw") + ) + + assert np.allclose(all_genes[0], first_gene) + + assert np.allclose( + sc.metrics.gearys_c(pbmc, layer="raw"), + sc.metrics.gearys_c(pbmc, vals=pbmc.layers["raw"].T.toarray()) + ) + # Test case with perfectly seperated groups connected = np.zeros(100) connected[np.random.choice(100, size=30, replace=False)] = 1 @@ -36,6 +49,8 @@ def test_gearys_c(): graph = sparse.csr_matrix(graph) assert sc.metrics.gearys_c(graph, connected) == 0.0 + assert sc.metrics.gearys_c(graph, connected) \ + == sc.metrics.gearys_c(graph, sparse.csr_matrix(connected)) adata = sc.AnnData( sparse.csr_matrix((100, 100)), obsp={"connectivities": graph} )