Skip to content

Commit

Permalink
sc.metrics module (add confusion matrix & Geary's C methods) (#915)
Browse files Browse the repository at this point in the history
* Add `sc.metrics` with `gearys_c`

Add a module for computing useful metrics. Started off with Geary's C since I'm using it and finding it useful. I've also got a fairly fast way to calculate it worked out.

Unfortunatly my implementation runs into some issues with some global configs set by umap (see lmcinnes/umap#306), so I'm going to see if that can be resolved before changing it.

* Add sc.metrics.confusion_matrix

* Better tests and output for confusion_matrix

* Workaround umap<0.4 and increase numerical stability of gearys_c

* Work around lmcinnes/umap#306 by not
  calling out to kernel function. That code has been kept, but commented
  out.
* Increase numerical stability by casting data to system width. Tests
  were failing due to instability.

* Split up gearys_c tests

* Improved unexpected error message

* gearys_c working again. Sadly, a bit slower

* One option for doc strings

* Simplify implementation to use single dispatch

* release notes
  • Loading branch information
ivirshup committed Mar 4, 2021
1 parent 6569bd6 commit b3a2a6d
Show file tree
Hide file tree
Showing 9 changed files with 528 additions and 2 deletions.
15 changes: 15 additions & 0 deletions docs/api/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,21 @@ This module provides useful queries for annotation and enrichment.
queries.enrich


Metrics
-------

.. module:: scanpy.metrics
.. currentmodule:: scanpy

Collections of useful measurements for evaluating results.

.. autosummary::
:toctree: .

metrics.confusion_matrix
metrics.gearys_c


Classes
-------

Expand Down
1 change: 1 addition & 0 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def setup(app):
"scanpy.plotting._matrixplot.MatrixPlot": "scanpy.pl.MatrixPlot",
"scanpy.plotting._dotplot.DotPlot": "scanpy.pl.DotPlot",
"scanpy.plotting._stacked_violin.StackedViolin": "scanpy.pl.StackedViolin",
"pandas.core.series.Series": "pandas.Series",
}

nitpick_ignore = [
Expand Down
7 changes: 7 additions & 0 deletions docs/release-notes/1.8.0.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,13 @@

.. _flit: https://flit.readthedocs.io/en/latest/

.. rubric:: Metrics module

- Added :mod:`scanpy.metrics` module!

- Added :func:`scanpy.metrics.confusion_matrix` for comparing labellings :pr:`915` :smaller:`I Virshup`
- Added :func:`scanpy.metrics.gearys_c` for spatial autocorrelation :pr:`915` :smaller:`I Virshup`

.. rubric:: External tools

.. rubric:: Performance enhancements
Expand Down
2 changes: 1 addition & 1 deletion scanpy/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from . import tools as tl
from . import preprocessing as pp
from . import plotting as pl
from . import datasets, logging, queries, external, get
from . import datasets, logging, queries, external, get, metrics

from anndata import AnnData, concat
from anndata import (
Expand Down
2 changes: 2 additions & 0 deletions scanpy/metrics/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
from ._gearys_c import gearys_c
from ._metrics import confusion_matrix
298 changes: 298 additions & 0 deletions scanpy/metrics/_gearys_c.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,298 @@
from functools import singledispatch
from typing import Optional, Union


from anndata import AnnData
from scanpy.get import _get_obs_rep
import numba
import numpy as np
import pandas as pd
from scipy import sparse


@singledispatch
def gearys_c(
adata: AnnData,
*,
vals: Optional[Union[np.ndarray, sparse.spmatrix]] = None,
use_graph: Optional[str] = None,
layer: Optional[str] = None,
obsm: Optional[str] = None,
obsp: Optional[str] = None,
use_raw: bool = False,
) -> Union[np.ndarray, float]:
r"""
Calculate `Geary's C <https://en.wikipedia.org/wiki/Geary's_C>`_, as used
by `VISION <https://doi.org/10.1038/s41467-019-12235-0>`_.
Geary's C is a measure of autocorrelation for some measure on a graph. This
can be to whether measures are correlated between neighboring cells. Lower
values indicate greater correlation.
.. math::
C =
\frac{
(N - 1)\sum_{i,j} w_{i,j} (x_i - x_j)^2
}{
2W \sum_i (x_i - \bar{x})^2
}
Params
------
adata
vals
Values to calculate Geary's C for. If this is two dimensional, should
be of shape `(n_features, n_cells)`. Otherwise should be of shape
`(n_cells,)`. This matrix can be selected from elements of the anndata
object by using key word arguments: `layer`, `obsm`, `obsp`, or
`use_raw`.
use_graph
Key to use for graph in anndata object. If not provided, default
neighbors connectivities will be used instead.
layer
Key for `adata.layers` to choose `vals`.
obsm
Key for `adata.obsm` to choose `vals`.
obsp
Key for `adata.obsp` to choose `vals`.
use_raw
Whether to use `adata.raw.X` for `vals`.
This function can also be called on the graph and values directly. In this case
the signature looks like:
Params
------
g
The graph
vals
The values
See the examples for more info.
Returns
-------
If vals is two dimensional, returns a 1 dimensional ndarray array. Returns
a scalar if `vals` is 1d.
Examples
--------
Calculate Gearys C for each components of a dimensionality reduction:
.. code:: python
import scanpy as sc, numpy as np
pbmc = sc.datasets.pbmc68k_processed()
pc_c = sc.metrics.gearys_c(pbmc, obsm="X_pca")
It's equivalent to call the function directly on the underlying arrays:
.. code:: python
alt = sc.metrics.gearys_c(pbmc.obsp["connectivities"], pbmc.obsm["X_pca"].T)
np.testing.assert_array_equal(pc_c, alt)
"""
if use_graph is None:
# Fix for anndata<0.7
if hasattr(adata, "obsp") and "connectivities" in adata.obsp:
g = adata.obsp["connectivities"]
elif "neighbors" in adata.uns:
g = adata.uns["neighbors"]["connectivities"]
else:
raise ValueError("Must run neighbors first.")
else:
raise NotImplementedError()
if vals is None:
vals = _get_obs_rep(adata, use_raw=use_raw, layer=layer, obsm=obsm, obsp=obsp).T
return gearys_c(g, vals)


###############################################################################
# Calculation
###############################################################################
# Some notes on the implementation:
# * This could be phrased as tensor multiplication. However that does not get
# parallelized, which boosts performance almost linearly with cores.
# * Due to the umap setting the default threading backend, a parallel numba
# function that calls another parallel numba function can get stuck. This
# ends up meaning code re-use will be limited until umap 0.4.
# See: https://github.com/lmcinnes/umap/issues/306
# * There can be a fair amount of numerical instability here (big reductions),
# so data is cast to float64. Removing these casts/ conversion will cause the
# tests to fail.


@numba.njit(cache=True, parallel=True)
def _gearys_c_vec(data, indices, indptr, x):
W = data.sum()
return _gearys_c_vec_W(data, indices, indptr, x, W)


@numba.njit(cache=True, parallel=True)
def _gearys_c_vec_W(data, indices, indptr, x, W):
N = len(indptr) - 1
x = x.astype(np.float_)
x_bar = x.mean()

total = 0.0
for i in numba.prange(N):
s = slice(indptr[i], indptr[i + 1])
i_indices = indices[s]
i_data = data[s]
total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))

numer = (N - 1) * total
denom = 2 * W * ((x - x_bar) ** 2).sum()
C = numer / denom

return C


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# Inner functions (per element C)
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# For calling gearys_c on collections.
# TODO: These are faster if we can compile them in parallel mode. However,
# `workqueue` does not allow nested functions to be parallelized.
# Additionally, there are currently problems with numba's compiler around
# parallelization of this code:
# https://github.com/numba/numba/issues/6774#issuecomment-788789663


@numba.njit
def _gearys_c_inner_sparse_x_densevec(g_data, g_indices, g_indptr, x, W):
x_bar = x.mean()
total = 0.0
N = len(x)
for i in numba.prange(N):
s = slice(g_indptr[i], g_indptr[i + 1])
i_indices = g_indices[s]
i_data = g_data[s]
total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))
numer = (N - 1) * total
denom = 2 * W * ((x - x_bar) ** 2).sum()
C = numer / denom
return C


@numba.njit
def _gearys_c_inner_sparse_x_sparsevec(
g_data, g_indices, g_indptr, x_data, x_indices, N, W
):
x = np.zeros(N, dtype=np.float_)
x[x_indices] = x_data
x_bar = np.sum(x_data) / N
total = 0.0
N = len(x)
for i in numba.prange(N):
s = slice(g_indptr[i], g_indptr[i + 1])
i_indices = g_indices[s]
i_data = g_data[s]
total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))
numer = (N - 1) * total
# Expanded from 2 * W * ((x_k - x_k_bar) ** 2).sum(), but uses sparsity
# to skip some calculations
# fmt: off
denom = (
2 * W
* (
np.sum(x_data ** 2)
- np.sum(x_data * x_bar * 2)
+ (x_bar ** 2) * N
)
)
# fmt: on
C = numer / denom
return C


@numba.njit(cache=True, parallel=True)
def _gearys_c_mtx(g_data, g_indices, g_indptr, X):
M, N = X.shape
assert N == len(g_indptr) - 1
W = g_data.sum()
out = np.zeros(M, dtype=np.float_)
for k in numba.prange(M):
x = X[k, :].astype(np.float_)
out[k] = _gearys_c_inner_sparse_x_densevec(g_data, g_indices, g_indptr, x, W)
return out


@numba.njit(cache=True, parallel=True)
def _gearys_c_mtx_csr(
g_data, g_indices, g_indptr, x_data, x_indices, x_indptr, x_shape
):
M, N = x_shape
W = g_data.sum()
out = np.zeros(M, dtype=np.float_)
x_data_list = np.split(x_data, x_indptr[1:-1])
x_indices_list = np.split(x_indices, x_indptr[1:-1])
for k in numba.prange(M):
out[k] = _gearys_c_inner_sparse_x_sparsevec(
g_data,
g_indices,
g_indptr,
x_data_list[k],
x_indices_list[k],
N,
W,
)
return out


###############################################################################
# Interface
###############################################################################
@singledispatch
def _resolve_vals(val):
return np.asarray(val)


@_resolve_vals.register(np.ndarray)
@_resolve_vals.register(sparse.csr_matrix)
def _(val):
return val


@_resolve_vals.register(sparse.spmatrix)
def _(val):
return sparse.csr_matrix(val)


@_resolve_vals.register(pd.DataFrame)
@_resolve_vals.register(pd.Series)
def _(val):
return val.to_numpy()


@gearys_c.register(sparse.csr_matrix)
def _gearys_c(g, vals) -> np.ndarray:
assert g.shape[0] == g.shape[1], "`g` should be a square adjacency matrix"
vals = _resolve_vals(vals)
g_data = g.data.astype(np.float_, copy=False)
if isinstance(vals, sparse.csr_matrix):
assert g.shape[0] == vals.shape[1]
return _gearys_c_mtx_csr(
g_data,
g.indices,
g.indptr,
vals.data.astype(np.float_, copy=False),
vals.indices,
vals.indptr,
vals.shape,
)
elif isinstance(vals, np.ndarray) and vals.ndim == 1:
assert g.shape[0] == vals.shape[0]
return _gearys_c_vec(g_data, g.indices, g.indptr, vals)
elif isinstance(vals, np.ndarray) and vals.ndim == 2:
assert g.shape[0] == vals.shape[1]
return _gearys_c_mtx(g_data, g.indices, g.indptr, vals)
else:
raise NotImplementedError()
Loading

0 comments on commit b3a2a6d

Please sign in to comment.