From b3a2a6d45bff507625ae3f42d89ddd13389f6bf0 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Thu, 4 Mar 2021 17:18:33 +1100
Subject: [PATCH] `sc.metrics` module (add confusion matrix & Geary's C methods) (#915)

* Add `sc.metrics` with `gearys_c`

Add a module for computing useful metrics. Started off with Geary's C
since I'm using it and finding it useful. I've also worked out a fairly
fast way to calculate it.

Unfortunately my implementation runs into some issues with some global
configs set by umap (see https://github.com/lmcinnes/umap/issues/306),
so I'm going to see if that can be resolved before changing it.

* Add sc.metrics.confusion_matrix

* Better tests and output for confusion_matrix

* Workaround umap<0.4 and increase numerical stability of gearys_c

* Work around https://github.com/lmcinnes/umap/issues/306 by not calling
out to the kernel function. That code has been kept, but commented out.
* Increase numerical stability by casting data to system width. Tests
were failing due to instability.

* Split up gearys_c tests

* Improved unexpected error message

* gearys_c working again. Sadly, a bit slower

* One option for doc strings

* Simplify implementation to use single dispatch

* release notes
---
 docs/api/index.rst                     |  15 ++
 docs/conf.py                           |   1 +
 docs/release-notes/1.8.0.rst           |   7 +
 scanpy/__init__.py                     |   2 +-
 scanpy/metrics/__init__.py             |   2 +
 scanpy/metrics/_gearys_c.py            | 298 +++++++++++++++++++++++++
 scanpy/metrics/_metrics.py             |  88 ++++++++
 scanpy/tests/test_metrics.py           | 116 ++++++++++
 scanpy/tests/test_package_structure.py |   1 -
 9 files changed, 528 insertions(+), 2 deletions(-)
 create mode 100644 scanpy/metrics/__init__.py
 create mode 100644 scanpy/metrics/_gearys_c.py
 create mode 100644 scanpy/metrics/_metrics.py
 create mode 100644 scanpy/tests/test_metrics.py

diff --git a/docs/api/index.rst b/docs/api/index.rst
index 5ca53b0028..e99bb0a8dd 100644
--- a/docs/api/index.rst
+++ b/docs/api/index.rst
@@ -236,6 +236,21 @@ This module provides useful queries for annotation and enrichment.
 
    queries.enrich
 
 
+Metrics
+-------
+
+.. module:: scanpy.metrics
+.. currentmodule:: scanpy
+
+Collections of useful measurements for evaluating results.
+
+.. autosummary::
+   :toctree: .
+
+   metrics.confusion_matrix
+   metrics.gearys_c
+
+
 Classes
 -------
diff --git a/docs/conf.py b/docs/conf.py
index 2e2b3c5157..46553c1640 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -139,6 +139,7 @@ def setup(app):
     "scanpy.plotting._matrixplot.MatrixPlot": "scanpy.pl.MatrixPlot",
     "scanpy.plotting._dotplot.DotPlot": "scanpy.pl.DotPlot",
     "scanpy.plotting._stacked_violin.StackedViolin": "scanpy.pl.StackedViolin",
+    "pandas.core.series.Series": "pandas.Series",
 }
 
 nitpick_ignore = [
diff --git a/docs/release-notes/1.8.0.rst b/docs/release-notes/1.8.0.rst
index b98676558a..f3759523e0 100644
--- a/docs/release-notes/1.8.0.rst
+++ b/docs/release-notes/1.8.0.rst
@@ -8,6 +8,13 @@
 
 .. _flit: https://flit.readthedocs.io/en/latest/
 
+.. rubric:: Metrics module
+
+- Added :mod:`scanpy.metrics` module!
+
+  - Added :func:`scanpy.metrics.confusion_matrix` for comparing labellings :pr:`915` :smaller:`I Virshup`
+  - Added :func:`scanpy.metrics.gearys_c` for spatial autocorrelation :pr:`915` :smaller:`I Virshup`
+
 .. rubric:: External tools
 
 .. rubric:: Performance enhancements
diff --git a/scanpy/__init__.py b/scanpy/__init__.py
index fa1124b85e..31b7c7dba9 100644
--- a/scanpy/__init__.py
+++ b/scanpy/__init__.py
@@ -14,7 +14,7 @@
     from . import tools as tl
     from . import preprocessing as pp
     from . import plotting as pl
-    from . import datasets, logging, queries, external, get
+    from . import datasets, logging, queries, external, get, metrics
 
     from anndata import AnnData, concat
     from anndata import (
diff --git a/scanpy/metrics/__init__.py b/scanpy/metrics/__init__.py
new file mode 100644
index 0000000000..d542d27dd4
--- /dev/null
+++ b/scanpy/metrics/__init__.py
@@ -0,0 +1,2 @@
+from ._gearys_c import gearys_c
+from ._metrics import confusion_matrix
diff --git a/scanpy/metrics/_gearys_c.py b/scanpy/metrics/_gearys_c.py
new file mode 100644
index 0000000000..4d7f5dd618
--- /dev/null
+++ b/scanpy/metrics/_gearys_c.py
@@ -0,0 +1,298 @@
+from functools import singledispatch
+from typing import Optional, Union
+
+
+from anndata import AnnData
+from scanpy.get import _get_obs_rep
+import numba
+import numpy as np
+import pandas as pd
+from scipy import sparse
+
+
+@singledispatch
+def gearys_c(
+    adata: AnnData,
+    *,
+    vals: Optional[Union[np.ndarray, sparse.spmatrix]] = None,
+    use_graph: Optional[str] = None,
+    layer: Optional[str] = None,
+    obsm: Optional[str] = None,
+    obsp: Optional[str] = None,
+    use_raw: bool = False,
+) -> Union[np.ndarray, float]:
+    r"""
+    Calculate `Geary's C <https://en.wikipedia.org/wiki/Geary%27s_C>`_, as used
+    by VISION.
+
+    Geary's C is a measure of autocorrelation for some measure on a graph. This
+    can be used to assess whether measures are correlated between neighboring
+    cells. Lower values indicate greater correlation.
+
+    .. math::
+
+        C =
+        \frac{
+            (N - 1)\sum_{i,j} w_{i,j} (x_i - x_j)^2
+        }{
+            2W \sum_i (x_i - \bar{x})^2
+        }
+
+    Params
+    ------
+    adata
+    vals
+        Values to calculate Geary's C for. If this is two dimensional, should
+        be of shape `(n_features, n_cells)`. Otherwise should be of shape
+        `(n_cells,)`. This matrix can be selected from elements of the anndata
+        object by using keyword arguments: `layer`, `obsm`, `obsp`, or
+        `use_raw`.
+    use_graph
+        Key to use for graph in anndata object. If not provided, default
+        neighbors connectivities will be used instead.
+    layer
+        Key for `adata.layers` to choose `vals`.
+    obsm
+        Key for `adata.obsm` to choose `vals`.
+    obsp
+        Key for `adata.obsp` to choose `vals`.
+    use_raw
+        Whether to use `adata.raw.X` for `vals`.
+
+
+    This function can also be called on the graph and values directly. In this
+    case the signature looks like:
+
+    Params
+    ------
+    g
+        The graph
+    vals
+        The values
+
+
+    See the examples for more info.
+
+    Returns
+    -------
+    If vals is two dimensional, returns a 1 dimensional ndarray. Returns
+    a scalar if `vals` is 1d.
+
+
+    Examples
+    --------
+
+    Calculate Geary's C for each component of a dimensionality reduction:
+
+    .. code:: python
+
+        import scanpy as sc, numpy as np
+
+        pbmc = sc.datasets.pbmc68k_processed()
+        pc_c = sc.metrics.gearys_c(pbmc, obsm="X_pca")
+
+
+    It's equivalent to calling the function directly on the underlying arrays:
+
+    .. code:: python
+
+        alt = sc.metrics.gearys_c(pbmc.obsp["connectivities"], pbmc.obsm["X_pca"].T)
+        np.testing.assert_array_equal(pc_c, alt)
+    """
+    if use_graph is None:
+        # Fix for anndata<0.7
+        if hasattr(adata, "obsp") and "connectivities" in adata.obsp:
+            g = adata.obsp["connectivities"]
+        elif "neighbors" in adata.uns:
+            g = adata.uns["neighbors"]["connectivities"]
+        else:
+            raise ValueError("Must run neighbors first.")
+    else:
+        raise NotImplementedError()
+    if vals is None:
+        vals = _get_obs_rep(adata, use_raw=use_raw, layer=layer, obsm=obsm, obsp=obsp).T
+    return gearys_c(g, vals)
+
+
+###############################################################################
+# Calculation
+###############################################################################
+# Some notes on the implementation:
+# * This could be phrased as tensor multiplication. However, that does not get
+#   parallelized, while the explicit loop here does, which boosts performance
+#   almost linearly with cores.
+# * Due to umap setting the default threading backend, a parallel numba
+#   function that calls another parallel numba function can get stuck. This
+#   ends up meaning code re-use will be limited until umap 0.4.
+#   See: https://github.com/lmcinnes/umap/issues/306
+# * There can be a fair amount of numerical instability here (big reductions),
+#   so data is cast to float64. Removing these casts/conversions will cause the
+#   tests to fail.
+
+
+@numba.njit(cache=True, parallel=True)
+def _gearys_c_vec(data, indices, indptr, x):
+    W = data.sum()
+    return _gearys_c_vec_W(data, indices, indptr, x, W)
+
+
+@numba.njit(cache=True, parallel=True)
+def _gearys_c_vec_W(data, indices, indptr, x, W):
+    N = len(indptr) - 1
+    x = x.astype(np.float_)
+    x_bar = x.mean()
+
+    total = 0.0
+    for i in numba.prange(N):
+        s = slice(indptr[i], indptr[i + 1])
+        i_indices = indices[s]
+        i_data = data[s]
+        total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))
+
+    numer = (N - 1) * total
+    denom = 2 * W * ((x - x_bar) ** 2).sum()
+    C = numer / denom
+
+    return C
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Inner functions (per element C)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# For calling gearys_c on collections.
+# TODO: These are faster if we can compile them in parallel mode. However,
+# `workqueue` does not allow nested functions to be parallelized.
+# Additionally, there are currently problems with numba's compiler around
+# parallelization of this code:
+# https://github.com/numba/numba/issues/6774#issuecomment-788789663
+
+
+@numba.njit
+def _gearys_c_inner_sparse_x_densevec(g_data, g_indices, g_indptr, x, W):
+    x_bar = x.mean()
+    total = 0.0
+    N = len(x)
+    for i in numba.prange(N):
+        s = slice(g_indptr[i], g_indptr[i + 1])
+        i_indices = g_indices[s]
+        i_data = g_data[s]
+        total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))
+    numer = (N - 1) * total
+    denom = 2 * W * ((x - x_bar) ** 2).sum()
+    C = numer / denom
+    return C
+
+
+@numba.njit
+def _gearys_c_inner_sparse_x_sparsevec(
+    g_data, g_indices, g_indptr, x_data, x_indices, N, W
+):
+    x = np.zeros(N, dtype=np.float_)
+    x[x_indices] = x_data
+    x_bar = np.sum(x_data) / N
+    total = 0.0
+    N = len(x)
+    for i in numba.prange(N):
+        s = slice(g_indptr[i], g_indptr[i + 1])
+        i_indices = g_indices[s]
+        i_data = g_data[s]
+        total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))
+    numer = (N - 1) * total
+    # Expanded from 2 * W * ((x_k - x_k_bar) ** 2).sum(), but uses sparsity
+    # to skip some calculations
+    # fmt: off
+    denom = (
+        2 * W
+        * (
+            np.sum(x_data ** 2)
+            - np.sum(x_data * x_bar * 2)
+            + (x_bar ** 2) * N
+        )
+    )
+    # fmt: on
+    C = numer / denom
+    return C
+
+
+@numba.njit(cache=True, parallel=True)
+def _gearys_c_mtx(g_data, g_indices, g_indptr, X):
+    M, N = X.shape
+    assert N == len(g_indptr) - 1
+    W = g_data.sum()
+    out = np.zeros(M, dtype=np.float_)
+    for k in numba.prange(M):
+        x = X[k, :].astype(np.float_)
+        out[k] = _gearys_c_inner_sparse_x_densevec(g_data, g_indices, g_indptr, x, W)
+    return out
+
+
+@numba.njit(cache=True, parallel=True)
+def _gearys_c_mtx_csr(
+    g_data, g_indices, g_indptr, x_data, x_indices, x_indptr, x_shape
+):
+    M, N = x_shape
+    W = g_data.sum()
+    out = np.zeros(M, dtype=np.float_)
+    x_data_list = np.split(x_data, x_indptr[1:-1])
+    x_indices_list = np.split(x_indices, x_indptr[1:-1])
+    for k in numba.prange(M):
+        out[k] = _gearys_c_inner_sparse_x_sparsevec(
+            g_data,
+            g_indices,
+            g_indptr,
+            x_data_list[k],
+            x_indices_list[k],
+            N,
+            W,
+        )
+    return out
+
+
+###############################################################################
+# Interface
+###############################################################################
+@singledispatch
+def _resolve_vals(val):
+    return np.asarray(val)
+
+
+@_resolve_vals.register(np.ndarray)
+@_resolve_vals.register(sparse.csr_matrix)
+def _(val):
+    return val
+
+
+@_resolve_vals.register(sparse.spmatrix)
+def _(val):
+    return sparse.csr_matrix(val)
+
+
+@_resolve_vals.register(pd.DataFrame)
+@_resolve_vals.register(pd.Series)
+def _(val):
+    return val.to_numpy()
+
+
+@gearys_c.register(sparse.csr_matrix)
+def _gearys_c(g, vals) -> np.ndarray:
+    assert g.shape[0] == g.shape[1], "`g` should be a square adjacency matrix"
+    vals = _resolve_vals(vals)
+    g_data = g.data.astype(np.float_, copy=False)
+    if isinstance(vals, sparse.csr_matrix):
+        assert g.shape[0] == vals.shape[1]
+        return _gearys_c_mtx_csr(
+            g_data,
+            g.indices,
+            g.indptr,
+            vals.data.astype(np.float_, copy=False),
+            vals.indices,
+            vals.indptr,
+            vals.shape,
+        )
+    elif isinstance(vals, np.ndarray) and vals.ndim == 1:
+        assert g.shape[0] == vals.shape[0]
+        return _gearys_c_vec(g_data, g.indices, g.indptr, vals)
+    elif isinstance(vals, np.ndarray) and vals.ndim == 2:
+        assert g.shape[0] == vals.shape[1]
+        return _gearys_c_mtx(g_data, g.indices, g.indptr, vals)
+    else:
+        raise NotImplementedError()
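Note (illustrative only, not part of the diff): the registered fast path above can be sanity-checked against a direct dense NumPy transcription of the formula in the docstring. This is a minimal sketch assuming the module is importable as `sc.metrics`; `naive_gearys_c` and the random symmetric weight matrix are ad hoc, introduced here purely for the cross-check.

    import numpy as np
    from scipy import sparse
    import scanpy as sc

    def naive_gearys_c(w, x):
        # Dense transcription of
        # C = (N - 1) * sum_ij w_ij (x_i - x_j)^2 / (2 * W * sum_i (x_i - x_bar)^2)
        w = np.asarray(w.todense()) if sparse.issparse(w) else np.asarray(w)
        n = len(x)
        numer = (n - 1) * np.sum(w * (x[:, None] - x[None, :]) ** 2)
        denom = 2 * w.sum() * np.sum((x - x.mean()) ** 2)
        return numer / denom

    rng = np.random.default_rng(0)
    g = sparse.random(100, 100, density=0.1, random_state=0)
    g = (g + g.T).tocsr()  # symmetric weights, as for a kNN connectivity graph
    x = rng.normal(size=100)

    # The numba implementation should agree with the naive dense version
    np.testing.assert_allclose(naive_gearys_c(g, x), sc.metrics.gearys_c(g, x), rtol=1e-6)
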
diff --git a/scanpy/metrics/_metrics.py b/scanpy/metrics/_metrics.py
new file mode 100644
index 0000000000..71edf36dfe
--- /dev/null
+++ b/scanpy/metrics/_metrics.py
@@ -0,0 +1,88 @@
+"""
+Metrics which don't quite deserve their own file.
+"""
+from typing import Optional, Sequence, Union
+
+import pandas as pd
+from pandas.api.types import is_categorical
+from natsort import natsorted
+import numpy as np
+
+
+def confusion_matrix(
+    orig: Union[pd.Series, np.ndarray, Sequence],
+    new: Union[pd.Series, np.ndarray, Sequence],
+    data: Optional[pd.DataFrame] = None,
+    *,
+    normalize: bool = True,
+) -> pd.DataFrame:
+    """\
+    Given an original and new set of labels, create a labelled confusion matrix.
+
+    Parameters `orig` and `new` can either be entries in `data` or categorical
+    arrays of the same size.
+
+    Params
+    ------
+    orig
+        Original labels.
+    new
+        New labels.
+    data
+        Optional dataframe to fill entries from.
+    normalize
+        Should the confusion matrix be normalized?
+
+
+    Examples
+    --------
+
+    .. plot::
+
+        import scanpy as sc; import seaborn as sns
+        pbmc = sc.datasets.pbmc68k_reduced()
+        cmtx = sc.metrics.confusion_matrix("bulk_labels", "louvain", pbmc.obs)
+        sns.heatmap(cmtx)
+
+    """
+    from sklearn.metrics import confusion_matrix as _confusion_matrix
+
+    if data is not None:
+        if isinstance(orig, str):
+            orig = data[orig]
+        if isinstance(new, str):
+            new = data[new]
+
+    # Coercing so I don't have to deal with it later
+    orig, new = pd.Series(orig), pd.Series(new)
+    assert len(orig) == len(new)
+
+    unique_labels = pd.unique(np.concatenate((orig.values, new.values)))
+
+    # Compute
+    mtx = _confusion_matrix(orig, new, labels=unique_labels)
+    if normalize:
+        sums = mtx.sum(axis=1)[:, np.newaxis]
+        mtx = np.divide(mtx, sums, where=sums != 0)
+
+    # Label
+    orig_name = "Original labels" if orig.name is None else orig.name
+    new_name = "New Labels" if new.name is None else new.name
+    df = pd.DataFrame(
+        mtx,
+        index=pd.Index(unique_labels, name=orig_name),
+        columns=pd.Index(unique_labels, name=new_name),
+    )
+
+    # Filter
+    if is_categorical(orig):
+        orig_idx = pd.Series(orig).cat.categories
+    else:
+        orig_idx = natsorted(pd.unique(orig))
+    if is_categorical(new):
+        new_idx = pd.Series(new).cat.categories
+    else:
+        new_idx = natsorted(pd.unique(new))
+    df = df.loc[np.array(orig_idx), np.array(new_idx)]
+
+    return df
diff --git a/scanpy/tests/test_metrics.py b/scanpy/tests/test_metrics.py
new file mode 100644
index 0000000000..846226d1dc
--- /dev/null
+++ b/scanpy/tests/test_metrics.py
@@ -0,0 +1,116 @@
+from operator import eq
+from string import ascii_letters
+
+import numpy as np
+import pandas as pd
+import scanpy as sc
+from scipy import sparse
+
+
+def test_gearys_c_consistency():
+    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc.layers["raw"] = pbmc.raw.X.copy()
+    g = pbmc.obsp["connectivities"]
+
+    assert eq(
+        sc.metrics.gearys_c(g, pbmc.obs["percent_mito"]),
+        sc.metrics.gearys_c(pbmc, vals=pbmc.obs["percent_mito"]),
+    )
+
+    assert eq(  # Test that series and vectors return same value
+        sc.metrics.gearys_c(g, pbmc.obs["percent_mito"]),
+        sc.metrics.gearys_c(g, pbmc.obs["percent_mito"].values),
+    )
+
+    np.testing.assert_array_equal(
+        sc.metrics.gearys_c(pbmc, obsm="X_pca"),
+        sc.metrics.gearys_c(g, pbmc.obsm["X_pca"].T),
+    )
+
+    all_genes = sc.metrics.gearys_c(pbmc, layer="raw")
+    first_gene = sc.metrics.gearys_c(
+        pbmc, vals=pbmc.obs_vector(pbmc.var_names[0], layer="raw")
+    )
+
+    np.testing.assert_allclose(all_genes[0], first_gene)
+
+    np.testing.assert_allclose(
+        sc.metrics.gearys_c(pbmc, layer="raw"),
+        sc.metrics.gearys_c(pbmc, vals=pbmc.layers["raw"].T.toarray()),
+    )
+
+
+def test_gearys_c_correctness():
+    # Test case with perfectly separated groups
+    connected = np.zeros(100)
+    connected[np.random.choice(100, size=30, replace=False)] = 1
+    graph = np.zeros((100, 100))
+    graph[np.ix_(connected.astype(bool), connected.astype(bool))] = 1
+    graph[np.ix_(~connected.astype(bool), ~connected.astype(bool))] = 1
+    graph = sparse.csr_matrix(graph)
+
+    assert sc.metrics.gearys_c(graph, connected) == 0.0
+    assert eq(
+        sc.metrics.gearys_c(graph, connected),
+        sc.metrics.gearys_c(graph, sparse.csr_matrix(connected)),
+    )
+    # Check for anndata > 0.7
+    if hasattr(sc.AnnData, "obsp"):
+        # Checking that obsp works
+        adata = sc.AnnData(
+            sparse.csr_matrix((100, 100)), obsp={"connectivities": graph}
+        )
+        assert sc.metrics.gearys_c(adata, vals=connected) == 0.0
+
+
+def test_confusion_matrix():
+    mtx = sc.metrics.confusion_matrix(["a", "b"], ["c", "d"], normalize=False)
+    assert mtx.loc["a", "c"] == 1
+    assert mtx.loc["a", "d"] == 0
+    assert mtx.loc["b", "d"] == 1
+    assert mtx.loc["b", "c"] == 0
+
+    mtx = sc.metrics.confusion_matrix(["a", "b"], ["c", "d"], normalize=True)
+    assert mtx.loc["a", "c"] == 1.0
+    assert mtx.loc["a", "d"] == 0.0
+    assert mtx.loc["b", "d"] == 1.0
+    assert mtx.loc["b", "c"] == 0.0
+
+    mtx = sc.metrics.confusion_matrix(
+        ["a", "a", "b", "b"], ["c", "d", "c", "d"], normalize=True
+    )
+    assert np.all(mtx == 0.5)
+
+
+def test_confusion_matrix_randomized():
+    chars = np.array(list(ascii_letters))
+    pos = np.random.choice(len(chars), size=np.random.randint(50, 150))
+    a = chars[pos]
+    b = np.random.permutation(chars)[pos]
+    df = pd.DataFrame({"a": a, "b": b})
+
+    pd.testing.assert_frame_equal(
+        sc.metrics.confusion_matrix("a", "b", df),
+        sc.metrics.confusion_matrix(df["a"], df["b"]),
+    )
+    pd.testing.assert_frame_equal(
+        sc.metrics.confusion_matrix(df["a"].values, df["b"].values),
+        sc.metrics.confusion_matrix(a, b),
+    )
+
+
+def test_confusion_matrix_api():
+    data = pd.DataFrame(
+        {"a": np.random.randint(5, size=100), "b": np.random.randint(5, size=100)}
+    )
+    expected = sc.metrics.confusion_matrix(data["a"], data["b"])
+
+    pd.testing.assert_frame_equal(expected, sc.metrics.confusion_matrix("a", "b", data))
+
+    pd.testing.assert_frame_equal(
+        expected, sc.metrics.confusion_matrix("a", data["b"], data)
+    )
+
+    pd.testing.assert_frame_equal(
+        expected, sc.metrics.confusion_matrix(data["a"], "b", data)
+    )
diff --git a/scanpy/tests/test_package_structure.py b/scanpy/tests/test_package_structure.py
index 8c87d91b6d..abaa8fc312 100644
--- a/scanpy/tests/test_package_structure.py
+++ b/scanpy/tests/test_package_structure.py
@@ -36,7 +36,6 @@ def test_function_headers(f):
     name = f"{f.__module__}.{f.__qualname__}"
     assert f.__doc__ is not None, f"{name} has no docstring"
     lines = getattr(f, "__orig_doc__", f.__doc__).split("\n")
-    assert lines[0], f"{name} needs a single-line summary"
     broken = [i for i, l in enumerate(lines) if l.strip() and not l.startswith("    ")]
     if any(broken):
         msg = f'''\
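Usage sketch for the module added by this patch (illustrative only, not part of the diff). It assumes the `pbmc68k_reduced` dataset used in the tests above, which ships with neighbors, PCA, and the `bulk_labels`/`louvain` annotations already computed.

    import scanpy as sc

    pbmc = sc.datasets.pbmc68k_reduced()

    # Geary's C for a single per-cell value (scalar) and for each PC (1-D array).
    # Lower values indicate stronger autocorrelation on the neighbor graph.
    c_mito = sc.metrics.gearys_c(pbmc, vals=pbmc.obs["percent_mito"])
    c_pcs = sc.metrics.gearys_c(pbmc, obsm="X_pca")

    # Row-normalized confusion matrix comparing two labellings stored in .obs
    cmtx = sc.metrics.confusion_matrix("bulk_labels", "louvain", pbmc.obs)
    print(c_mito, c_pcs.shape, cmtx.shape)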