From b3a2a6d45bff507625ae3f42d89ddd13389f6bf0 Mon Sep 17 00:00:00 2001
From: Isaac Virshup
Date: Thu, 4 Mar 2021 17:18:33 +1100
Subject: [PATCH] `sc.metrics` module (add confusion matrix & Geary's C methods) (#915)

* Add `sc.metrics` with `gearys_c`

Add a module for computing useful metrics. Started off with Geary's C
since I'm using it and finding it useful. I've also worked out a fairly
fast way to calculate it.

Unfortunately my implementation runs into some issues with some global
configs set by umap (see https://github.com/lmcinnes/umap/issues/306),
so I'm going to see if that can be resolved before changing it.

* Add sc.metrics.confusion_matrix

* Better tests and output for confusion_matrix

* Workaround umap<0.4 and increase numerical stability of gearys_c

* Work around https://github.com/lmcinnes/umap/issues/306 by not calling
out to the kernel function. That code has been kept, but commented out.
* Increase numerical stability by casting data to system width. Tests
were failing due to instability.

* Split up gearys_c tests

* Improved unexpected error message

* gearys_c working again. Sadly, a bit slower

* One option for doc strings

* Simplify implementation to use single dispatch

* release notes
---
 docs/api/index.rst                     |  15 ++
 docs/conf.py                           |   1 +
 docs/release-notes/1.8.0.rst           |   7 +
 scanpy/__init__.py                     |   2 +-
 scanpy/metrics/__init__.py             |   2 +
 scanpy/metrics/_gearys_c.py            | 298 +++++++++++++++++++++++++
 scanpy/metrics/_metrics.py             |  88 ++++++++
 scanpy/tests/test_metrics.py           | 116 ++++++++++
 scanpy/tests/test_package_structure.py |   1 -
 9 files changed, 528 insertions(+), 2 deletions(-)
 create mode 100644 scanpy/metrics/__init__.py
 create mode 100644 scanpy/metrics/_gearys_c.py
 create mode 100644 scanpy/metrics/_metrics.py
 create mode 100644 scanpy/tests/test_metrics.py

diff --git a/docs/api/index.rst b/docs/api/index.rst
index 5ca53b0028..e99bb0a8dd 100644
--- a/docs/api/index.rst
+++ b/docs/api/index.rst
@@ -236,6 +236,21 @@ This module provides useful queries for annotation and enrichment.
 
    queries.enrich
 
 
+Metrics
+-------
+
+.. module:: scanpy.metrics
+.. currentmodule:: scanpy
+
+Collections of useful measurements for evaluating results.
+
+.. autosummary::
+   :toctree: .
+
+   metrics.confusion_matrix
+   metrics.gearys_c
+
+
 Classes
 -------
diff --git a/docs/conf.py b/docs/conf.py
index 2e2b3c5157..46553c1640 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -139,6 +139,7 @@ def setup(app):
     "scanpy.plotting._matrixplot.MatrixPlot": "scanpy.pl.MatrixPlot",
     "scanpy.plotting._dotplot.DotPlot": "scanpy.pl.DotPlot",
     "scanpy.plotting._stacked_violin.StackedViolin": "scanpy.pl.StackedViolin",
+    "pandas.core.series.Series": "pandas.Series",
 }
 
 nitpick_ignore = [
diff --git a/docs/release-notes/1.8.0.rst b/docs/release-notes/1.8.0.rst
index b98676558a..f3759523e0 100644
--- a/docs/release-notes/1.8.0.rst
+++ b/docs/release-notes/1.8.0.rst
@@ -8,6 +8,13 @@
 
 .. _flit: https://flit.readthedocs.io/en/latest/
 
+.. rubric:: Metrics module
+
+- Added :mod:`scanpy.metrics` module!
+
+  - Added :func:`scanpy.metrics.confusion_matrix` for comparing labellings :pr:`915` :smaller:`I Virshup`
+  - Added :func:`scanpy.metrics.gearys_c` for spatial autocorrelation :pr:`915` :smaller:`I Virshup`
+
 .. rubric:: External tools
 
 .. rubric:: Performance enhancements
diff --git a/scanpy/__init__.py b/scanpy/__init__.py
index fa1124b85e..31b7c7dba9 100644
--- a/scanpy/__init__.py
+++ b/scanpy/__init__.py
@@ -14,7 +14,7 @@
     from . import tools as tl
     from . import preprocessing as pp
     from . import plotting as pl
-    from . import datasets, logging, queries, external, get
+    from . import datasets, logging, queries, external, get, metrics
 
     from anndata import AnnData, concat
     from anndata import (
diff --git a/scanpy/metrics/__init__.py b/scanpy/metrics/__init__.py
new file mode 100644
index 0000000000..d542d27dd4
--- /dev/null
+++ b/scanpy/metrics/__init__.py
@@ -0,0 +1,2 @@
+from ._gearys_c import gearys_c
+from ._metrics import confusion_matrix
diff --git a/scanpy/metrics/_gearys_c.py b/scanpy/metrics/_gearys_c.py
new file mode 100644
index 0000000000..4d7f5dd618
--- /dev/null
+++ b/scanpy/metrics/_gearys_c.py
@@ -0,0 +1,298 @@
+from functools import singledispatch
+from typing import Optional, Union
+
+
+from anndata import AnnData
+from scanpy.get import _get_obs_rep
+import numba
+import numpy as np
+import pandas as pd
+from scipy import sparse
+
+
+@singledispatch
+def gearys_c(
+    adata: AnnData,
+    *,
+    vals: Optional[Union[np.ndarray, sparse.spmatrix]] = None,
+    use_graph: Optional[str] = None,
+    layer: Optional[str] = None,
+    obsm: Optional[str] = None,
+    obsp: Optional[str] = None,
+    use_raw: bool = False,
+) -> Union[np.ndarray, float]:
+    r"""
+    Calculate `Geary's C <https://en.wikipedia.org/wiki/Geary%27s_C>`_, as used
+    by VISION.
+
+    Geary's C is a measure of autocorrelation for some measure on a graph. This
+    can be used to assess whether measures are correlated between neighboring
+    cells. Lower values indicate greater correlation.
+
+    .. math::
+
+        C =
+        \frac{
+            (N - 1)\sum_{i,j} w_{i,j} (x_i - x_j)^2
+        }{
+            2W \sum_i (x_i - \bar{x})^2
+        }
+
+    Params
+    ------
+    adata
+    vals
+        Values to calculate Geary's C for. If this is two dimensional, should
+        be of shape `(n_features, n_cells)`. Otherwise should be of shape
+        `(n_cells,)`. This matrix can be selected from elements of the anndata
+        object by using keyword arguments: `layer`, `obsm`, `obsp`, or
+        `use_raw`.
+    use_graph
+        Key to use for graph in anndata object. If not provided, default
+        neighbors connectivities will be used instead.
+    layer
+        Key for `adata.layers` to choose `vals`.
+    obsm
+        Key for `adata.obsm` to choose `vals`.
+    obsp
+        Key for `adata.obsp` to choose `vals`.
+    use_raw
+        Whether to use `adata.raw.X` for `vals`.
+
+
+    This function can also be called on the graph and values directly. In this
+    case the signature looks like:
+
+    Params
+    ------
+    g
+        The graph
+    vals
+        The values
+
+
+    See the examples for more info.
+
+    Returns
+    -------
+    If vals is two dimensional, returns a 1 dimensional ndarray. Returns
+    a scalar if `vals` is 1d.
+
+
+    Examples
+    --------
+
+    Calculate Geary's C for each component of a dimensionality reduction:
+
+    .. code:: python
+
+        import scanpy as sc, numpy as np
+
+        pbmc = sc.datasets.pbmc68k_processed()
+        pc_c = sc.metrics.gearys_c(pbmc, obsm="X_pca")
+
+
+    It's equivalent to calling the function directly on the underlying arrays:
+
+    .. code:: python
+
+        alt = sc.metrics.gearys_c(pbmc.obsp["connectivities"], pbmc.obsm["X_pca"].T)
+        np.testing.assert_array_equal(pc_c, alt)
+    """
+    if use_graph is None:
+        # Fix for anndata<0.7
+        if hasattr(adata, "obsp") and "connectivities" in adata.obsp:
+            g = adata.obsp["connectivities"]
+        elif "neighbors" in adata.uns:
+            g = adata.uns["neighbors"]["connectivities"]
+        else:
+            raise ValueError("Must run neighbors first.")
+    else:
+        raise NotImplementedError()
+    if vals is None:
+        vals = _get_obs_rep(adata, use_raw=use_raw, layer=layer, obsm=obsm, obsp=obsp).T
+    return gearys_c(g, vals)
+
+
+###############################################################################
+# Calculation
+###############################################################################
+# Some notes on the implementation:
+# * This could be phrased as tensor multiplication. However, that does not get
+#   parallelized, while the explicit loop here does, which boosts performance
+#   almost linearly with cores.
+# * Due to umap setting the default threading backend, a parallel numba
+#   function that calls another parallel numba function can get stuck. This
+#   ends up meaning code re-use will be limited until umap 0.4.
+#   See: https://github.com/lmcinnes/umap/issues/306
+# * There can be a fair amount of numerical instability here (big reductions),
+#   so data is cast to float64. Removing these casts/conversions will cause the
+#   tests to fail.
+
+
+@numba.njit(cache=True, parallel=True)
+def _gearys_c_vec(data, indices, indptr, x):
+    W = data.sum()
+    return _gearys_c_vec_W(data, indices, indptr, x, W)
+
+
+@numba.njit(cache=True, parallel=True)
+def _gearys_c_vec_W(data, indices, indptr, x, W):
+    N = len(indptr) - 1
+    x = x.astype(np.float_)
+    x_bar = x.mean()
+
+    total = 0.0
+    for i in numba.prange(N):
+        s = slice(indptr[i], indptr[i + 1])
+        i_indices = indices[s]
+        i_data = data[s]
+        total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))
+
+    numer = (N - 1) * total
+    denom = 2 * W * ((x - x_bar) ** 2).sum()
+    C = numer / denom
+
+    return C
+
+
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# Inner functions (per element C)
+# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+# For calling gearys_c on collections.
+# TODO: These are faster if we can compile them in parallel mode. However,
+# `workqueue` does not allow nested functions to be parallelized.
+# Additionally, there are currently problems with numba's compiler around
+# parallelization of this code:
+# https://github.com/numba/numba/issues/6774#issuecomment-788789663
+
+
+@numba.njit
+def _gearys_c_inner_sparse_x_densevec(g_data, g_indices, g_indptr, x, W):
+    x_bar = x.mean()
+    total = 0.0
+    N = len(x)
+    for i in numba.prange(N):
+        s = slice(g_indptr[i], g_indptr[i + 1])
+        i_indices = g_indices[s]
+        i_data = g_data[s]
+        total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))
+    numer = (N - 1) * total
+    denom = 2 * W * ((x - x_bar) ** 2).sum()
+    C = numer / denom
+    return C
+
+
+@numba.njit
+def _gearys_c_inner_sparse_x_sparsevec(
+    g_data, g_indices, g_indptr, x_data, x_indices, N, W
+):
+    x = np.zeros(N, dtype=np.float_)
+    x[x_indices] = x_data
+    x_bar = np.sum(x_data) / N
+    total = 0.0
+    N = len(x)
+    for i in numba.prange(N):
+        s = slice(g_indptr[i], g_indptr[i + 1])
+        i_indices = g_indices[s]
+        i_data = g_data[s]
+        total += np.sum(i_data * ((x[i] - x[i_indices]) ** 2))
+    numer = (N - 1) * total
+    # Expanded from 2 * W * ((x_k - x_k_bar) ** 2).sum(), but uses sparsity
+    # to skip some calculations
+    # fmt: off
+    denom = (
+        2 * W
+        * (
+            np.sum(x_data ** 2)
+            - np.sum(x_data * x_bar * 2)
+            + (x_bar ** 2) * N
+        )
+    )
+    # fmt: on
+    C = numer / denom
+    return C
+
+
+@numba.njit(cache=True, parallel=True)
+def _gearys_c_mtx(g_data, g_indices, g_indptr, X):
+    M, N = X.shape
+    assert N == len(g_indptr) - 1
+    W = g_data.sum()
+    out = np.zeros(M, dtype=np.float_)
+    for k in numba.prange(M):
+        x = X[k, :].astype(np.float_)
+        out[k] = _gearys_c_inner_sparse_x_densevec(g_data, g_indices, g_indptr, x, W)
+    return out
+
+
+@numba.njit(cache=True, parallel=True)
+def _gearys_c_mtx_csr(
+    g_data, g_indices, g_indptr, x_data, x_indices, x_indptr, x_shape
+):
+    M, N = x_shape
+    W = g_data.sum()
+    out = np.zeros(M, dtype=np.float_)
+    x_data_list = np.split(x_data, x_indptr[1:-1])
+    x_indices_list = np.split(x_indices, x_indptr[1:-1])
+    for k in numba.prange(M):
+        out[k] = _gearys_c_inner_sparse_x_sparsevec(
+            g_data,
+            g_indices,
+            g_indptr,
+            x_data_list[k],
+            x_indices_list[k],
+            N,
+            W,
+        )
+    return out
+
+
+###############################################################################
+# Interface
+###############################################################################
+@singledispatch
+def _resolve_vals(val):
+    return np.asarray(val)
+
+
+@_resolve_vals.register(np.ndarray)
+@_resolve_vals.register(sparse.csr_matrix)
+def _(val):
+    return val
+
+
+@_resolve_vals.register(sparse.spmatrix)
+def _(val):
+    return sparse.csr_matrix(val)
+
+
+@_resolve_vals.register(pd.DataFrame)
+@_resolve_vals.register(pd.Series)
+def _(val):
+    return val.to_numpy()
+
+
+@gearys_c.register(sparse.csr_matrix)
+def _gearys_c(g, vals) -> np.ndarray:
+    assert g.shape[0] == g.shape[1], "`g` should be a square adjacency matrix"
+    vals = _resolve_vals(vals)
+    g_data = g.data.astype(np.float_, copy=False)
+    if isinstance(vals, sparse.csr_matrix):
+        assert g.shape[0] == vals.shape[1]
+        return _gearys_c_mtx_csr(
+            g_data,
+            g.indices,
+            g.indptr,
+            vals.data.astype(np.float_, copy=False),
+            vals.indices,
+            vals.indptr,
+            vals.shape,
+        )
+    elif isinstance(vals, np.ndarray) and vals.ndim == 1:
+        assert g.shape[0] == vals.shape[0]
+        return _gearys_c_vec(g_data, g.indices, g.indptr, vals)
+    elif isinstance(vals, np.ndarray) and vals.ndim == 2:
+        assert g.shape[0] == vals.shape[1]
+        return _gearys_c_mtx(g_data, g.indices, g.indptr, vals)
+    else:
+        raise NotImplementedError()
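Note (illustrative only, not part of the diff): the registered fast path above can be sanity-checked against a direct dense NumPy transcription of the formula in the docstring. This is a minimal sketch assuming the module is importable as `sc.metrics`; `naive_gearys_c` and the random symmetric weight matrix are ad hoc, introduced here purely for the cross-check.

    import numpy as np
    from scipy import sparse
    import scanpy as sc

    def naive_gearys_c(w, x):
        # Dense transcription of
        # C = (N - 1) * sum_ij w_ij (x_i - x_j)^2 / (2 * W * sum_i (x_i - x_bar)^2)
        w = np.asarray(w.todense()) if sparse.issparse(w) else np.asarray(w)
        n = len(x)
        numer = (n - 1) * np.sum(w * (x[:, None] - x[None, :]) ** 2)
        denom = 2 * w.sum() * np.sum((x - x.mean()) ** 2)
        return numer / denom

    rng = np.random.default_rng(0)
    g = sparse.random(100, 100, density=0.1, random_state=0)
    g = (g + g.T).tocsr()  # symmetric weights, as for a kNN connectivity graph
    x = rng.normal(size=100)

    # The numba implementation should agree with the naive dense version
    np.testing.assert_allclose(naive_gearys_c(g, x), sc.metrics.gearys_c(g, x), rtol=1e-6)
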
diff --git a/scanpy/metrics/_metrics.py b/scanpy/metrics/_metrics.py
new file mode 100644
index 0000000000..71edf36dfe
--- /dev/null
+++ b/scanpy/metrics/_metrics.py
@@ -0,0 +1,88 @@
+"""
+Metrics which don't quite deserve their own file.
+"""
+from typing import Optional, Sequence, Union
+
+import pandas as pd
+from pandas.api.types import is_categorical
+from natsort import natsorted
+import numpy as np
+
+
+def confusion_matrix(
+    orig: Union[pd.Series, np.ndarray, Sequence],
+    new: Union[pd.Series, np.ndarray, Sequence],
+    data: Optional[pd.DataFrame] = None,
+    *,
+    normalize: bool = True,
+) -> pd.DataFrame:
+    """\
+    Given an original and new set of labels, create a labelled confusion matrix.
+
+    Parameters `orig` and `new` can either be entries in `data` or categorical
+    arrays of the same size.
+
+    Params
+    ------
+    orig
+        Original labels.
+    new
+        New labels.
+    data
+        Optional dataframe to fill entries from.
+    normalize
+        Should the confusion matrix be normalized?
+
+
+    Examples
+    --------
+
+    .. plot::
+
+        import scanpy as sc; import seaborn as sns
+        pbmc = sc.datasets.pbmc68k_reduced()
+        cmtx = sc.metrics.confusion_matrix("bulk_labels", "louvain", pbmc.obs)
+        sns.heatmap(cmtx)
+
+    """
+    from sklearn.metrics import confusion_matrix as _confusion_matrix
+
+    if data is not None:
+        if isinstance(orig, str):
+            orig = data[orig]
+        if isinstance(new, str):
+            new = data[new]
+
+    # Coercing so I don't have to deal with it later
+    orig, new = pd.Series(orig), pd.Series(new)
+    assert len(orig) == len(new)
+
+    unique_labels = pd.unique(np.concatenate((orig.values, new.values)))
+
+    # Compute
+    mtx = _confusion_matrix(orig, new, labels=unique_labels)
+    if normalize:
+        sums = mtx.sum(axis=1)[:, np.newaxis]
+        mtx = np.divide(mtx, sums, where=sums != 0)
+
+    # Label
+    orig_name = "Original labels" if orig.name is None else orig.name
+    new_name = "New Labels" if new.name is None else new.name
+    df = pd.DataFrame(
+        mtx,
+        index=pd.Index(unique_labels, name=orig_name),
+        columns=pd.Index(unique_labels, name=new_name),
+    )
+
+    # Filter
+    if is_categorical(orig):
+        orig_idx = pd.Series(orig).cat.categories
+    else:
+        orig_idx = natsorted(pd.unique(orig))
+    if is_categorical(new):
+        new_idx = pd.Series(new).cat.categories
+    else:
+        new_idx = natsorted(pd.unique(new))
+    df = df.loc[np.array(orig_idx), np.array(new_idx)]
+
+    return df
diff --git a/scanpy/tests/test_metrics.py b/scanpy/tests/test_metrics.py
new file mode 100644
index 0000000000..846226d1dc
--- /dev/null
+++ b/scanpy/tests/test_metrics.py
@@ -0,0 +1,116 @@
+from operator import eq
+from string import ascii_letters
+
+import numpy as np
+import pandas as pd
+import scanpy as sc
+from scipy import sparse
+
+
+def test_gearys_c_consistency():
+    pbmc = sc.datasets.pbmc68k_reduced()
+    pbmc.layers["raw"] = pbmc.raw.X.copy()
+    g = pbmc.obsp["connectivities"]
+
+    assert eq(
+        sc.metrics.gearys_c(g, pbmc.obs["percent_mito"]),
+        sc.metrics.gearys_c(pbmc, vals=pbmc.obs["percent_mito"]),
+    )
+
+    assert eq(  # Test that series and vectors return same value
+        sc.metrics.gearys_c(g, pbmc.obs["percent_mito"]),
+        sc.metrics.gearys_c(g, pbmc.obs["percent_mito"].values),
+    )
+
+    np.testing.assert_array_equal(
+        sc.metrics.gearys_c(pbmc, obsm="X_pca"),
+        sc.metrics.gearys_c(g, pbmc.obsm["X_pca"].T),
+    )
+
+    all_genes = sc.metrics.gearys_c(pbmc, layer="raw")
+    first_gene = sc.metrics.gearys_c(
+        pbmc, vals=pbmc.obs_vector(pbmc.var_names[0], layer="raw")
+    )
+
+    np.testing.assert_allclose(all_genes[0], first_gene)
+
+    np.testing.assert_allclose(
+        sc.metrics.gearys_c(pbmc, layer="raw"),
+        sc.metrics.gearys_c(pbmc, vals=pbmc.layers["raw"].T.toarray()),
+    )
+
+
+def test_gearys_c_correctness():
+    # Test case with perfectly separated groups
+    connected = np.zeros(100)
+    connected[np.random.choice(100, size=30, replace=False)] = 1
+    graph = np.zeros((100, 100))
+    graph[np.ix_(connected.astype(bool), connected.astype(bool))] = 1
+    graph[np.ix_(~connected.astype(bool), ~connected.astype(bool))] = 1
+    graph = sparse.csr_matrix(graph)
+
+    assert sc.metrics.gearys_c(graph, connected) == 0.0
+    assert eq(
+        sc.metrics.gearys_c(graph, connected),
+        sc.metrics.gearys_c(graph, sparse.csr_matrix(connected)),
+    )
+    # Check for anndata > 0.7
+    if hasattr(sc.AnnData, "obsp"):
+        # Checking that obsp works
+        adata = sc.AnnData(
+            sparse.csr_matrix((100, 100)), obsp={"connectivities": graph}
+        )
+        assert sc.metrics.gearys_c(adata, vals=connected) == 0.0
+
+
+def test_confusion_matrix():
+    mtx = sc.metrics.confusion_matrix(["a", "b"], ["c", "d"], normalize=False)
+    assert mtx.loc["a", "c"] == 1
+    assert mtx.loc["a", "d"] == 0
+    assert mtx.loc["b", "d"] == 1
+    assert mtx.loc["b", "c"] == 0
+
+    mtx = sc.metrics.confusion_matrix(["a", "b"], ["c", "d"], normalize=True)
+    assert mtx.loc["a", "c"] == 1.0
+    assert mtx.loc["a", "d"] == 0.0
+    assert mtx.loc["b", "d"] == 1.0
+    assert mtx.loc["b", "c"] == 0.0
+
+    mtx = sc.metrics.confusion_matrix(
+        ["a", "a", "b", "b"], ["c", "d", "c", "d"], normalize=True
+    )
+    assert np.all(mtx == 0.5)
+
+
+def test_confusion_matrix_randomized():
+    chars = np.array(list(ascii_letters))
+    pos = np.random.choice(len(chars), size=np.random.randint(50, 150))
+    a = chars[pos]
+    b = np.random.permutation(chars)[pos]
+    df = pd.DataFrame({"a": a, "b": b})
+
+    pd.testing.assert_frame_equal(
+        sc.metrics.confusion_matrix("a", "b", df),
+        sc.metrics.confusion_matrix(df["a"], df["b"]),
+    )
+    pd.testing.assert_frame_equal(
+        sc.metrics.confusion_matrix(df["a"].values, df["b"].values),
+        sc.metrics.confusion_matrix(a, b),
+    )
+
+
+def test_confusion_matrix_api():
+    data = pd.DataFrame(
+        {"a": np.random.randint(5, size=100), "b": np.random.randint(5, size=100)}
+    )
+    expected = sc.metrics.confusion_matrix(data["a"], data["b"])
+
+    pd.testing.assert_frame_equal(expected, sc.metrics.confusion_matrix("a", "b", data))
+
+    pd.testing.assert_frame_equal(
+        expected, sc.metrics.confusion_matrix("a", data["b"], data)
+    )
+
+    pd.testing.assert_frame_equal(
+        expected, sc.metrics.confusion_matrix(data["a"], "b", data)
+    )
diff --git a/scanpy/tests/test_package_structure.py b/scanpy/tests/test_package_structure.py
index 8c87d91b6d..abaa8fc312 100644
--- a/scanpy/tests/test_package_structure.py
+++ b/scanpy/tests/test_package_structure.py
@@ -36,7 +36,6 @@ def test_function_headers(f):
     name = f"{f.__module__}.{f.__qualname__}"
     assert f.__doc__ is not None, f"{name} has no docstring"
     lines = getattr(f, "__orig_doc__", f.__doc__).split("\n")
-    assert lines[0], f"{name} needs a single-line summary"
     broken = [i for i, l in enumerate(lines) if l.strip() and not l.startswith("    ")]
     if any(broken):
         msg = f'''\
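Usage sketch for the module added by this patch (illustrative only, not part of the diff). It assumes the `pbmc68k_reduced` dataset used in the tests above, which ships with neighbors, PCA, and the `bulk_labels`/`louvain` annotations already computed.

    import scanpy as sc

    pbmc = sc.datasets.pbmc68k_reduced()

    # Geary's C for a single per-cell value (scalar) and for each PC (1-D array).
    # Lower values indicate stronger autocorrelation on the neighbor graph.
    c_mito = sc.metrics.gearys_c(pbmc, vals=pbmc.obs["percent_mito"])
    c_pcs = sc.metrics.gearys_c(pbmc, obsm="X_pca")

    # Row-normalized confusion matrix comparing two labellings stored in .obs
    cmtx = sc.metrics.confusion_matrix("bulk_labels", "louvain", pbmc.obs)
    print(c_mito, c_pcs.shape, cmtx.shape)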