Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port Insight distance metrics #128

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ install_requires =
matplotlib>=2.1.0
seaborn>=0.11.1
dcor>=0.5.3
pyemd==0.5.1
synthesized-insight>=0.3

[options.packages.find]
where = src
Expand Down
200 changes: 55 additions & 145 deletions src/fairlens/metrics/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@

import numpy as np
import pandas as pd
import pyemd
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy, kruskal, ks_2samp
from scipy.stats import wasserstein_distance
from synthesized_insight.metrics import HellingerDistance as HD
from synthesized_insight.metrics import JensenShannonDivergence as JSD
from synthesized_insight.metrics import KolmogorovSmirnovDistanceTest, KruskalWallisTest
from synthesized_insight.metrics import KullbackLeiblerDivergence as KLD

from .. import utils
from ..metrics import significance as pv


class DistanceMetric(ABC):
Expand Down Expand Up @@ -47,9 +48,9 @@ def __call__(self, x: pd.Series, y: pd.Series) -> Optional[float]:

Args:
x (pd.Series):
The data in the column representing the first group.
The data in the first sample.
y (pd.Series):
The data in the column representing the second group.
The data in the second sample.

Returns:
Optional[float]:
Expand All @@ -67,9 +68,9 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool:

Args:
x (pd.Series):
The data in the column representing the first group.
The data in the first sample.
y (pd.Series):
The data in the column representing the second group.
The data in the second sample.

Returns:
bool:
Expand All @@ -79,36 +80,20 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool:

@abstractmethod
def distance(self, x: pd.Series, y: pd.Series) -> float:
"""Distance between the distribution of numerical data in x and y. Derived classes must implement this.
"""Distance between the distributions in x and y. Derived classes must implement this.

Args:
x (pd.Series):
Numerical data in a column.
The data in the first sample.
y (pd.Series):
Numerical data in a column.
The data in the second sample.

Returns:
float:
The computed distance.
"""
...

def p_value(self, x: pd.Series, y: pd.Series) -> float:
"""Returns a p-value for the test that x and y are sampled from the same distribution.

Args:
x (pd.Series):
Numerical data in a column.
y (pd.Series):
Numerical data in a column.

Returns:
float:
The computed p-value.
"""

raise NotImplementedError()

@property
@abstractmethod
def id(self) -> str:
Expand All @@ -130,103 +115,37 @@ class ContinuousDistanceMetric(DistanceMetric):
Subclasses must implement a distance method.
"""

def __init__(self, p_value_test="bootstrap"):
"""Initialize continuous distance metric.

Args:
p_value_test (str, optional):
Choose which method of resampling will be used to compute the p-value. Overridden by metrics
such as Kolmogorov-Smirnov Distance.
Defaults to "bootstrap".
"""

self.p_value_test = p_value_test

def check_input(self, x: pd.Series, y: pd.Series) -> bool:
x_dtype = utils.infer_dtype(x).dtype
y_dtype = utils.infer_dtype(y).dtype

return x_dtype in ["int64", "float64"] and y_dtype in ["int64", "float64"]

def p_value(self, x: pd.Series, y: pd.Series) -> float:
if self.p_value_test == "permutation":
ts_distribution = pv.permutation_statistic(x, y, self.distance, n_perm=100)
elif self.p_value_test == "bootstrap":
ts_distribution = pv.bootstrap_statistic(x, y, self.distance, n_samples=1000)
else:
raise ValueError('p_value_test must be one of ["permutation", "bootstrap"]')

return pv.resampling_p_value(self.distance(x, y), ts_distribution)


class CategoricalDistanceMetric(DistanceMetric):
"""
Base class for distance metrics on categorical data.

Continuous data is automatically binned to create histograms, bin edges can be provided as an argument
and will be used to bin continuous data. If the data has been pre-binned and consists of pd.Intervals
for instance, the histograms will be computed using the counts of each bin, and the bin_edges, if given,
will be used in metrics such as EarthMoversDistanceCategorical to compute the distance space.

Subclasses must implement a distance_pdf method.
Subclasses must implement a distance method.
"""

def __init__(self, bin_edges: Optional[np.ndarray] = None):
"""Initialize categorical distance metric.

Args:
bin_edges (Optional[np.ndarray], optional):
A numpy array of bin edges used to bin continuous data or to indicate bins of pre-binned data
to metrics which take the distance space into account.
i.e. For bins [0-5, 5-10, 10-15, 15-20], bin_edges would be [0, 5, 10, 15, 20].
See numpy.histogram_bin_edges() for more information.
"""

self.bin_edges = bin_edges

def check_input(self, x: pd.Series, y: pd.Series) -> bool:
x_dtype = utils.infer_dtype(x).dtype
y_dtype = utils.infer_dtype(y).dtype

return x_dtype == y_dtype

def distance(self, x: pd.Series, y: pd.Series) -> float:
(p, q), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, ret_bins=True)

return self.distance_pdf(p, q, bin_edges)

@abstractmethod
def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
"""Distance between 2 aligned normalized histograms. Derived classes must implement this.

Args:
p (pd.Series):
A normalized histogram.
q (pd.Series):
A normalized histogram.
bin_edges (Optional[np.ndarray]):
bin_edges for binned continuous data. Used by metrics such as Earth Mover's Distance to compute the
distance metric space.

Returns:
float:
The computed distance.
"""
...

def p_value(self, x: pd.Series, y: pd.Series) -> float:
(h_x, h_y), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, normalize=False, ret_bins=True)

def distance_call(h_x, h_y):
with np.errstate(divide="ignore", invalid="ignore"):
p = pd.Series(np.nan_to_num(h_x / h_x.sum()))
q = pd.Series(np.nan_to_num(h_y / h_y.sum()))

return self.distance_pdf(p, q, bin_edges)
class BinaryDistanceMetric(DistanceMetric):
"""
Base class for distance metrics on binary data.

ts_distribution = pv.bootstrap_binned_statistic(h_x, h_y, distance_call, n_samples=100)
Subclasses must implement a distance method.
"""

return pv.resampling_p_value(distance_call(h_x, h_y), ts_distribution)
def check_input(self, x: pd.Series, y: pd.Series) -> bool:
joint = pd.concat((x, y))
return utils.infer_distr_type(joint).is_binary() and (np.sort(joint.unique()) == [0, 1]).all()


class MeanDistance(ContinuousDistanceMetric):
Expand All @@ -242,11 +161,11 @@ def id(self) -> str:
return "mean"


class BinomialDistance(ContinuousDistanceMetric):
class BinomialDistance(BinaryDistanceMetric):
"""
Difference distance between two binary data samples.
i.e p_x - p_y, where p_x, p_y are the probabilities of success in x and y, respectively.
The p-value computed is for the null hypothesis that the probability of success is p_y.

Data is assumed to be a series of 1, 0 (success, failure) Bernoulli random variates.
"""

Expand All @@ -256,13 +175,6 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool:
def distance(self, x: pd.Series, y: pd.Series) -> float:
return x.mean() - y.mean()

def p_value(self, x: pd.Series, y: pd.Series) -> float:
p_obs = x.mean()
p_null = y.mean()
n = len(x)

return pv.binominal_proportion_p_value(p_obs, p_null, n)

@property
def id(self) -> str:
return "binomial"
Expand All @@ -274,22 +186,20 @@ class KolmogorovSmirnovDistance(ContinuousDistanceMetric):
"""

def distance(self, x: pd.Series, y: pd.Series) -> float:
return ks_2samp(x, y)[0]

def p_value(self, x: pd.Series, y: pd.Series) -> float:
return ks_2samp(x, y)[1]
return KolmogorovSmirnovDistanceTest()._compute_test(x, y)[0]

@property
def id(self) -> str:
return "ks_distance"


class KruskalWallis(ContinuousDistanceMetric):
def distance(self, x: pd.Series, y: pd.Series) -> float:
return kruskal(x, y)[0]
"""
Kruskal Wallis H test between two data samples.
"""

def p_value(self, x: pd.Series, y: pd.Series) -> float:
return kruskal(x, y)[1]
def distance(self, x: pd.Series, y: pd.Series) -> float:
return KruskalWallisTest()._compute_test(x, y)[0]

@property
def id(self) -> str:
Expand All @@ -299,24 +209,27 @@ def id(self) -> str:
class EarthMoversDistance(CategoricalDistanceMetric):
"""
Earth movers distance (EMD), aka Wasserstein 1-distance, for categorical data.

Using EarthMoversDistance on the raw data is faster and recommended.
"""

def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
distance_matrix = 1 - np.eye(len(p))
def __init__(self, bin_edges: Optional[np.ndarray] = None):
"""
Args:
bin_edges (Optional[np.ndarray], optional):
A list of bin edges used to bin continuous data by or to indicate bins of pre-binned data.
Defaults to None.
"""

self.bin_edges = bin_edges

if bin_edges is not None:
# Use pair-wise euclidean distances between bin centers for scale data
bin_centers = np.mean([bin_edges[:-1], bin_edges[1:]], axis=0)
xx, yy = np.meshgrid(bin_centers, bin_centers)
distance_matrix = np.abs(xx - yy)
def distance(self, x: pd.Series, y: pd.Series) -> float:
(p, q), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, ret_bins=True)

p = np.array(p).astype(np.float64)
q = np.array(q).astype(np.float64)
distance_matrix = distance_matrix.astype(np.float64)
if bin_edges is None:
bin_centers = np.arange(len(p))
else:
bin_centers = (np.array(bin_edges[:-1]) + np.array(bin_edges[1:])) / 2

return pyemd.emd(p, q, distance_matrix)
return wasserstein_distance(bin_centers, bin_centers, u_weights=p, v_weights=q)

@property
def id(self) -> str:
Expand All @@ -328,8 +241,8 @@ class KullbackLeiblerDivergence(CategoricalDistanceMetric):
Kullback–Leibler Divergence or Relative Entropy between two probability distributions.
"""

def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
return entropy(np.array(p), np.array(q))
def distance(self, x: pd.Series, y: pd.Series) -> float:
return KLD()._compute_metric(x, y)

@property
def id(self) -> str:
Expand All @@ -341,8 +254,8 @@ class JensenShannonDivergence(CategoricalDistanceMetric):
Jensen-Shannon Divergence between two probability distributions.
"""

def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
return jensenshannon(p, q)
def distance(self, x: pd.Series, y: pd.Series) -> float:
return JSD()._compute_metric(x, y)

@property
def id(self) -> str:
Expand All @@ -351,24 +264,21 @@ def id(self) -> str:

class Norm(CategoricalDistanceMetric):
"""
LP Norm between two probability distributions.
L-P Norm between two probability distributions.
"""

def __init__(self, bin_edges: Optional[np.ndarray] = None, ord: Union[str, int] = 2):
def __init__(self, ord: Union[str, int] = 2):
"""
Args:
bin_edges (Optional[np.ndarray], optional):
A list of bin edges used to bin continuous data by or to indicate bins of pre-binned data.
Defaults to None.
ord (Union[str, int], optional):
The order of the norm. Possible values include positive numbers, 'fro', 'nuc'.
See numpy.linalg.norm for more details. Defaults to 2.
"""

super().__init__(bin_edges=bin_edges)
self.ord = ord

def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
def distance(self, x: pd.Series, y: pd.Series) -> float:
(p, q), _ = utils.zipped_hist((x, y), ret_bins=True)
return np.linalg.norm(p - q, ord=self.ord)

@property
Expand All @@ -381,8 +291,8 @@ class HellingerDistance(CategoricalDistanceMetric):
Hellinger distance between two probability distributions.
"""

def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
return np.linalg.norm(np.sqrt(p) - np.sqrt(q)) / np.sqrt(2)
def distance(self, x: pd.Series, y: pd.Series) -> float:
return HD()._compute_metric(x, y)

@property
def id(self) -> str:
Expand Down
Loading