Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Port Insight distance metrics #128

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ install_requires =
matplotlib>=2.1.0
seaborn>=0.11.1
dcor>=0.5.3
pyemd==0.5.1
synthesized-insight>=0.3

[options.packages.find]
where = src
Expand Down
200 changes: 55 additions & 145 deletions src/fairlens/metrics/distance.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,13 @@

import numpy as np
import pandas as pd
import pyemd
from scipy.spatial.distance import jensenshannon
from scipy.stats import entropy, kruskal, ks_2samp
from scipy.stats import wasserstein_distance
from synthesized_insight.metrics import HellingerDistance as HD
from synthesized_insight.metrics import JensenShannonDivergence as JSD
from synthesized_insight.metrics import KolmogorovSmirnovDistanceTest, KruskalWallisTest
from synthesized_insight.metrics import KullbackLeiblerDivergence as KLD

from .. import utils
from ..metrics import significance as pv


class DistanceMetric(ABC):
Expand Down Expand Up @@ -47,9 +48,9 @@ def __call__(self, x: pd.Series, y: pd.Series) -> Optional[float]:

Args:
x (pd.Series):
The data in the column representing the first group.
The data in the first sample.
y (pd.Series):
The data in the column representing the second group.
The data in the second sample.

Returns:
Optional[float]:
Expand All @@ -67,9 +68,9 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool:

Args:
x (pd.Series):
The data in the column representing the first group.
The data in the first sample.
y (pd.Series):
The data in the column representing the second group.
The data in the second sample.

Returns:
bool:
Expand All @@ -79,36 +80,20 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool:

@abstractmethod
def distance(self, x: pd.Series, y: pd.Series) -> float:
"""Distance between the distribution of numerical data in x and y. Derived classes must implement this.
"""Distance between the distributions in x and y. Derived classes must implement this.

Args:
x (pd.Series):
Numerical data in a column.
The data in the first sample.
y (pd.Series):
Numerical data in a column.
The data in the second sample.

Returns:
float:
The computed distance.
"""
...

def p_value(self, x: pd.Series, y: pd.Series) -> float:
"""Returns a p-value for the test that x and y are sampled from the same distribution.

Args:
x (pd.Series):
Numerical data in a column.
y (pd.Series):
Numerical data in a column.

Returns:
float:
The computed p-value.
"""

raise NotImplementedError()

@property
@abstractmethod
def id(self) -> str:
Expand All @@ -130,103 +115,37 @@ class ContinuousDistanceMetric(DistanceMetric):
Subclasses must implement a distance method.
"""

def __init__(self, p_value_test="bootstrap"):
"""Initialize continuous distance metric.

Args:
p_value_test (str, optional):
Choose which method of resampling will be used to compute the p-value. Overridden by metrics
such as Kolmogorov-Smirnov Distance.
Defaults to "bootstrap".
"""

self.p_value_test = p_value_test

def check_input(self, x: pd.Series, y: pd.Series) -> bool:
x_dtype = utils.infer_dtype(x).dtype
y_dtype = utils.infer_dtype(y).dtype

return x_dtype in ["int64", "float64"] and y_dtype in ["int64", "float64"]

def p_value(self, x: pd.Series, y: pd.Series) -> float:
if self.p_value_test == "permutation":
ts_distribution = pv.permutation_statistic(x, y, self.distance, n_perm=100)
elif self.p_value_test == "bootstrap":
ts_distribution = pv.bootstrap_statistic(x, y, self.distance, n_samples=1000)
else:
raise ValueError('p_value_test must be one of ["permutation", "bootstrap"]')

return pv.resampling_p_value(self.distance(x, y), ts_distribution)


class CategoricalDistanceMetric(DistanceMetric):
"""
Base class for distance metrics on categorical data.

Continuous data is automatically binned to create histograms, bin edges can be provided as an argument
and will be used to bin continuous data. If the data has been pre-binned and consists of pd.Intervals
for instance, the histograms will be computed using the counts of each bin, and the bin_edges, if given,
will be used in metrics such as EarthMoversDistanceCategorical to compute the distance space.

Subclasses must implement a distance_pdf method.
Subclasses must implement a distance method.
"""

def __init__(self, bin_edges: Optional[np.ndarray] = None):
"""Initialize categorical distance metric.

Args:
bin_edges (Optional[np.ndarray], optional):
A numpy array of bin edges used to bin continuous data or to indicate bins of pre-binned data
to metrics which take the distance space into account.
i.e. For bins [0-5, 5-10, 10-15, 15-20], bin_edges would be [0, 5, 10, 15, 20].
See numpy.histogram_bin_edges() for more information.
"""

self.bin_edges = bin_edges

def check_input(self, x: pd.Series, y: pd.Series) -> bool:
x_dtype = utils.infer_dtype(x).dtype
y_dtype = utils.infer_dtype(y).dtype

return x_dtype == y_dtype

def distance(self, x: pd.Series, y: pd.Series) -> float:
(p, q), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, ret_bins=True)

return self.distance_pdf(p, q, bin_edges)

@abstractmethod
def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
"""Distance between 2 aligned normalized histograms. Derived classes must implement this.

Args:
p (pd.Series):
A normalized histogram.
q (pd.Series):
A normalized histogram.
bin_edges (Optional[np.ndarray]):
bin_edges for binned continuous data. Used by metrics such as Earth Mover's Distance to compute the
distance metric space.

Returns:
float:
The computed distance.
"""
...

def p_value(self, x: pd.Series, y: pd.Series) -> float:
(h_x, h_y), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, normalize=False, ret_bins=True)

def distance_call(h_x, h_y):
with np.errstate(divide="ignore", invalid="ignore"):
p = pd.Series(np.nan_to_num(h_x / h_x.sum()))
q = pd.Series(np.nan_to_num(h_y / h_y.sum()))

return self.distance_pdf(p, q, bin_edges)
class BinaryDistanceMetric(DistanceMetric):
"""
Base class for distance metrics on binary data.

ts_distribution = pv.bootstrap_binned_statistic(h_x, h_y, distance_call, n_samples=100)
Subclasses must implement a distance method.
"""

return pv.resampling_p_value(distance_call(h_x, h_y), ts_distribution)
def check_input(self, x: pd.Series, y: pd.Series) -> bool:
joint = pd.concat((x, y))
return utils.infer_distr_type(joint).is_binary() and (np.sort(joint.unique()) == [0, 1]).all()


class MeanDistance(ContinuousDistanceMetric):
Expand All @@ -242,11 +161,11 @@ def id(self) -> str:
return "mean"


class BinomialDistance(ContinuousDistanceMetric):
class BinomialDistance(BinaryDistanceMetric):
"""
Difference distance between two binary data samples.
i.e p_x - p_y, where p_x, p_y are the probabilities of success in x and y, respectively.
The p-value computed is for the null hypothesis that the probability of success is p_y.

Data is assumed to be a series of 1, 0 (success, failure) Bernoulli random variates.
"""

Expand All @@ -256,13 +175,6 @@ def check_input(self, x: pd.Series, y: pd.Series) -> bool:
def distance(self, x: pd.Series, y: pd.Series) -> float:
return x.mean() - y.mean()

def p_value(self, x: pd.Series, y: pd.Series) -> float:
p_obs = x.mean()
p_null = y.mean()
n = len(x)

return pv.binominal_proportion_p_value(p_obs, p_null, n)

@property
def id(self) -> str:
return "binomial"
Expand All @@ -274,22 +186,20 @@ class KolmogorovSmirnovDistance(ContinuousDistanceMetric):
"""

def distance(self, x: pd.Series, y: pd.Series) -> float:
return ks_2samp(x, y)[0]

def p_value(self, x: pd.Series, y: pd.Series) -> float:
return ks_2samp(x, y)[1]
return KolmogorovSmirnovDistanceTest()._compute_test(x, y)[0]

@property
def id(self) -> str:
return "ks_distance"


class KruskalWallis(ContinuousDistanceMetric):
def distance(self, x: pd.Series, y: pd.Series) -> float:
return kruskal(x, y)[0]
"""
Kruskal Wallis H test between two data samples.
"""

def p_value(self, x: pd.Series, y: pd.Series) -> float:
return kruskal(x, y)[1]
def distance(self, x: pd.Series, y: pd.Series) -> float:
return KruskalWallisTest()._compute_test(x, y)[0]

@property
def id(self) -> str:
Expand All @@ -299,24 +209,27 @@ def id(self) -> str:
class EarthMoversDistance(CategoricalDistanceMetric):
"""
Earth movers distance (EMD), aka Wasserstein 1-distance, for categorical data.

Using EarthMoversDistance on the raw data is faster and recommended.
"""

def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
distance_matrix = 1 - np.eye(len(p))
def __init__(self, bin_edges: Optional[np.ndarray] = None):
"""
Args:
bin_edges (Optional[np.ndarray], optional):
A list of bin edges used to bin continuous data by or to indicate bins of pre-binned data.
Defaults to None.
"""

self.bin_edges = bin_edges

if bin_edges is not None:
# Use pair-wise euclidean distances between bin centers for scale data
bin_centers = np.mean([bin_edges[:-1], bin_edges[1:]], axis=0)
xx, yy = np.meshgrid(bin_centers, bin_centers)
distance_matrix = np.abs(xx - yy)
def distance(self, x: pd.Series, y: pd.Series) -> float:
(p, q), bin_edges = utils.zipped_hist((x, y), bin_edges=self.bin_edges, ret_bins=True)

p = np.array(p).astype(np.float64)
q = np.array(q).astype(np.float64)
distance_matrix = distance_matrix.astype(np.float64)
if bin_edges is None:
bin_centers = np.arange(len(p))
else:
bin_centers = (np.array(bin_edges[:-1]) + np.array(bin_edges[1:])) / 2

return pyemd.emd(p, q, distance_matrix)
return wasserstein_distance(bin_centers, bin_centers, u_weights=p, v_weights=q)

@property
def id(self) -> str:
Expand All @@ -328,8 +241,8 @@ class KullbackLeiblerDivergence(CategoricalDistanceMetric):
Kullback–Leibler Divergence or Relative Entropy between two probability distributions.
"""

def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
return entropy(np.array(p), np.array(q))
def distance(self, x: pd.Series, y: pd.Series) -> float:
return KLD()._compute_metric(x, y)

@property
def id(self) -> str:
Expand All @@ -341,8 +254,8 @@ class JensenShannonDivergence(CategoricalDistanceMetric):
Jensen-Shannon Divergence between two probability distributions.
"""

def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
return jensenshannon(p, q)
def distance(self, x: pd.Series, y: pd.Series) -> float:
return JSD()._compute_metric(x, y)

@property
def id(self) -> str:
Expand All @@ -351,24 +264,21 @@ def id(self) -> str:

class Norm(CategoricalDistanceMetric):
"""
LP Norm between two probability distributions.
L-P Norm between two probability distributions.
"""

def __init__(self, bin_edges: Optional[np.ndarray] = None, ord: Union[str, int] = 2):
def __init__(self, ord: Union[str, int] = 2):
"""
Args:
bin_edges (Optional[np.ndarray], optional):
A list of bin edges used to bin continuous data by or to indicate bins of pre-binned data.
Defaults to None.
ord (Union[str, int], optional):
The order of the norm. Possible values include positive numbers, 'fro', 'nuc'.
See numpy.linalg.norm for more details. Defaults to 2.
"""

super().__init__(bin_edges=bin_edges)
self.ord = ord

def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
def distance(self, x: pd.Series, y: pd.Series) -> float:
(p, q), _ = utils.zipped_hist((x, y), ret_bins=True)
return np.linalg.norm(p - q, ord=self.ord)

@property
Expand All @@ -381,8 +291,8 @@ class HellingerDistance(CategoricalDistanceMetric):
Hellinger distance between two probability distributions.
"""

def distance_pdf(self, p: pd.Series, q: pd.Series, bin_edges: Optional[np.ndarray]) -> float:
return np.linalg.norm(np.sqrt(p) - np.sqrt(q)) / np.sqrt(2)
def distance(self, x: pd.Series, y: pd.Series) -> float:
return HD()._compute_metric(x, y)

@property
def id(self) -> str:
Expand Down
Loading