Correlation speedup #123

Open · wants to merge 15 commits into base: main
Changes from 8 commits
8 changes: 3 additions & 5 deletions docs/user_guide/correlations.rst
@@ -65,7 +65,7 @@ Correlation Heatmaps
^^^^^^^^^^^^^^^^^^^^

The :code:`plot` module allows users to generate a correlation heatmap of any dataset by simply
-passing the dataframe to the :code:`two_column_heatmap()` function, which will plot a heatmap from the
+passing the dataframe to the :code:`heatmap()` function, which will plot a heatmap from the
matrix of the correlation coefficients computed using the Pearson coefficient, the Kruskal-Wallis
test, and Cramer's V between each pair of columns (for numerical-numerical, categorical-numerical, and
categorical-categorical associations, respectively).
@@ -92,19 +92,17 @@
This will automatically choose different methods for different types of data, however these
are configurable.

.. ipython:: python
-    :okwarning:

    @savefig corr_heatmap_1.png
-    fl.plot.two_column_heatmap(df)
+    fl.plot.heatmap(df)


Let's try generating a heatmap of the same dataset, but using some non-linear metrics
for numerical-numerical and numerical-categorical associations for added precision.

.. ipython:: python
-    :okwarning:

    from fairlens.metrics import distance_nn_correlation, distance_cn_correlation, cramers_v

    @savefig corr_heatmap_2.png
-    fl.plot.two_column_heatmap(df, distance_nn_correlation, distance_cn_correlation, cramers_v)
+    fl.plot.heatmap(df, distance_nn_correlation, distance_cn_correlation, cramers_v)
2 changes: 2 additions & 0 deletions src/fairlens/metrics/__init__.py
@@ -23,6 +23,7 @@
    cramers_v,
    distance_cn_correlation,
    distance_nn_correlation,
+    pearson,
    r2_mcfadden,
    kruskal_wallis,
    kruskal_wallis_boolean,
@@ -58,6 +59,7 @@
"cramers_v",
"distance_cn_correlation",
"distance_nn_correlation",
"pearson",
"r2_mcfadden",
"kruskal_wallis",
"kruskal_wallis_boolean",
73 changes: 42 additions & 31 deletions src/fairlens/metrics/correlation.py
@@ -11,6 +11,9 @@
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler

+EPSILON = 1e-6
+MIN_MEAN_SAMPLE_SIZE = 20


def cramers_v(sr_a: pd.Series, sr_b: pd.Series) -> float:
"""Metric that calculates the corrected Cramer's V statistic for categorical-categorical
@@ -23,43 +26,50 @@ def cramers_v(sr_a: pd.Series, sr_b: pd.Series) -> float:
            Second categorical series to analyze.

    Returns:
-        float: Value of the statistic.
+        float:
+            Value of the statistic.
    """

-    if len(sr_a.value_counts()) == 1:
-        return 0
-    if len(sr_b.value_counts()) == 1:
-        return 0
-    else:
-        confusion_matrix = pd.crosstab(sr_a, sr_b)
-
-        if confusion_matrix.shape[0] == 2:
-            correct = False
-        else:
-            correct = True
-
-        chi2 = ss.chi2_contingency(confusion_matrix, correction=correct)[0]
-        n = sum(confusion_matrix.sum())
-        phi2 = chi2 / n
-        r, k = confusion_matrix.shape
-        phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
-        rcorr = r - ((r - 1) ** 2) / (n - 1)
-        kcorr = k - ((k - 1) ** 2) / (n - 1)
-        return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))
+    if sr_a.equals(sr_b):
+        return 1
+
+    confusion_matrix = pd.crosstab(sr_a, sr_b)
+    r, k = confusion_matrix.shape
+    n = confusion_matrix.to_numpy().sum()
+
+    if r < 2 or k < 2:
+        return 0
+
+    chi2 = ss.chi2_contingency(confusion_matrix, correction=(confusion_matrix.shape[0] != 2))[0]
+    phi2 = chi2 / n
+
+    phi2corr = phi2 - ((k - 1) * (r - 1)) / (n - 1)
+
+    if phi2corr <= EPSILON:
+        return 0
+
+    rcorr = r - ((r - 1) ** 2) / (n - 1)
+    kcorr = k - ((k - 1) ** 2) / (n - 1)
+
+    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))

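A quick usage sketch of the rewritten cramers_v, assuming the function above is in scope (the toy series are invented for illustration):

    import pandas as pd

    sex = pd.Series(["M", "F", "M", "F", "M", "F", "M", "F"] * 25)
    pref = pd.Series(["A", "B", "A", "B", "A", "B", "B", "A"] * 25)

    cramers_v(sex, sex)   # identical series short-circuit -> 1
    cramers_v(sex, pref)  # bias-corrected Cramer's V, a value in [0, 1]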

def pearson(sr_a: pd.Series, sr_b: pd.Series) -> float:
-    """Metric that calculates Pearson's correlation coefficent for numerical-numerical
+    """Calculates Pearson's correlation coefficient for numerical-numerical
    pairs of series, used in heatmap generation.

    Args:
-        sr_a (pd.Series): First numerical series to analyze.
-        sr_b (pd.Series): Second numerical series to analyze.
+        sr_a (pd.Series):
+            First numerical series to analyze.
+        sr_b (pd.Series):
+            Second numerical series to analyze.

    Returns:
-        float: Value of the coefficient.
+        float:
+            Value of the coefficient.
    """
-    return abs(sr_a.corr(sr_b))
+
+    return sr_a.corr(sr_b, method="pearson")

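Note that the returned value is now signed rather than absolute. A small sketch (toy data) of the behavioural difference:

    import pandas as pd

    x = pd.Series([1, 2, 3, 4, 5])
    y = pd.Series([10, 8, 6, 4, 2])

    x.corr(y, method="pearson")  # -1.0 with the new signed behaviour
    abs(x.corr(y))               # 1.0, what the old implementation returned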

def r2_mcfadden(sr_a: pd.Series, sr_b: pd.Series) -> float:
@@ -78,6 +88,7 @@ def r2_mcfadden(sr_a: pd.Series, sr_b: pd.Series) -> float:
    Returns:
        float: Value of the pseudo-R2 McFadden score.
    """
+
    x = sr_b.to_numpy().reshape(-1, 1)
    x = StandardScaler().fit_transform(x)
    y = sr_a.to_numpy()
@@ -120,16 +131,17 @@ def kruskal_wallis(sr_a: pd.Series, sr_b: pd.Series) -> float:
    p-value is the probability that the two columns are not correlated.
    """

    sr_a = sr_a.astype("category").cat.codes
    groups = sr_b.groupby(sr_a)
-    arrays = [groups.get_group(category) for category in sr_a.unique()]
-
-    args = [group.array for group in arrays]
-    try:
-        _, p_val = ss.kruskal(*args, nan_policy="omit")
-    except ValueError:
-        return 0
+
+    if len(groups) < 2:
+        return 0
+
+    args = [groups.get_group(category).array for category in sr_a.unique()]
+
+    if np.mean([len(values) for values in args]) <= MIN_MEAN_SAMPLE_SIZE:
+        return 0
+
+    _, p_val = ss.kruskal(*args, nan_policy="omit")

    return p_val

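A usage sketch of the new guards, assuming the updated kruskal_wallis above is in scope (toy data invented for illustration):

    import numpy as np
    import pandas as pd

    rng = np.random.default_rng(0)
    group = pd.Series(["a"] * 50 + ["b"] * 50)
    values = pd.Series(np.concatenate([rng.normal(0, 1, 50), rng.normal(3, 1, 50)]))

    kruskal_wallis(group, values)                   # small p-value: the groups differ
    kruskal_wallis(pd.Series(["a"] * 100), values)  # a single group -> 0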

@@ -147,7 +159,8 @@ def kruskal_wallis_boolean(sr_a: pd.Series, sr_b: pd.Series, p_cutoff: float = 0
            The maximum admitted p-value for the distributions to be considered independent.

    Returns:
-        bool: Bool value representing whether or not the two series are correlated.
+        bool:
+            Bool value representing whether or not the two series are correlated.
    """

    sr_a = sr_a.astype("category").cat.codes
@@ -181,8 +194,6 @@ def distance_nn_correlation(sr_a: pd.Series, sr_b: pd.Series) -> float:
        The correlation coefficient.
    """

-    warnings.filterwarnings(action="ignore", category=UserWarning)
-
    if sr_a.size < sr_b.size:
        sr_a = sr_a.append(pd.Series(sr_a.mean()).repeat(sr_b.size - sr_a.size), ignore_index=True)
    elif sr_a.size > sr_b.size:
68 changes: 34 additions & 34 deletions src/fairlens/metrics/unified.py
@@ -2,9 +2,9 @@
Collection of helper methods which can be used to interface metrics.
"""

-import multiprocessing as mp
-from typing import Any, Callable, List, Mapping, Optional, Tuple, Type, Union
+from typing import Any, Callable, List, Mapping, Tuple, Type, Union

+import numpy as np
import pandas as pd

from .. import utils
@@ -118,8 +118,6 @@ def correlation_matrix(
    num_num_metric: Callable[[pd.Series, pd.Series], float] = pearson,
    cat_num_metric: Callable[[pd.Series, pd.Series], float] = kruskal_wallis,
    cat_cat_metric: Callable[[pd.Series, pd.Series], float] = cramers_v,
-    columns_x: Optional[List[str]] = None,
-    columns_y: Optional[List[str]] = None,
) -> pd.DataFrame:
"""This function creates a correlation matrix out of a dataframe, using a correlation metric for each
possible type of pair of series (i.e. numerical-numerical, categorical-numerical, categorical-categorical).
@@ -135,60 +133,62 @@ cat_num_metric (Callable[[pd.Series, pd.Series], float], optional):
        cat_cat_metric (Callable[[pd.Series, pd.Series], float], optional):
            The correlation metric used for categorical-categorical series pairs. Defaults to corrected Cramer's V
            statistic.
-        columns_x (Optional[List[str]]):
-            The column names that determine the rows of the matrix.
-        columns_y (Optional[List[str]]):
-            The column names that determine the columns of the matrix.

    Returns:
        pd.DataFrame:
            The correlation matrix to be used in heatmap generation.
    """

Comment from Hilly12 (Contributor Author):
The correlation matrix is generated using df.corr(). Since df.corr() only works on numerical data, we need to encode all the columns. The issue with this is that we use the infer_distr_type() function to decide which metric would be suitable, which works differently on the encoded numerical data. The only way to resolve this issue is to infer types beforehand (which is probably more efficient). The problem then becomes about making a binary function (a, b) -> float that knows the types of a and b beforehand.

Comment from Hilly12 (Contributor Author):
Just to add, using df.corr() provides a major performance improvement.

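To make the dispatch above concrete, a minimal sketch (invented toy frame) of how df.corr hands column pairs to a custom callable: pandas calls method(a, b) with two aligned 1-D numpy arrays for every column pair, forces 1.0 on the diagonal, and treats the callable as symmetric.

    import numpy as np
    import pandas as pd

    toy = pd.DataFrame({"x": [1.0, 2.0, 3.0], "y": [2.0, 4.0, 6.0]})

    def corrf(a: np.ndarray, b: np.ndarray) -> float:
        # stand-in for _correlation_matrix_helper
        return float(np.corrcoef(a, b)[0, 1])

    toy.corr(method=corrf)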
-    if columns_x is None:
-        columns_x = df.columns
-
-    if columns_y is None:
-        columns_y = df.columns
-
-    pool = mp.Pool(mp.cpu_count())
-
-    series_list = [
-        pd.Series(
-            pool.starmap(
-                _correlation_matrix_helper,
-                [(df[col_x], df[col_y], num_num_metric, cat_num_metric, cat_cat_metric) for col_x in columns_x],
-            ),
-            index=columns_x,
-            name=col_y,
-        )
-        for col_y in columns_y
-    ]
+    df = df.copy()
+
+    distr_types = [utils.infer_distr_type(df[col]) for col in df.columns]
+
+    for col in df.columns:
+        df[col] = utils.infer_dtype(df[col])
+
+        if df[col].dtype.kind == "O":
+            df[col] = pd.Series(pd.factorize(df[col], na_sentinel=-1)[0]).replace(-1, np.nan)
+
+    df = df.append(pd.DataFrame({col: [i] for i, col in enumerate(df.columns)}))

Comment from Hilly12 (Contributor Author), Sep 9, 2021:
The idea here is - it's impossible to know which column corresponds to which distribution type in the helper, so we append the column's index in the data frame to it (as the final row). Then in the helper, we use that row to index the precomputed distribution types and drop that row. There might be a better way of doing this.
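A toy illustration (invented data) of the index-row trick described above, mirroring the df.append call in the diff:

    import pandas as pd

    toy = pd.DataFrame({"age": [23, 41, 35], "group": [0, 1, 0]})
    tagged = toy.append(pd.DataFrame({col: [i] for i, col in enumerate(toy.columns)}))

    a = tagged["age"].to_numpy()
    int(a[-1])  # 0, the position of "age", used to look up its precomputed type
    a[:-1]      # the actual column values handed to the metric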

Comment from a Contributor:
I think another way to do this would be to revert back to using utils.infer_distribution_type but also adding a functools.lru_cache to the utils.infer_distribution_type function so that we avoid repetitive calculations.

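A hedged sketch of that caching alternative. pd.Series is not hashable, so lru_cache cannot wrap a Series-taking function directly; one workaround (an assumption, not spelled out in the suggestion) is to key the cache on the column name:

    from functools import lru_cache

    def make_cached_type_lookup(df: pd.DataFrame):
        @lru_cache(maxsize=None)
        def distr_type(col: str):
            return utils.infer_distr_type(df[col])  # computed once per column

        return distr_type

    lookup = make_cached_type_lookup(df)
    lookup("age")  # first call computes the type
    lookup("age")  # repeat lookups are served from the cache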
-    pool.close()
+    def corr(a: np.ndarray, b: np.ndarray):
+        return _correlation_matrix_helper(
+            a,
+            b,
+            distr_types=distr_types,
+            num_num_metric=num_num_metric,
+            cat_num_metric=cat_num_metric,
+            cat_cat_metric=cat_cat_metric,
+        )

-    return pd.concat(series_list, axis=1, keys=[series.name for series in series_list])
+    return df.corr(method=corr)


def _correlation_matrix_helper(
-    sr_a: pd.Series,
-    sr_b: pd.Series,
+    a: np.ndarray,
+    b: np.ndarray,
+    distr_types: List[utils.DistrType],
    num_num_metric: Callable[[pd.Series, pd.Series], float] = pearson,
    cat_num_metric: Callable[[pd.Series, pd.Series], float] = kruskal_wallis,
    cat_cat_metric: Callable[[pd.Series, pd.Series], float] = cramers_v,
) -> float:

-    a_type = utils.infer_distr_type(sr_a)
-    b_type = utils.infer_distr_type(sr_b)
+    a_type = distr_types[int(a[-1])]
+    b_type = distr_types[int(b[-1])]
+
+    sr_a = pd.Series(a[:-1])
+    sr_b = pd.Series(b[:-1])
+
+    df = pd.DataFrame({"a": sr_a, "b": sr_b}).dropna().reset_index()

Comment from Hilly12 (Contributor Author):
Columns need to be joined so any rows with nulls are dropped before the correlation metric is applied.

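A small sketch (toy values) of why the join matters: rows where either side is null are dropped together.

    import numpy as np
    import pandas as pd

    sr_a = pd.Series([1.0, 2.0, np.nan, 4.0])
    sr_b = pd.Series([1.0, np.nan, 3.0, 4.0])

    pd.DataFrame({"a": sr_a, "b": sr_b}).dropna()  # keeps only rows 0 and 3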
    if a_type.is_continuous() and b_type.is_continuous():
-        return num_num_metric(sr_a, sr_b)
+        return num_num_metric(df["a"], df["b"])

    elif b_type.is_continuous():
-        return cat_num_metric(sr_a, sr_b)
+        return cat_num_metric(df["a"], df["b"])

    elif a_type.is_continuous():
-        return cat_num_metric(sr_b, sr_a)
+        return cat_num_metric(df["b"], df["a"])

    else:
-        return cat_cat_metric(sr_a, sr_b)
+        return cat_cat_metric(df["a"], df["b"])
4 changes: 2 additions & 2 deletions src/fairlens/plot/__init__.py
@@ -3,8 +3,8 @@
"""


from .correlation import heatmap
from .distr import attr_distr_plot, distr_plot, mult_distr_plot
from .heatmap import two_column_heatmap
from .style import reset_style, use_style

__all__ = ["use_style", "reset_style", "distr_plot", "attr_distr_plot", "mult_distr_plot", "two_column_heatmap"]
__all__ = ["use_style", "reset_style", "distr_plot", "attr_distr_plot", "mult_distr_plot", "heatmap"]
49 changes: 49 additions & 0 deletions src/fairlens/plot/correlation.py
@@ -0,0 +1,49 @@
"""
Plot correlation heatmaps for datasets.
"""

from typing import Callable

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from ..metrics import correlation, unified


def heatmap(
df: pd.DataFrame,
num_num_metric: Callable[[pd.Series, pd.Series], float] = correlation.pearson,
cat_num_metric: Callable[[pd.Series, pd.Series], float] = correlation.kruskal_wallis,
cat_cat_metric: Callable[[pd.Series, pd.Series], float] = correlation.cramers_v,
**kwargs
Hilly12 marked this conversation as resolved.
Show resolved Hide resolved
):
"""This function creates a correlation heatmap out of a dataframe, using user provided or default correlation
metrics for all possible types of pairs of series (i.e. numerical-numerical, categorical-numerical,
categorical-categorical).

Args:
df (pd.DataFrame):
The dataframe used for computing correlations and producing a heatmap.
num_num_metric (Callable[[pd.Series, pd.Series], float], optional):
The correlation metric used for numerical-numerical series pairs. Defaults to Pearson's correlation
coefficient.
cat_num_metric (Callable[[pd.Series, pd.Series], float], optional):
The correlation metric used for categorical-numerical series pairs. Defaults to Kruskal-Wallis' H Test.
cat_cat_metric (Callable[[pd.Series, pd.Series], float], optional):
The correlation metric used for categorical-categorical series pairs. Defaults to corrected Cramer's V
statistic.
kwargs:
Key word arguments for sns.heatmap.
"""

corr_matrix = unified.correlation_matrix(df, num_num_metric, cat_num_metric, cat_cat_metric)

if "cmap" not in kwargs:
kwargs["cmap"] = sns.cubehelix_palette(start=0.2, rot=-0.2, dark=0.3, as_cmap=True)

if "linewidth" not in kwargs:
kwargs["linewidth"] = 0.5

sns.heatmap(corr_matrix, vmin=0, vmax=1, square=True, **kwargs)
plt.tight_layout()
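A possible usage sketch of the new function. The CSV path below is hypothetical; extra keyword arguments such as annot are forwarded to sns.heatmap:

    import matplotlib.pyplot as plt
    import pandas as pd

    import fairlens as fl

    df = pd.read_csv("datasets/compas.csv")  # hypothetical path
    fl.plot.heatmap(df, annot=True)          # annot is passed through to sns.heatmap
    plt.show()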