From 72e7dad1c0ae4437bd25eddf8e76120fdc5f0aa2 Mon Sep 17 00:00:00 2001
From: Marjan-emd <100327880+Marjan-emd@users.noreply.github.com>
Date: Fri, 3 May 2024 14:25:01 -0700
Subject: [PATCH] Replace NaNs with a NaN proxy in Correlation Calculation

Co-authored-by: marjan_emd <marjan_emd>
GitOrigin-RevId: d7e8ff40250d5e205456476a2fb72f48c69db7a1
---
 src/gretel_synthetics/utils/stats.py | 45 +++++++++++++---------------
 1 file changed, 20 insertions(+), 25 deletions(-)

diff --git a/src/gretel_synthetics/utils/stats.py b/src/gretel_synthetics/utils/stats.py
index 2014e40..5749577 100644
--- a/src/gretel_synthetics/utils/stats.py
+++ b/src/gretel_synthetics/utils/stats.py
@@ -3,6 +3,7 @@
 """
 
 import math
+import uuid
 import warnings
 
 from typing import List, Tuple
@@ -320,6 +321,10 @@ def calculate_correlation(
     if nominal_columns is None:
         nominal_columns = list()
 
+    df_cp = df.copy()
+    # Replace NaNs with a NaN proxy in nominal columns only. This keeps FCS more consistent regardless of the number of generated rows.
+    df_cp[nominal_columns] = df_cp[nominal_columns].fillna(f"gretel-{uuid.uuid4().hex}")
+
     corr = np.zeros((len(columns), len(columns)))
     single_value_columns = []
     numeric_columns = []
@@ -328,25 +333,28 @@ def calculate_correlation(
 
     # Set up all the column groupings needed for correlation
     for i, c in enumerate(columns):
-        if df[c].nunique() == 1:
+        if df_cp[c].nunique() == 1:
             single_value_columns.append(c)
         elif c not in nominal_columns:
-            if df[c].dtype == "object":
+            if df_cp[c].dtype == "object":
                 nominal_columns.append(c)
             else:
                 numeric_columns.append(c)
 
         column_to_index[c] = i
 
+    # Replace NaNs with a NaN proxy once more, since nominal_columns may have been extended in the loop above.
+    df_cp[nominal_columns] = df_cp[nominal_columns].fillna(f"gretel-{uuid.uuid4().hex}")
     nominal = [x for x in nominal_columns if x not in single_value_columns]
-    df_rows = df.shape[0]
+    df_rows = df_cp.shape[0]
     high_unique_nominal = []
     completely_unique_nominal = []
     not_high_unique_nominal = []
+    uniqueness_ratio = df_cp[c].nunique() / df_rows
     for c in nominal:
-        if df[c].nunique() / df_rows == 1:
+        if uniqueness_ratio == 1:
             completely_unique_nominal.append(c)
-        elif df[c].nunique() / df_rows > UNIQUENESS_THRESHOLD:
+        elif uniqueness_ratio > UNIQUENESS_THRESHOLD:
             high_unique_nominal.append(c)
         else:
             not_high_unique_nominal.append(c)
@@ -365,11 +373,11 @@ def calculate_correlation(
             if x == y:
                 corr[y_index][x_index] = 1.0
             # Edge case, guard against ValueError in math.log when the other column is empty
-            elif df[y].nunique() == 0:
+            elif df_cp[y].nunique() == 0:
                 corr[y_index][x_index] = 0.0
             else:
-                corr[y_index][x_index] = math.log(df[y].nunique()) / math.log(
-                    df[x].nunique()
+                corr[y_index][x_index] = math.log(df_cp[y].nunique()) / math.log(
+                    df_cp[x].nunique()
                 )
 
     for x in single_value_columns:
@@ -380,7 +388,7 @@ def calculate_correlation(
 
     # Do nominal-nominal exluding any that are 100% unique (Theil's U)
     scores = Parallel(n_jobs=job_count)(
-        delayed(calculate_theils_u)(df[field1], df[field2])
+        delayed(calculate_theils_u)(df_cp[field1], df_cp[field2])
         for field1 in notcompletely_unique_nominal
         for field2 in notcompletely_unique_nominal
     )
@@ -398,7 +406,7 @@ def calculate_correlation(
 
     # Do "not_high_unique_nominal with numeric" (Correlation Ratio)
     scores = Parallel(n_jobs=job_count)(
-        delayed(calculate_correlation_ratio)(df[field1], df[field2], opt)
+        delayed(calculate_correlation_ratio)(df_cp[field1], df_cp[field2], opt)
         for field1 in not_high_unique_nominal
         for field2 in numeric_columns
     )
@@ -418,7 +426,7 @@ def calculate_correlation(
     # comparing the mean within buckets to the mean overall to give unstable, over inflated
     # correlation values.  Using Theil's U instead gives a much more realistic score
     scores = Parallel(n_jobs=job_count)(
-        delayed(calculate_theils_u)(df[field1], df[field2])
+        delayed(calculate_theils_u)(df_cp[field1], df_cp[field2])
         for field1 in high_unique_nominal
         for field2 in numeric_columns
     )
@@ -430,19 +438,6 @@ def calculate_correlation(
             corr[field2_index][field1_index] = scores[i]
             i += 1
 
-    scores = Parallel(n_jobs=job_count)(
-        delayed(calculate_theils_u)(df[field1], df[field2])
-        for field2 in high_unique_nominal
-        for field1 in numeric_columns
-    )
-    i = 0
-    for field2 in high_unique_nominal:
-        field2_index = column_to_index[field2]
-        for field1 in numeric_columns:
-            field1_index = column_to_index[field1]
-            corr[field2_index][field1_index] = scores[i]
-            i += 1
-
     # Do numeric numeric (Pearson's)
     num_len = len(numeric_columns)
     if num_len > 1:
@@ -451,7 +446,7 @@ def calculate_correlation(
             for j in range(i + 1, num_len):
                 delayed_calls.append(
                     delayed(calculate_pearsons_r)(
-                        df[numeric_columns[i]], df[numeric_columns[j]], opt
+                        df_cp[numeric_columns[i]], df_cp[numeric_columns[j]], opt
                     )
                 )
         scores = Parallel(n_jobs=1)(delayed_calls)