diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py
index 09cc89684..0c83abaca 100644
--- a/dataprofiler/profilers/numerical_column_stats.py
+++ b/dataprofiler/profilers/numerical_column_stats.py
@@ -563,12 +563,10 @@ def _preprocess_for_calculate_psi(
         new_other_histogram["bin_edges"] = other_histogram["bin_edges"]
         new_other_histogram["bin_counts"] = other_histogram["bin_counts"]
 
-        len_self_bin_counts = 0
-        if len(self_histogram["bin_counts"]) > 0:
-            len_self_bin_counts = len(self_histogram["bin_counts"])
+        len_self_bin_counts = len(self_histogram["bin_counts"])
 
         # re-calculate `self` histogram
-        if not len_self_bin_counts == num_psi_bins:
+        if len_self_bin_counts != num_psi_bins:
             histogram, hist_loss = self._regenerate_histogram(
                 bin_counts=self_histogram["bin_counts"],
                 bin_edges=self_histogram["bin_edges"],
@@ -583,9 +581,9 @@ def _preprocess_for_calculate_psi(
 
         # re-calculate `other_profile` histogram
         histogram_edges_not_equal = False
-        all_array_values_equal = (
-            other_histogram["bin_edges"] == self_histogram["bin_edges"]
-        ).all()
+        all_array_values_equal = np.array_equal(
+            other_histogram["bin_edges"], self_histogram["bin_edges"]
+        )
         if not all_array_values_equal:
             histogram_edges_not_equal = True
 
diff --git a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py
index 9130af447..eb93c577e 100644
--- a/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py
+++ b/dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py
@@ -996,3 +996,106 @@ def test_diff(self):
             str(exc.exception),
             "Unsupported operand type(s) for diff: 'TestColumnWProps' and" " 'str'",
         )
+
+        # PSI: same distribution (other2 counts are other1 counts * 10) -> 0
+        other1, other2 = TestColumnWProps(), TestColumnWProps()
+        other1.match_count = 55
+        other1._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {
+                "bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
+                "bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+            },
+        }
+
+        other2.match_count = 550
+        other2._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {
+                "bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) * 10,
+                "bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+            },
+        }
+
+        expected_psi_value = 0
+        psi_value = other1._calculate_psi(
+            self_match_count=other1.match_count,
+            self_histogram=other1._stored_histogram["histogram"],
+            other_match_count=other2.match_count,
+            other_histogram=other2._stored_histogram["histogram"],
+        )
+        self.assertEqual(expected_psi_value, psi_value)
+
+        # PSI: single-bin histograms where min_min_edge == max_max_edge -> 0
+        other1, other2 = TestColumnWProps(), TestColumnWProps()
+        other1.match_count = 10
+        other1._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {"bin_counts": np.array([10]), "bin_edges": np.array([1, 1])},
+        }
+
+        other2.match_count = 20
+        other2._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {"bin_counts": np.array([20]), "bin_edges": np.array([1, 1])},
+        }
+
+        expected_psi_value = 0
+        psi_value = other1._calculate_psi(
+            self_match_count=other1.match_count,
+            self_histogram=other1._stored_histogram["histogram"],
+            other_match_count=other2.match_count,
+            other_histogram=other2._stored_histogram["histogram"],
+        )
+        self.assertEqual(expected_psi_value, psi_value)
+
+        # PSI: regenerate the `other` histogram, not `self`
+        other1, other2 = TestColumnWProps(), TestColumnWProps()
+        other1.match_count = 55
+        other1._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {
+                "bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
+                "bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
+            },
+        }
+
+        other2.match_count = 20
+        other2._stored_histogram = {
+            "total_loss": 0,
+            "current_loss": 0,
+            "suggested_bin_count": 10,
+            "histogram": {
+                "bin_counts": np.array([5, 5, 10]),
+                "bin_edges": np.array([1, 3, 5, 7]),
+            },
+        }
+
+        expected_psi_value = 0.6617899380349177
+        psi_value = other1._calculate_psi(
+            self_match_count=other1.match_count,
+            self_histogram=other1._stored_histogram["histogram"],
+            other_match_count=other2.match_count,
+            other_histogram=other2._stored_histogram["histogram"],
+        )
+        self.assertAlmostEqual(expected_psi_value, psi_value)
+
+        # PSI: regenerate the `self` histogram, not `other` (arguments swapped)
+        expected_psi_value = 0.6617899380349177
+        psi_value = other1._calculate_psi(
+            self_match_count=other2.match_count,
+            self_histogram=other2._stored_histogram["histogram"],
+            other_match_count=other1.match_count,
+            other_histogram=other1._stored_histogram["histogram"],
+        )
+        self.assertAlmostEqual(expected_psi_value, psi_value)
diff --git a/dataprofiler/tests/profilers/test_text_column_profile.py b/dataprofiler/tests/profilers/test_text_column_profile.py
index a085837eb..da096da4b 100644
--- a/dataprofiler/tests/profilers/test_text_column_profile.py
+++ b/dataprofiler/tests/profilers/test_text_column_profile.py
@@ -608,37 +608,3 @@ def test_diff(self):
             places=2,
         )
         self.assertDictEqual(expected_diff, profile_diff)
-
-        # re-create `diamond.csv` categorical column
-        df = pd.Series(["D", "I", "F", "H", "G"]).apply(str)
-
-        df2 = pd.Series(["D", "I", "F", "H", "G"]).apply(str)
-
-        profiler1 = TextColumn(df.name)
-        profiler1.update(df)
-        profile1 = profiler1.profile
-
-        profiler2 = TextColumn(df2.name)
-        profiler2.update(df2)
-        profile2 = profiler2.profile
-
-        expected_diff = {
-            "min": "unchanged",
-            "max": "unchanged",
-            "sum": "unchanged",
-            "mean": "unchanged",
-            "median": "unchanged",
-            "mode": "unchanged",
-            "median_absolute_deviation": "unchanged",
-            "variance": "unchanged",
-            "stddev": "unchanged",
-            "t-test": {
-                "t-statistic": None,
-                "conservative": {"df": None, "p-value": None},
-                "welch": {"df": None, "p-value": None},
-            },
-            "vocab": "unchanged",
-        }
-
-        profile_diff = profiler1.diff(profiler2)
-        self.assertDictEqual(expected_diff, profile_diff)