Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PSI fixing unit tests #712

Merged
merged 2 commits into from
Nov 10, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 5 additions & 7 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -563,12 +563,10 @@ def _preprocess_for_calculate_psi(
new_other_histogram["bin_edges"] = other_histogram["bin_edges"]
new_other_histogram["bin_counts"] = other_histogram["bin_counts"]

len_self_bin_counts = 0
if len(self_histogram["bin_counts"]) > 0:
len_self_bin_counts = len(self_histogram["bin_counts"])
len_self_bin_counts = len(self_histogram["bin_counts"])

# re-calculate `self` histogram if its number of bins differs from `num_psi_bins`
if not len_self_bin_counts == num_psi_bins:
if len_self_bin_counts != num_psi_bins:
histogram, hist_loss = self._regenerate_histogram(
bin_counts=self_histogram["bin_counts"],
bin_edges=self_histogram["bin_edges"],
Expand All @@ -583,9 +581,9 @@ def _preprocess_for_calculate_psi(

# re-calculate `other_profile` histogram
histogram_edges_not_equal = False
all_array_values_equal = (
other_histogram["bin_edges"] == self_histogram["bin_edges"]
).all()
all_array_values_equal = np.array_equal(
other_histogram["bin_edges"], self_histogram["bin_edges"]
)
if not all_array_values_equal:
histogram_edges_not_equal = True

Expand Down
103 changes: 103 additions & 0 deletions dataprofiler/tests/profilers/test_numeric_stats_mixin_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -996,3 +996,106 @@ def test_diff(self):
str(exc.exception),
"Unsupported operand type(s) for diff: 'TestColumnWProps' and" " 'str'",
)

# PSI same distribution test: identical bin edges and proportional bin counts (x10), so PSI should be 0
other1, other2 = TestColumnWProps(), TestColumnWProps()
other1.match_count = 55
other1._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {
"bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
"bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
},
}

other2.match_count = 550
other2._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {
"bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]) * 10,
"bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
},
}

expected_psi_value = 0
psi_value = other1._calculate_psi(
self_match_count=other1.match_count,
self_histogram=other1._stored_histogram["histogram"],
other_match_count=other2.match_count,
other_histogram=other2._stored_histogram["histogram"],
)
self.assertEquals(expected_psi_value, psi_value)

# PSI when min_min_edge == max_max_edge: both histograms have a single bin with edges [1, 1]; expected PSI is 0
other1, other2 = TestColumnWProps(), TestColumnWProps()
other1.match_count = 10
other1._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {"bin_counts": np.array([10]), "bin_edges": np.array([1, 1])},
}

other2.match_count = 20
other2._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {"bin_counts": np.array([20]), "bin_edges": np.array([1, 1])},
}

expected_psi_value = 0
psi_value = other1._calculate_psi(
self_match_count=other1.match_count,
self_histogram=other1._stored_histogram["histogram"],
other_match_count=other2.match_count,
other_histogram=other2._stored_histogram["histogram"],
)
self.assertEquals(expected_psi_value, psi_value)
JGSweets marked this conversation as resolved.
Show resolved Hide resolved

# PSI regen other / not self: `self` has 10 bins; `other` has 3 bins with different edges
other1, other2 = TestColumnWProps(), TestColumnWProps()
other1.match_count = 55
other1._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {
"bin_counts": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]),
"bin_edges": np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11]),
},
}

other2.match_count = 20
other2._stored_histogram = {
"total_loss": 0,
"current_loss": 0,
"suggested_bin_count": 10,
"histogram": {
"bin_counts": np.array([5, 5, 10]),
"bin_edges": np.array([1, 3, 5, 7]),
},
}

expected_psi_value = 0.6617899380349177
psi_value = other1._calculate_psi(
self_match_count=other1.match_count,
self_histogram=other1._stored_histogram["histogram"],
other_match_count=other2.match_count,
other_histogram=other2._stored_histogram["histogram"],
)
self.assertEquals(expected_psi_value, psi_value)

# PSI regen self / not other: same two histograms as above with the self/other arguments swapped
expected_psi_value = 0.6617899380349177
psi_value = other1._calculate_psi(
self_match_count=other2.match_count,
self_histogram=other2._stored_histogram["histogram"],
other_match_count=other1.match_count,
other_histogram=other1._stored_histogram["histogram"],
)
self.assertEquals(expected_psi_value, psi_value)
34 changes: 0 additions & 34 deletions dataprofiler/tests/profilers/test_text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -608,37 +608,3 @@ def test_diff(self):
places=2,
)
self.assertDictEqual(expected_diff, profile_diff)

# re-create `diamond.csv` categorical column
df = pd.Series(["D", "I", "F", "H", "G"]).apply(str)

df2 = pd.Series(["D", "I", "F", "H", "G"]).apply(str)

profiler1 = TextColumn(df.name)
profiler1.update(df)
profile1 = profiler1.profile

profiler2 = TextColumn(df2.name)
profiler2.update(df2)
profile2 = profiler2.profile

expected_diff = {
"min": "unchanged",
"max": "unchanged",
"sum": "unchanged",
"mean": "unchanged",
"median": "unchanged",
"mode": "unchanged",
"median_absolute_deviation": "unchanged",
"variance": "unchanged",
"stddev": "unchanged",
"t-test": {
"t-statistic": None,
"conservative": {"df": None, "p-value": None},
"welch": {"df": None, "p-value": None},
},
"vocab": "unchanged",
}

profile_diff = profiler1.diff(profiler2)
self.assertDictEqual(expected_diff, profile_diff)