Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add profiler option for column level invalid values #704

Merged
merged 8 commits into from
Nov 4, 2022
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
33 changes: 29 additions & 4 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(
min_true_samples: int = 0,
sample_ids: np.ndarray = None,
pool: Pool = None,
column_index: int = None,
options: StructuredOptions = None,
) -> None:
"""
Expand All @@ -69,6 +70,8 @@ def __init__(
:type sample_ids: list(list)
:param pool: pool utilized for multiprocessing
:type pool: multiprocessing.Pool
:param column_index: index of the given column
:type column_index: int
:param options: Options for the structured profiler.
:type options: StructuredOptions Object
"""
Expand Down Expand Up @@ -100,10 +103,13 @@ def __init__(
}
if options:
if options.null_values is not None:
self._null_values = options.null_values
self._null_values = options.null_values.copy()
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added copy

if column_index is not None and options.column_null_values is not None:
self._null_values.update(
options.column_null_values.get(column_index, {})
)

if df_series is not None and len(df_series) > 0:

if not sample_size:
sample_size = self._get_sample_size(df_series)
if sample_size < len(df_series):
Expand Down Expand Up @@ -497,7 +503,7 @@ def clean_data_and_get_base_stats(
:param null_values: Dictionary mapping null values to regex flag where
the key represents the null value to remove from the data and the
flag represents the regex flag to apply
:type null_values: dict[str, re.FLAG]
:type null_values: Dict[str, Union[re.RegexFlag, int]]
:param min_true_samples: Minimum number of samples required for the
profiler
:type min_true_samples: int
Expand Down Expand Up @@ -2393,7 +2399,10 @@ def _merge_null_replication_metrics(self, other: StructuredProfiler) -> Dict:
return merged_properties

def _update_profile_from_chunk(
self, data: pd.DataFrame, sample_size: int, min_true_samples: int = None
self,
data: Union[List, pd.Series, pd.DataFrame],
sample_size: int,
min_true_samples: int = None,
) -> None:
"""
Iterate over the columns of a dataset and identify its parameters.
Expand Down Expand Up @@ -2472,6 +2481,7 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]:
sample_size=sample_size,
min_true_samples=min_true_samples, # type: ignore
sample_ids=sample_ids, # type: ignore
column_index=col_idx,
options=self.options,
)
)
Expand Down Expand Up @@ -2512,6 +2522,11 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]:
min_true_samples = self._profile[prof_idx]._min_true_samples
try:
null_values = self._profile[prof_idx]._null_values
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a huge add and most LGTM, one bug here though: because a dict is mutable, this will change self._null_values with the update. If we instead copy prior to assigning to a variable, that would alleviate the issue. Great job though!

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just need to fix in the locations where we update

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added .copy()

if self.options.column_null_values:
null_values.update(
self.options.column_null_values.get(col_idx, {})
)

multi_process_dict[col_idx] = pool.apply_async(
self._profile[prof_idx].clean_data_and_get_base_stats,
(
Expand Down Expand Up @@ -2551,7 +2566,13 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]:
prof_idx = col_idx_to_prof_idx[col_idx]
if min_true_samples is None:
min_true_samples = self._profile[prof_idx]._min_true_samples

null_values = self._profile[prof_idx]._null_values
if self.options.column_null_values:
null_values.update(
self.options.column_null_values.get(col_idx, {})
)

clean_sampled_dict[prof_idx], base_stats = self._profile[
prof_idx
].clean_data_and_get_base_stats(
Expand All @@ -2569,7 +2590,11 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]:
prof_idx = col_idx_to_prof_idx[col_idx]
if min_true_samples is None:
min_true_samples = self._profile[prof_idx]._min_true_samples

null_values = self._profile[prof_idx]._null_values
if self.options.column_null_values:
null_values.update(self.options.column_null_values.get(col_idx, {}))

clean_sampled_dict[prof_idx], base_stats = self._profile[
prof_idx
].clean_data_and_get_base_stats(
Expand Down
35 changes: 32 additions & 3 deletions dataprofiler/profilers/profiler_options.py
Original file line number Diff line number Diff line change
Expand Up @@ -1142,12 +1142,18 @@ def _validate_helper(self, variable_path: str = "TextProfilerOptions") -> List[s
class StructuredOptions(BaseOption):
"""For configuring options for structured profiler."""

def __init__(self, null_values: Dict = None) -> None:
def __init__(
self,
null_values: Dict[str, Union[re.RegexFlag, int]] = None,
column_null_values: Dict[int, Dict[str, Union[re.RegexFlag, int]]] = None,
) -> None:
"""
Construct the StructuredOptions object with default values.

:param null_values: null values we input.
:vartype null_values: Union[None, dict]
:vartype null_values: Dict[str, Union[re.RegexFlag, int]]
:param column_null_values: column level null values we input.
:vartype column_null_values: Dict[int, Dict[str, Union[re.RegexFlag, int]]]
:ivar int: option set for int profiling.
:vartype int: IntOptions
:ivar float: option set for float profiling.
Expand Down Expand Up @@ -1186,14 +1192,16 @@ def __init__(self, null_values: Dict = None) -> None:
self.null_replication_metrics = BooleanOption(is_enabled=False)
# Non-Option variables
self.null_values = null_values
self.column_null_values = column_null_values

@property
def enabled_profiles(self) -> List[str]:
"""Return a list of the enabled profilers for columns."""
enabled_profiles = list()
# null_values does not have is_enabled
# null_values and column_null_values do not have is_enabled
properties = self.properties
properties.pop("null_values")
properties.pop("column_null_values")
for key, value in properties.items():
if value.is_enabled:
enabled_profiles.append(key)
Expand Down Expand Up @@ -1230,6 +1238,7 @@ def _validate_helper(self, variable_path: str = "StructuredOptions") -> List[str
)
properties = self.properties
properties.pop("null_values")
properties.pop("column_null_values")
for column in properties:
if not isinstance(self.properties[column], prop_check[column]):
errors.append(
Expand Down Expand Up @@ -1258,6 +1267,26 @@ def _validate_helper(self, variable_path: str = "StructuredOptions") -> List[str
"a re.RegexFlag".format(variable_path)
)

if self.column_null_values is not None and not (
isinstance(self.column_null_values, dict)
and all(
isinstance(key, int)
and isinstance(value, dict)
and all(
isinstance(k, str) and (isinstance(v, re.RegexFlag) or v == 0)
for k, v in value.items()
)
for key, value in self.column_null_values.items()
)
):
errors.append(
"{}.column_null_values must be either None or "
"a dictionary that contains keys of type int "
"that map to dictionaries that contains keys "
"of type str and values == 0 or are instances of "
"a re.RegexFlag".format(variable_path)
)

if (
isinstance(self.category, CategoricalOptions)
and isinstance(self.chi2_homogeneity, BooleanOption)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ def test_default_profiler_options(self, *mocks):
# TODO: remove the check for correlation option once it's updated to True
if column == "correlation" or column == "null_replication_metrics":
self.assertFalse(profile.options.properties[column].is_enabled)
elif column == "null_values":
elif column == "null_values" or column == "column_null_values":
self.assertIsNone(profile.options.properties[column])
else:
self.assertTrue(profile.options.properties[column].is_enabled)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
class TestStructuredOptions(TestBaseOption):

option_class = StructuredOptions
other_keys = ["null_values"]
other_keys = ["null_values", "column_null_values"]
boolean_keys = [
"int",
"float",
Expand Down Expand Up @@ -83,22 +83,13 @@ def test_set(self):
with self.assertRaisesRegex(AttributeError, expected_error):
option.set({"{}.is_enabled".format(key): True})

expected_error = (
"{}.null_values must be either None or "
"a dictionary that contains keys of str type "
"and values == 0 or are instances of "
"a re.RegexFlag".format(optpth)
)
for test_dict in ({"a": 0}, {"a": re.IGNORECASE}, None):
option.set({"null_values": test_dict})
self.assertEqual(test_dict, option.null_values)

test_dict = {"a": 0}
option.set({"null_values": test_dict})
self.assertEqual({"a": 0}, option.null_values)
test_dict = {"a": re.IGNORECASE}
option.set({"null_values": test_dict})
self.assertEqual({"a": 2}, option.null_values)
test_dict = None
option.set({"null_values": test_dict})
self.assertEqual(None, option.null_values)
for test_dict in ({0: {"a": 0}}, {0: {"a": re.IGNORECASE}}, None):
option.set({"column_null_values": test_dict})
self.assertEqual(test_dict, option.column_null_values)

def test_validate_helper(self):
# Valid cases should return [] while invalid cases
Expand Down Expand Up @@ -266,9 +257,39 @@ def test_validate(self):
option.set({"null_values": None})
self.assertEqual([], option._validate_helper())

expected_error = [
"{}.column_null_values must be either None or "
"a dictionary that contains keys of type int "
"that map to dictionaries that contains keys "
"of type str and values == 0 or are instances of "
"a re.RegexFlag".format(optpth)
]
# Test column key is not an int
option.set({"column_null_values": {"a": {"a": 0}}})
self.assertEqual(expected_error, option._validate_helper())
# Test key is not a str
option.set({"column_null_values": {0: {0: 0}}})
self.assertEqual(expected_error, option._validate_helper())
# Test value is not correct type (0 or regex)
option.set({"column_null_values": {0: {"a": 1}}})
self.assertEqual(expected_error, option._validate_helper())
# Test variable is not correct variable type
option.set({"column_null_values": 1})
self.assertEqual(expected_error, option._validate_helper())
# Test 0 works for option set
option.set({"column_null_values": {0: {"a": 0}}})
self.assertEqual([], option._validate_helper())
# Test a regex flag works for option set
option.set({"column_null_values": {0: {"a": re.IGNORECASE}}})
self.assertEqual([], option._validate_helper())
# Test None works for option set
option.set({"column_null_values": None})
self.assertEqual([], option._validate_helper())

def test_enabled_profilers(self):
options = self.get_options()
self.assertNotIn("null_values", options.enabled_profiles)
self.assertNotIn("column_null_values", options.enabled_profiles)

# All Columns Enabled
for key in self.boolean_keys:
Expand Down
35 changes: 35 additions & 0 deletions dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2048,6 +2048,41 @@ def test_null_replication_metrics_calculation(self):
np.testing.assert_array_almost_equal([17 / 8, 48 / 8], column["class_mean"][0])
np.testing.assert_array_almost_equal([12 / 2, 6 / 2], column["class_mean"][1])

def test_column_level_invalid_values(self):
    """Check that column_null_values layers per-column null values on top
    of the global null_values option when profiling a DataFrame."""
    data = pd.DataFrame([[1, 1], [9999999, 2], [3, 3]])

    no_flag = 0
    option_overrides = {
        # Global null values applied to every column.
        "*.null_values": {
            "": no_flag,
            "nan": re.IGNORECASE,
            "none": re.IGNORECASE,
            "null": re.IGNORECASE,
            " *": no_flag,
            "--*": no_flag,
            "__*": no_flag,
            "9999999": no_flag,
        },
        # Column-specific null values: drop "1" in column 0, "3" in column 1.
        "*.column_null_values": {
            0: {"1": no_flag},
            1: {"3": no_flag},
        },
        "*.null_replication_metrics.is_enabled": True,
        "data_labeler.is_enabled": False,
        "multiprocess.is_enabled": False,
    }
    profile_options = dp.ProfilerOptions()
    profile_options.set(option_overrides)

    profiler = dp.StructuredProfiler(data, options=profile_options)
    report = profiler.report()

    # Column 0: "1" removed by the column option, "9999999" by the global one.
    np.testing.assert_array_equal(["3"], report["data_stats"][0]["samples"])
    # Column 1: only "3" is removed, via the column-level option.
    np.testing.assert_array_equal(
        ["1", "2"], sorted(report["data_stats"][1]["samples"])
    )


class TestStructuredColProfilerClass(unittest.TestCase):
def setUp(self):
Expand Down