diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index 2e5cc5ef0..150135f40 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -53,6 +53,7 @@ def __init__( min_true_samples: int = 0, sample_ids: np.ndarray = None, pool: Pool = None, + column_index: int = None, options: StructuredOptions = None, ) -> None: """ @@ -69,6 +70,8 @@ def __init__( :type sample_ids: list(list) :param pool: pool utilized for multiprocessing :type pool: multiprocessing.Pool + :param column_index: index of the given column + :type column_index: int :param options: Options for the structured profiler. :type options: StructuredOptions Object """ @@ -100,10 +103,13 @@ def __init__( } if options: if options.null_values is not None: - self._null_values = options.null_values + self._null_values = options.null_values.copy() + if column_index is not None and options.column_null_values is not None: + self._null_values.update( + options.column_null_values.get(column_index, {}) + ) if df_series is not None and len(df_series) > 0: - if not sample_size: sample_size = self._get_sample_size(df_series) if sample_size < len(df_series): @@ -497,7 +503,7 @@ def clean_data_and_get_base_stats( :param null_values: Dictionary mapping null values to regex flag where the key represents the null value to remove from the data and the flag represents the regex flag to apply - :type null_values: dict[str, re.FLAG] + :type null_values: Dict[str, Union[re.RegexFlag, int]] :param min_true_samples: Minimum number of samples required for the profiler :type min_true_samples: int @@ -2418,7 +2424,10 @@ def _merge_null_replication_metrics(self, other: StructuredProfiler) -> Dict: return merged_properties def _update_profile_from_chunk( - self, data: pd.DataFrame, sample_size: int, min_true_samples: int = None + self, + data: Union[List, pd.Series, pd.DataFrame], + sample_size: int, + min_true_samples: int = None, ) -> None: """ Iterate over the columns of a dataset and identify its parameters. @@ -2497,6 +2506,7 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]: sample_size=sample_size, min_true_samples=min_true_samples, # type: ignore sample_ids=sample_ids, # type: ignore + column_index=col_idx, options=self.options, ) ) @@ -2536,7 +2546,12 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]: if min_true_samples is None: min_true_samples = self._profile[prof_idx]._min_true_samples try: - null_values = self._profile[prof_idx]._null_values + null_values: Dict = self._profile[prof_idx]._null_values.copy() + if self.options.column_null_values: + null_values.update( + self.options.column_null_values.get(col_idx, {}) + ) + multi_process_dict[col_idx] = pool.apply_async( self._profile[prof_idx].clean_data_and_get_base_stats, ( @@ -2576,7 +2591,13 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]: prof_idx = col_idx_to_prof_idx[col_idx] if min_true_samples is None: min_true_samples = self._profile[prof_idx]._min_true_samples - null_values = self._profile[prof_idx]._null_values + + null_values = self._profile[prof_idx]._null_values.copy() + if self.options.column_null_values: + null_values.update( + self.options.column_null_values.get(col_idx, {}) + ) + clean_sampled_dict[prof_idx], base_stats = self._profile[ prof_idx ].clean_data_and_get_base_stats( @@ -2594,7 +2615,11 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]: prof_idx = col_idx_to_prof_idx[col_idx] if min_true_samples is None: min_true_samples = self._profile[prof_idx]._min_true_samples - null_values = self._profile[prof_idx]._null_values + + null_values = self._profile[prof_idx]._null_values.copy() + if self.options.column_null_values: + null_values.update(self.options.column_null_values.get(col_idx, {})) + clean_sampled_dict[prof_idx], base_stats = self._profile[ prof_idx ].clean_data_and_get_base_stats( diff --git a/dataprofiler/profilers/profiler_options.py b/dataprofiler/profilers/profiler_options.py index 0690dc904..1f03f2ce8 100644 --- a/dataprofiler/profilers/profiler_options.py +++ b/dataprofiler/profilers/profiler_options.py @@ -1142,12 +1142,18 @@ def _validate_helper(self, variable_path: str = "TextProfilerOptions") -> List[s class StructuredOptions(BaseOption): """For configuring options for structured profiler.""" - def __init__(self, null_values: Dict = None) -> None: + def __init__( + self, + null_values: Dict[str, Union[re.RegexFlag, int]] = None, + column_null_values: Dict[int, Dict[str, Union[re.RegexFlag, int]]] = None, + ) -> None: """ Construct the StructuredOptions object with default values. :param null_values: null values we input. - :vartype null_values: Union[None, dict] + :vartype null_values: Dict[str, Union[re.RegexFlag, int]] + :param column_null_values: column level null values we input. + :vartype column_null_values: Dict[int, Dict[str, Union[re.RegexFlag, int]]] :ivar int: option set for int profiling. :vartype int: IntOptions :ivar float: option set for float profiling. @@ -1186,14 +1192,16 @@ def __init__(self, null_values: Dict = None) -> None: self.null_replication_metrics = BooleanOption(is_enabled=False) # Non-Option variables self.null_values = null_values + self.column_null_values = column_null_values @property def enabled_profiles(self) -> List[str]: """Return a list of the enabled profilers for columns.""" enabled_profiles = list() - # null_values does not have is_enabled + # null_values and column_null_values do not have is_enabled properties = self.properties properties.pop("null_values") + properties.pop("column_null_values") for key, value in properties.items(): if value.is_enabled: enabled_profiles.append(key) @@ -1230,6 +1238,7 @@ def _validate_helper(self, variable_path: str = "StructuredOptions") -> List[str ) properties = self.properties properties.pop("null_values") + properties.pop("column_null_values") for column in properties: if not isinstance(self.properties[column], prop_check[column]): errors.append( @@ -1258,6 +1267,26 @@ def _validate_helper(self, variable_path: str = "StructuredOptions") -> List[str "a re.RegexFlag".format(variable_path) ) + if self.column_null_values is not None and not ( + isinstance(self.column_null_values, dict) + and all( + isinstance(key, int) + and isinstance(value, dict) + and all( + isinstance(k, str) and (isinstance(v, re.RegexFlag) or v == 0) + for k, v in value.items() + ) + for key, value in self.column_null_values.items() + ) + ): + errors.append( + "{}.column_null_values must be either None or " + "a dictionary that contains keys of type int " + "that map to dictionaries that contains keys " + "of type str and values == 0 or are instances of " + "a re.RegexFlag".format(variable_path) + ) + if ( isinstance(self.category, CategoricalOptions) and isinstance(self.chi2_homogeneity, BooleanOption) diff --git a/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py b/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py index 12daff978..f9407dd68 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_profiler_options.py @@ -29,7 +29,7 @@ def test_default_profiler_options(self, *mocks): # TODO: remove the check for correlation option once it's updated to True if column == "correlation" or column == "null_replication_metrics": self.assertFalse(profile.options.properties[column].is_enabled) - elif column == "null_values": + elif column == "null_values" or column == "column_null_values": self.assertIsNone(profile.options.properties[column]) else: self.assertTrue(profile.options.properties[column].is_enabled) diff --git a/dataprofiler/tests/profilers/profiler_options/test_structured_options.py b/dataprofiler/tests/profilers/profiler_options/test_structured_options.py index 519818b79..7ac55ba12 100644 --- a/dataprofiler/tests/profilers/profiler_options/test_structured_options.py +++ b/dataprofiler/tests/profilers/profiler_options/test_structured_options.py @@ -9,7 +9,7 @@ class TestStructuredOptions(TestBaseOption): option_class = StructuredOptions - other_keys = ["null_values"] + other_keys = ["null_values", "column_null_values"] boolean_keys = [ "int", "float", @@ -83,22 +83,13 @@ def test_set(self): with self.assertRaisesRegex(AttributeError, expected_error): option.set({"{}.is_enabled".format(key): True}) - expected_error = ( - "{}.null_values must be either None or " - "a dictionary that contains keys of str type " - "and values == 0 or are instances of " - "a re.RegexFlag".format(optpth) - ) + for test_dict in ({"a": 0}, {"a": re.IGNORECASE}, None): + option.set({"null_values": test_dict}) + self.assertEqual(test_dict, option.null_values) - test_dict = {"a": 0} - option.set({"null_values": test_dict}) - self.assertEqual({"a": 0}, option.null_values) - test_dict = {"a": re.IGNORECASE} - option.set({"null_values": test_dict}) - self.assertEqual({"a": 2}, option.null_values) - test_dict = None - option.set({"null_values": test_dict}) - self.assertEqual(None, option.null_values) + for test_dict in ({0: {"a": 0}}, {0: {"a": re.IGNORECASE}}, None): + option.set({"column_null_values": test_dict}) + self.assertEqual(test_dict, option.column_null_values) def test_validate_helper(self): # Valid cases should return [] while invalid cases @@ -266,9 +257,39 @@ def test_validate(self): option.set({"null_values": None}) self.assertEqual([], option._validate_helper()) + expected_error = [ + "{}.column_null_values must be either None or " + "a dictionary that contains keys of type int " + "that map to dictionaries that contains keys " + "of type str and values == 0 or are instances of " + "a re.RegexFlag".format(optpth) + ] + # Test column key is not an int + option.set({"column_null_values": {"a": {"a": 0}}}) + self.assertEqual(expected_error, option._validate_helper()) + # Test key is not a str + option.set({"column_null_values": {0: {0: 0}}}) + self.assertEqual(expected_error, option._validate_helper()) + # Test value is not correct type (0 or regex) + option.set({"column_null_values": {0: {"a": 1}}}) + self.assertEqual(expected_error, option._validate_helper()) + # Test variable is not correct variable type + option.set({"column_null_values": 1}) + self.assertEqual(expected_error, option._validate_helper()) + # Test 0 works for option set + option.set({"column_null_values": {0: {"a": 0}}}) + self.assertEqual([], option._validate_helper()) + # Test a regex flag works for option set + option.set({"column_null_values": {0: {"a": re.IGNORECASE}}}) + self.assertEqual([], option._validate_helper()) + # Test None works for option set + option.set({"column_null_values": None}) + self.assertEqual([], option._validate_helper()) + def test_enabled_profilers(self): options = self.get_options() self.assertNotIn("null_values", options.enabled_profiles) + self.assertNotIn("column_null_values", options.enabled_profiles) # All Columns Enabled for key in self.boolean_keys: diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index bea38380e..c90ef3eb2 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py @@ -2081,6 +2081,41 @@ def test_null_replication_metrics_calculation(self): np.testing.assert_array_almost_equal([[np.nan], [18]], column["class_sum"]) np.testing.assert_array_almost_equal([[np.nan], [9]], column["class_mean"]) + def test_column_level_invalid_values(self): + data = pd.DataFrame([[1, 1], [9999999, 2], [3, 3]]) + + NO_FLAG = 0 + profile_options = dp.ProfilerOptions() + profile_options.set( + { + "*.null_values": { + "": NO_FLAG, + "nan": re.IGNORECASE, + "none": re.IGNORECASE, + "null": re.IGNORECASE, + " *": NO_FLAG, + "--*": NO_FLAG, + "__*": NO_FLAG, + "9" * 7: NO_FLAG, + }, + "*.column_null_values": { + 0: {"1": NO_FLAG}, + 1: {"3": NO_FLAG}, + }, + "*.null_replication_metrics.is_enabled": True, + "data_labeler.is_enabled": False, + "multiprocess.is_enabled": False, + } + ) + + profiler = dp.StructuredProfiler(data, options=profile_options) + report = profiler.report() + + np.testing.assert_array_equal(["3"], report["data_stats"][0]["samples"]) + np.testing.assert_array_equal( + ["1", "2"], sorted(report["data_stats"][1]["samples"]) + ) + class TestStructuredColProfilerClass(unittest.TestCase): def setUp(self):