
Add profiler option for column level invalid values #704

Merged: 8 commits, Nov 4, 2022
39 changes: 32 additions & 7 deletions dataprofiler/profilers/profile_builder.py
@@ -53,6 +53,7 @@ def __init__(
min_true_samples: int = 0,
sample_ids: np.ndarray = None,
pool: Pool = None,
column_index: int = None,
options: StructuredOptions = None,
) -> None:
"""
@@ -69,6 +70,8 @@ def __init__(
:type sample_ids: list(list)
:param pool: pool utilized for multiprocessing
:type pool: multiprocessing.Pool
:param column_index: index of the given column
:type column_index: int
:param options: Options for the structured profiler.
:type options: StructuredOptions Object
"""
@@ -100,10 +103,13 @@ def __init__(
}
if options:
if options.null_values is not None:
self._null_values = options.null_values
self._null_values = options.null_values.copy()
Contributor Author: added copy

if column_index is not None and options.column_null_values is not None:
self._null_values.update(
options.column_null_values.get(column_index, {})
)

if df_series is not None and len(df_series) > 0:

if not sample_size:
sample_size = self._get_sample_size(df_series)
if sample_size < len(df_series):
@@ -497,7 +503,7 @@ def clean_data_and_get_base_stats(
:param null_values: Dictionary mapping null values to regex flag where
the key represents the null value to remove from the data and the
flag represents the regex flag to apply
:type null_values: dict[str, re.FLAG]
:type null_values: Dict[str, Union[re.RegexFlag, int]]
:param min_true_samples: Minimum number of samples required for the
profiler
:type min_true_samples: int
@@ -2418,7 +2424,10 @@ def _merge_null_replication_metrics(self, other: StructuredProfiler) -> Dict:
return merged_properties

def _update_profile_from_chunk(
self, data: pd.DataFrame, sample_size: int, min_true_samples: int = None
self,
data: Union[List, pd.Series, pd.DataFrame],
sample_size: int,
min_true_samples: int = None,
) -> None:
"""
Iterate over the columns of a dataset and identify its parameters.
@@ -2497,6 +2506,7 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]:
sample_size=sample_size,
min_true_samples=min_true_samples, # type: ignore
sample_ids=sample_ids, # type: ignore
column_index=col_idx,
options=self.options,
)
)
@@ -2536,7 +2546,12 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]:
if min_true_samples is None:
min_true_samples = self._profile[prof_idx]._min_true_samples
try:
null_values = self._profile[prof_idx]._null_values
null_values: Dict = self._profile[prof_idx]._null_values.copy()
Contributor Author: here

if self.options.column_null_values:
null_values.update(
self.options.column_null_values.get(col_idx, {})
)

multi_process_dict[col_idx] = pool.apply_async(
self._profile[prof_idx].clean_data_and_get_base_stats,
(
@@ -2576,7 +2591,13 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]:
prof_idx = col_idx_to_prof_idx[col_idx]
if min_true_samples is None:
min_true_samples = self._profile[prof_idx]._min_true_samples
null_values = self._profile[prof_idx]._null_values

null_values = self._profile[prof_idx]._null_values.copy()
Contributor Author: here

if self.options.column_null_values:
null_values.update(
self.options.column_null_values.get(col_idx, {})
)

clean_sampled_dict[prof_idx], base_stats = self._profile[
prof_idx
].clean_data_and_get_base_stats(
@@ -2594,7 +2615,11 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]:
prof_idx = col_idx_to_prof_idx[col_idx]
if min_true_samples is None:
min_true_samples = self._profile[prof_idx]._min_true_samples
null_values = self._profile[prof_idx]._null_values

null_values = self._profile[prof_idx]._null_values.copy()
Contributor Author: here

if self.options.column_null_values:
null_values.update(self.options.column_null_values.get(col_idx, {}))

clean_sampled_dict[prof_idx], base_stats = self._profile[
prof_idx
].clean_data_and_get_base_stats(
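For orientation: the effective null set for a column is the profiler-wide null_values dict, copied and then updated with that column's entry from column_null_values. A minimal standalone sketch of that merge logic (the helper name below is illustrative, not part of the dataprofiler API):

import re
from typing import Dict, Union

NullMap = Dict[str, Union[re.RegexFlag, int]]

def effective_null_values(
    null_values: NullMap,
    column_null_values: Dict[int, NullMap],
    column_index: int,
) -> NullMap:
    # Copy first so the shared, profiler-wide dict is never mutated in place;
    # the .copy() calls added in this PR guard against the same issue.
    merged = dict(null_values or {})
    # Column-level entries extend, and can override, the global entries.
    merged.update((column_null_values or {}).get(column_index, {}))
    return merged

# Column 0 additionally treats "-1" as null; other columns keep only the global set.
effective_null_values({"": 0, "nan": re.IGNORECASE}, {0: {"-1": 0}}, 0)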
35 changes: 32 additions & 3 deletions dataprofiler/profilers/profiler_options.py
@@ -1142,12 +1142,18 @@ def _validate_helper(self, variable_path: str = "TextProfilerOptions") -> List[str]:
class StructuredOptions(BaseOption):
"""For configuring options for structured profiler."""

def __init__(self, null_values: Dict = None) -> None:
def __init__(
self,
null_values: Dict[str, Union[re.RegexFlag, int]] = None,
column_null_values: Dict[int, Dict[str, Union[re.RegexFlag, int]]] = None,
) -> None:
"""
Construct the StructuredOptions object with default values.

:param null_values: null values we input.
:vartype null_values: Union[None, dict]
:vartype null_values: Dict[str, Union[re.RegexFlag, int]]
:param column_null_values: column level null values we input.
:vartype column_null_values: Dict[int, Dict[str, Union[re.RegexFlag, int]]]
:ivar int: option set for int profiling.
:vartype int: IntOptions
:ivar float: option set for float profiling.
@@ -1186,14 +1192,16 @@ def __init__(self, null_values: Dict = None) -> None:
self.null_replication_metrics = BooleanOption(is_enabled=False)
# Non-Option variables
self.null_values = null_values
self.column_null_values = column_null_values

@property
def enabled_profiles(self) -> List[str]:
"""Return a list of the enabled profilers for columns."""
enabled_profiles = list()
# null_values does not have is_enabled
# null_values and column_null_values do not have is_enabled
properties = self.properties
properties.pop("null_values")
properties.pop("column_null_values")
for key, value in properties.items():
if value.is_enabled:
enabled_profiles.append(key)
@@ -1230,6 +1238,7 @@ def _validate_helper(self, variable_path: str = "StructuredOptions") -> List[str]:
)
properties = self.properties
properties.pop("null_values")
properties.pop("column_null_values")
for column in properties:
if not isinstance(self.properties[column], prop_check[column]):
errors.append(
@@ -1258,6 +1267,26 @@ def _validate_helper(self, variable_path: str = "StructuredOptions") -> List[str]:
"a re.RegexFlag".format(variable_path)
)

if self.column_null_values is not None and not (
isinstance(self.column_null_values, dict)
and all(
isinstance(key, int)
and isinstance(value, dict)
and all(
isinstance(k, str) and (isinstance(v, re.RegexFlag) or v == 0)
for k, v in value.items()
)
for key, value in self.column_null_values.items()
)
):
errors.append(
"{}.column_null_values must be either None or "
"a dictionary that contains keys of type int "
"that map to dictionaries that contains keys "
"of type str and values == 0 or are instances of "
"a re.RegexFlag".format(variable_path)
)

if (
isinstance(self.category, CategoricalOptions)
and isinstance(self.chi2_homogeneity, BooleanOption)
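For reference, the new option is set the same way as the existing null_values option; a small usage sketch (the data, null strings, and column indices below are made up for illustration):

import re

import dataprofiler as dp
import pandas as pd

data = pd.DataFrame({"a": ["-1", "2", "3"], "b": ["4", "nan", "6"]})

profile_options = dp.ProfilerOptions()
profile_options.set(
    {
        "*.null_values": {"": 0, "nan": re.IGNORECASE},  # applied to every column
        "*.column_null_values": {0: {"-1": 0}},  # applied only to column index 0
        "data_labeler.is_enabled": False,
        "multiprocess.is_enabled": False,
    }
)
profiler = dp.StructuredProfiler(data, options=profile_options)
report = profiler.report()

Alternatively, StructuredOptions(column_null_values={0: {"-1": 0}}) can be passed directly, as the constructor signature above shows.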
@@ -29,7 +29,7 @@ def test_default_profiler_options(self, *mocks):
# TODO: remove the check for correlation option once it's updated to True
if column == "correlation" or column == "null_replication_metrics":
self.assertFalse(profile.options.properties[column].is_enabled)
elif column == "null_values":
elif column == "null_values" or column == "column_null_values":
self.assertIsNone(profile.options.properties[column])
else:
self.assertTrue(profile.options.properties[column].is_enabled)
@@ -9,7 +9,7 @@
class TestStructuredOptions(TestBaseOption):

option_class = StructuredOptions
other_keys = ["null_values"]
other_keys = ["null_values", "column_null_values"]
boolean_keys = [
"int",
"float",
@@ -83,22 +83,13 @@ def test_set(self):
with self.assertRaisesRegex(AttributeError, expected_error):
option.set({"{}.is_enabled".format(key): True})

expected_error = (
"{}.null_values must be either None or "
"a dictionary that contains keys of str type "
"and values == 0 or are instances of "
"a re.RegexFlag".format(optpth)
)
for test_dict in ({"a": 0}, {"a": re.IGNORECASE}, None):
option.set({"null_values": test_dict})
self.assertEqual(test_dict, option.null_values)

test_dict = {"a": 0}
option.set({"null_values": test_dict})
self.assertEqual({"a": 0}, option.null_values)
test_dict = {"a": re.IGNORECASE}
option.set({"null_values": test_dict})
self.assertEqual({"a": 2}, option.null_values)
test_dict = None
option.set({"null_values": test_dict})
self.assertEqual(None, option.null_values)
for test_dict in ({0: {"a": 0}}, {0: {"a": re.IGNORECASE}}, None):
option.set({"column_null_values": test_dict})
self.assertEqual(test_dict, option.column_null_values)

def test_validate_helper(self):
# Valid cases should return [] while invalid cases
@@ -266,9 +257,39 @@ def test_validate(self):
option.set({"null_values": None})
self.assertEqual([], option._validate_helper())

expected_error = [
"{}.column_null_values must be either None or "
"a dictionary that contains keys of type int "
"that map to dictionaries that contains keys "
"of type str and values == 0 or are instances of "
"a re.RegexFlag".format(optpth)
]
# Test column key is not an int
option.set({"column_null_values": {"a": {"a": 0}}})
self.assertEqual(expected_error, option._validate_helper())
# Test key is not a str
option.set({"column_null_values": {0: {0: 0}}})
self.assertEqual(expected_error, option._validate_helper())
# Test value is not correct type (0 or regex)
option.set({"column_null_values": {0: {"a": 1}}})
self.assertEqual(expected_error, option._validate_helper())
# Test variable is not correct variable type
option.set({"column_null_values": 1})
self.assertEqual(expected_error, option._validate_helper())
# Test 0 works for option set
option.set({"column_null_values": {0: {"a": 0}}})
self.assertEqual([], option._validate_helper())
# Test a regex flag works for option set
option.set({"column_null_values": {0: {"a": re.IGNORECASE}}})
self.assertEqual([], option._validate_helper())
# Test None works for option set
option.set({"column_null_values": None})
self.assertEqual([], option._validate_helper())

def test_enabled_profilers(self):
options = self.get_options()
self.assertNotIn("null_values", options.enabled_profiles)
self.assertNotIn("column_null_values", options.enabled_profiles)

# All Columns Enabled
for key in self.boolean_keys:
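The validation behaviour can also be checked interactively; a quick sketch mirroring the tests above (_validate_helper is an internal method, called here purely for illustration):

import re

from dataprofiler.profilers.profiler_options import StructuredOptions

opts = StructuredOptions()
opts.set({"column_null_values": {0: {"a": re.IGNORECASE}}})
assert opts._validate_helper() == []  # well-formed mapping: no errors
opts.set({"column_null_values": {"a": {"a": 0}}})
assert opts._validate_helper() != []  # non-int column key is reported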
35 changes: 35 additions & 0 deletions dataprofiler/tests/profilers/test_profile_builder.py
@@ -2081,6 +2081,41 @@ def test_null_replication_metrics_calculation(self):
np.testing.assert_array_almost_equal([[np.nan], [18]], column["class_sum"])
np.testing.assert_array_almost_equal([[np.nan], [9]], column["class_mean"])

def test_column_level_invalid_values(self):
data = pd.DataFrame([[1, 1], [9999999, 2], [3, 3]])

NO_FLAG = 0
profile_options = dp.ProfilerOptions()
profile_options.set(
{
"*.null_values": {
"": NO_FLAG,
"nan": re.IGNORECASE,
"none": re.IGNORECASE,
"null": re.IGNORECASE,
" *": NO_FLAG,
"--*": NO_FLAG,
"__*": NO_FLAG,
"9" * 7: NO_FLAG,
},
"*.column_null_values": {
0: {"1": NO_FLAG},
1: {"3": NO_FLAG},
},
"*.null_replication_metrics.is_enabled": True,
"data_labeler.is_enabled": False,
"multiprocess.is_enabled": False,
}
)

profiler = dp.StructuredProfiler(data, options=profile_options)
report = profiler.report()

np.testing.assert_array_equal(["3"], report["data_stats"][0]["samples"])
np.testing.assert_array_equal(
["1", "2"], sorted(report["data_stats"][1]["samples"])
)


class TestStructuredColProfilerClass(unittest.TestCase):
def setUp(self):
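To see why test_column_level_invalid_values expects those samples: column 0 drops "1" through its column-level rule and "9999999" through the global "9" * 7 entry, leaving "3"; column 1 drops only "3", leaving "1" and "2". A rough stand-in for the matching (the profiler's own null detection is more involved; re.fullmatch is used here only to illustrate the outcome):

import re

NO_FLAG = 0
global_nulls = {
    "": NO_FLAG, "nan": re.IGNORECASE, "none": re.IGNORECASE,
    "null": re.IGNORECASE, " *": NO_FLAG, "--*": NO_FLAG,
    "__*": NO_FLAG, "9" * 7: NO_FLAG,
}
column_nulls = {0: {"1": NO_FLAG}, 1: {"3": NO_FLAG}}

def surviving(values, col_idx):
    # Merge the global and column-level null patterns, then keep any value
    # that matches none of them.
    nulls = {**global_nulls, **column_nulls.get(col_idx, {})}
    return [
        v for v in values
        if not any(re.fullmatch(pattern, v, flag) for pattern, flag in nulls.items())
    ]

surviving(["1", "9999999", "3"], 0)  # -> ["3"]
surviving(["1", "2", "3"], 1)        # -> ["1", "2"]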