-
Notifications
You must be signed in to change notification settings - Fork 162
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add profiler option for column level invalid values #704
Changes from 6 commits
8314ae4
d0a0894
8fc5386
8d50f14
b319755
8947ea5
991b5ce
de9db62
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,6 +53,7 @@ def __init__( | |
min_true_samples: int = 0, | ||
sample_ids: np.ndarray = None, | ||
pool: Pool = None, | ||
column_index: int = None, | ||
options: StructuredOptions = None, | ||
) -> None: | ||
""" | ||
|
@@ -69,6 +70,8 @@ def __init__( | |
:type sample_ids: list(list) | ||
:param pool: pool utilized for multiprocessing | ||
:type pool: multiprocessing.Pool | ||
:param column_index: index of the given column | ||
:type column_index: int | ||
:param options: Options for the structured profiler. | ||
:type options: StructuredOptions Object | ||
""" | ||
|
@@ -100,10 +103,13 @@ def __init__( | |
} | ||
if options: | ||
if options.null_values is not None: | ||
self._null_values = options.null_values | ||
self._null_values = options.null_values.copy() | ||
if column_index is not None and options.column_null_values is not None: | ||
self._null_values.update( | ||
options.column_null_values.get(column_index, {}) | ||
) | ||
|
||
if df_series is not None and len(df_series) > 0: | ||
|
||
if not sample_size: | ||
sample_size = self._get_sample_size(df_series) | ||
if sample_size < len(df_series): | ||
|
@@ -497,7 +503,7 @@ def clean_data_and_get_base_stats( | |
:param null_values: Dictionary mapping null values to regex flag where | ||
the key represents the null value to remove from the data and the | ||
flag represents the regex flag to apply | ||
:type null_values: dict[str, re.FLAG] | ||
:type null_values: Dict[str, Union[re.RegexFlag, int]] | ||
:param min_true_samples: Minimum number of samples required for the | ||
profiler | ||
:type min_true_samples: int | ||
|
@@ -2393,7 +2399,10 @@ def _merge_null_replication_metrics(self, other: StructuredProfiler) -> Dict: | |
return merged_properties | ||
|
||
def _update_profile_from_chunk( | ||
self, data: pd.DataFrame, sample_size: int, min_true_samples: int = None | ||
self, | ||
data: Union[List, pd.Series, pd.DataFrame], | ||
sample_size: int, | ||
min_true_samples: int = None, | ||
) -> None: | ||
""" | ||
Iterate over the columns of a dataset and identify its parameters. | ||
|
@@ -2472,6 +2481,7 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]: | |
sample_size=sample_size, | ||
min_true_samples=min_true_samples, # type: ignore | ||
sample_ids=sample_ids, # type: ignore | ||
column_index=col_idx, | ||
options=self.options, | ||
) | ||
) | ||
|
@@ -2512,6 +2522,11 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]: | |
min_true_samples = self._profile[prof_idx]._min_true_samples | ||
try: | ||
null_values = self._profile[prof_idx]._null_values | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This is a huge add and mostly LGTM. One big issue here, though: because a dict is mutable, this will change self._null_values with the update. If we instead copy it to a variable first, that would alleviate the issue. Great job though! There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just need to fix it in the locations where we update. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. added |
||
if self.options.column_null_values: | ||
null_values.update( | ||
self.options.column_null_values.get(col_idx, {}) | ||
) | ||
|
||
multi_process_dict[col_idx] = pool.apply_async( | ||
self._profile[prof_idx].clean_data_and_get_base_stats, | ||
( | ||
|
@@ -2551,7 +2566,13 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]: | |
prof_idx = col_idx_to_prof_idx[col_idx] | ||
if min_true_samples is None: | ||
min_true_samples = self._profile[prof_idx]._min_true_samples | ||
|
||
null_values = self._profile[prof_idx]._null_values | ||
if self.options.column_null_values: | ||
null_values.update( | ||
self.options.column_null_values.get(col_idx, {}) | ||
) | ||
|
||
clean_sampled_dict[prof_idx], base_stats = self._profile[ | ||
prof_idx | ||
].clean_data_and_get_base_stats( | ||
|
@@ -2569,7 +2590,11 @@ def tqdm(level: Set[int]) -> Generator[int, None, None]: | |
prof_idx = col_idx_to_prof_idx[col_idx] | ||
if min_true_samples is None: | ||
min_true_samples = self._profile[prof_idx]._min_true_samples | ||
|
||
null_values = self._profile[prof_idx]._null_values | ||
if self.options.column_null_values: | ||
null_values.update(self.options.column_null_values.get(col_idx, {})) | ||
|
||
clean_sampled_dict[prof_idx], base_stats = self._profile[ | ||
prof_idx | ||
].clean_data_and_get_base_stats( | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
added copy