From 73abac3156636f26ff308db639f39618fb00ecbb Mon Sep 17 00:00:00 2001
From: Jeremy Goodsitt
Date: Mon, 6 Jun 2022 20:36:30 -0500
Subject: [PATCH] fix: overflow

---
 dataprofiler/profilers/utils.py | 46 +++++++++++++++++----------------
 1 file changed, 24 insertions(+), 22 deletions(-)

diff --git a/dataprofiler/profilers/utils.py b/dataprofiler/profilers/utils.py
index 27dd3f86c..3601ccfb5 100644
--- a/dataprofiler/profilers/utils.py
+++ b/dataprofiler/profilers/utils.py
@@ -18,24 +18,24 @@ def dict_merge(dct, merge_dct):
     # Recursive dictionary merge
     # Copyright (C) 2016 Paul Durivage
-    # 
+    #
     # This program is free software: you can redistribute it and/or modify
     # it under the terms of the GNU General Public License as published by
     # the Free Software Foundation, either version 3 of the License, or
     # (at your option) any later version.
-    # 
+    #
     # This program is distributed in the hope that it will be useful,
     # but WITHOUT ANY WARRANTY; without even the implied warranty of
     # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
     # GNU General Public License for more details.
-    # 
+    #
     # You should have received a copy of the GNU General Public License
     # along with this program. If not, see .
     """
     Recursive dict merge. Inspired by :meth:``dict.update()``, instead of
     updating only top-level keys, dict_merge recurses down into dicts nested
     to an arbitrary depth, updating keys. The ``merge_dct`` is merged into
     ``dct``.
-    
+
     :param dct: dict onto which the merge is executed
     :param merge_dct: dct merged into dct
     :return: None
@@ -77,14 +77,14 @@ def _combine_unique_sets(a, b):
     combined_list = set().union(a, b)
     return list(combined_list)
 
-    
+
 def shuffle_in_chunks(data_length, chunk_size):
     """
     A generator for creating shuffled indexes in chunks. This reduces the cost
     of having to create all indexes, but only of that what is needed.
     Initial Code idea from:
     https://www.drmaciver.com/2018/01/lazy-fisher-yates-shuffling-for-precise-rejection-sampling/
-    
+
     :param data_length: length of data to be shuffled
     :param chunk_size: size of shuffled chunks
     :return: list of shuffled indices of chunk size
@@ -105,14 +105,14 @@ def shuffle_in_chunks(data_length, chunk_size):
 
         indices = KeyDict()
         j = 0
-        
+
         # loop through all chunks
        for chunk_ind in range(max(math.ceil(data_length / chunk_size), 1)):
 
             # determine the chunk size and preallocate an array
             true_chunk_size = min(chunk_size, data_length - chunk_size * chunk_ind)
             values = [-1] * true_chunk_size
-            
+
             # Generate random list of indexes
             lower_bound_list = np.array(range(j, j + true_chunk_size))
             random_list = rng.integers(lower_bound_list, data_length)
@@ -129,7 +129,7 @@ def shuffle_in_chunks(data_length, chunk_size):
 
                 # increment so as not to include values already swapped
                 j += 1
-            
+
             yield values
 
 
@@ -147,7 +147,7 @@ def warn_on_profile(col_profile, e):
     warning_msg += "Profiling Type: {}".format(col_profile)
     warning_msg += "\nException: {}".format(type(e).__name__)
     warning_msg += "\nMessage: {}".format(e)
-    # This is considered a major error 
+    # This is considered a major error
     if type(e).__name__ == "ValueError": raise ValueError(e)
     warning_msg += "\n\nFor labeler errors, try installing "
     warning_msg += "the extra ml requirements via:\n\n"
@@ -159,7 +159,7 @@ def partition(data, chunk_size):
     """
     Creates a generator which returns the data
     in the specified chunk size.
-    
+
     :param data: list, dataframe, etc
     :type data: list, dataframe, etc
     :param chunk_size: size of partition to return
@@ -177,10 +177,10 @@ def suggest_pool_size(data_size=None, cols=None):
     :type data_size: int
     :param cols: columns of the dataset
     :type cols: int
-    :return suggested_pool_size: suggeseted pool size 
+    :return suggested_pool_size: suggeseted pool size
     :rtype suggested_pool_size: int
     """
-    
+
     # Return if there's no data_size
     if data_size is None:
         return None
@@ -203,10 +203,10 @@ def suggest_pool_size(data_size=None, cols=None):
         suggested_pool_size = min(max_pool_mem, max_pool_cpu)
     if cols is not None:
         suggested_pool_size = min(suggested_pool_size, cols)
-    
+
     return suggested_pool_size
 
-    
+
 def generate_pool(max_pool_size=None, data_size=None, cols=None):
     """
     Generate a multiprocessing pool to allocate functions too
@@ -224,12 +224,12 @@ def generate_pool(max_pool_size=None, data_size=None, cols=None):
     """
 
     suggested_pool_size = suggest_pool_size(data_size, cols)
-    if max_pool_size is None or suggested_pool_size is None: 
+    if max_pool_size is None or suggested_pool_size is None:
         max_pool_size = suggested_pool_size
-        
+
     # Always leave 1 cores free
     pool = None
-    if max_pool_size is not None and max_pool_size > 2: 
+    if max_pool_size is not None and max_pool_size > 2:
         try:
             pool = mp.Pool(max_pool_size)
         except Exception as e:
@@ -238,7 +238,7 @@ def generate_pool(max_pool_size=None, data_size=None, cols=None):
                 'Multiprocessing disabled, please change the multiprocessing'+
                 ' start method, via: multiprocessing.set_start_method()'+
                 ' Possible methods include: fork, spawn, forkserver, None'
-            )        
+            )
 
     return pool, max_pool_size
 
@@ -318,7 +318,8 @@ def biased_skew(df_series):
     if (M2 == 0):
         return 0.0
 
-    skew = np.sqrt(n) * M3 / M2 ** 1.5
+    with np.errstate(all='ignore'):
+        skew = np.sqrt(n) * M3 / np.power(M2, 1.5)
 
     return skew
 
@@ -354,7 +355,8 @@ def biased_kurt(df_series):
     if (M2 == 0):
         return -3.0
 
-    kurt = n * M4 / M2 ** 2 - 3
+    with np.errstate(all='ignore'):
+        kurt = n * M4 / np.power(M2, 2) - 3
 
     return kurt
 
@@ -399,7 +401,7 @@ def find_diff_of_strings_and_bools(stat1, stat2):
 def find_diff_of_lists_and_sets(stat1, stat2):
     """
     Finds the difference between two stats. If there is no difference, returns
-    "unchanged". Removes duplicates and returns [unique values of stat1, 
+    "unchanged". Removes duplicates and returns [unique values of stat1,
     shared values, unique values of stat2].
 
     :param stat1: the first statistical input
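Note (editorial sketch, not part of the patch): the substantive change wraps the skew and kurtosis expressions in np.errstate(all='ignore') and uses np.power, so that intermediate overflow no longer emits a RuntimeWarning. The moment values n, M2, M3 below are hypothetical stand-ins for the intermediates computed in biased_skew(); they are chosen only so that M2 ** 1.5 exceeds the float64 range.

    import warnings

    import numpy as np

    # Hypothetical moment values, large enough that M2 ** 1.5 overflows float64.
    n = 3
    M2 = np.float64(1e300)
    M3 = np.float64(1e300)

    # Pre-patch expression: the overflow surfaces as a RuntimeWarning.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        skew_noisy = np.sqrt(n) * M3 / M2 ** 1.5
    print(skew_noisy, [str(w.message) for w in caught])

    # Patched expression: same arithmetic, but np.errstate suppresses the
    # floating-point warning for this block only.
    with np.errstate(all="ignore"):
        skew_quiet = np.sqrt(n) * M3 / np.power(M2, 1.5)
    print(skew_quiet)  # identical numeric result, no warning emitted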