From 85f9a248f418766ef434edc3dc31b9bc2c24c42c Mon Sep 17 00:00:00 2001 From: Callow Date: Fri, 2 Aug 2024 15:22:10 +0200 Subject: [PATCH 1/2] Allow for arbitrary grid points (WIP) --- mala/datahandling/data_shuffler.py | 79 ++++++++++++++++++++++-------- 1 file changed, 59 insertions(+), 20 deletions(-) diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index 62d6e11a..f12b9c62 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -130,11 +130,33 @@ def __shuffle_numpy( ) ) + # if the number of new snapshots is not a divisor of the grid size + # then we have to trim the original snapshots to size + # the indices to be removed are selected at random + if self.data_points_to_remove is not None: + if self.parameters.shuffling_seed is not None: + np.random.seed(idx * self.parameters.shuffling_seed) + ngrid = descriptor_data[idx].shape[0] + n_desciptor = descriptor_data[idx].shape[-1] + n_target = target_data[idx].shape[-1] + + current_target = target_data[idx].reshape(-1, n_target) + current_descriptor = target_data[idx].reshape(-1, n_descriptor) + + indices = np.random.choice( + ngrid, size=ngrid - self.data_points_to_remove[idx] + ) + + descriptor_data[idx] = current_descriptor[indices] + target_data[idx] = current_target[indicies] + # Do the actual shuffling. 
- target_name_openpmd = os.path.join(target_save_path, - save_name.replace("*", "%T")) - descriptor_name_openpmd = os.path.join(descriptor_save_path, - save_name.replace("*", "%T")) + target_name_openpmd = os.path.join( + target_save_path, save_name.replace("*", "%T") + ) + descriptor_name_openpmd = os.path.join( + descriptor_save_path, save_name.replace("*", "%T") + ) for i in range(0, number_of_new_snapshots): new_descriptors = np.zeros( (int(np.prod(shuffle_dimensions)), self.input_dimension), @@ -163,16 +185,12 @@ def __shuffle_numpy( ) new_descriptors[ last_start : current_chunk + last_start - ] = descriptor_data[j].reshape( - current_grid_size, self.input_dimension - )[ + ] = descriptor_data[j].reshape(-1, self.input_dimension)[ i * current_chunk : (i + 1) * current_chunk, : ] new_targets[ last_start : current_chunk + last_start - ] = target_data[j].reshape( - current_grid_size, self.output_dimension - )[ + ] = target_data[j].reshape(-1, self.output_dimension)[ i * current_chunk : (i + 1) * current_chunk, : ] @@ -238,7 +256,6 @@ def __shuffle_numpy( # It will be executed one after another for both of them. # Use this class to parameterize which of both should be shuffled. class __DescriptorOrTarget: - def __init__( self, save_path, @@ -256,7 +273,6 @@ def __init__( self.dimension = dimension class __MockedMPIComm: - def __init__(self): self.rank = 0 self.size = 1 @@ -363,9 +379,7 @@ def from_chunk_i(i, n, dset, slice_dimension=0): import json # Do the actual shuffling. - name_prefix = os.path.join( - dot.save_path, save_name.replace("*", "%T") - ) + name_prefix = os.path.join(dot.save_path, save_name.replace("*", "%T")) for i in range(my_items_start, my_items_end): # We check above that in the non-numpy case, OpenPMD will work. 
dot.calculator.grid_dimensions = list(shuffle_dimensions) @@ -584,11 +598,37 @@ def shuffle_snapshots( del specified_number_of_new_snapshots if number_of_data_points % number_of_new_snapshots != 0: - raise Exception( - "Cannot create this number of snapshots " - "from data provided." - ) + if snapshot_type == numpy: + self.data_points_to_remove = [] + for i in range(0, self.nr_snapshots): + gridsize = self.parameters.directories_list[ + i + ].grid_size + shuffled_gridsize = int( + gridsize / number_of_new_snapshots + ) + self.data_points_to_remove.append( + gridsize + - shuffled_gridsize * number_of_new_snapshots + ) + tot_points_missing = sum(self.data_points_to_remove) + + raise Warning( + "Number of requested snapshots is not a divisor of the original grid sizes.\n" + + str(tot_points_missing) + + "/" + + str(number_of_data_points) + + " will be left out of the shuffled snapshots." + ) + + elif snapshot_type == "openpmd": + # TODO implement arbitrary grid sizes for openpmd + raise Exception( + "Cannot create this number of snapshots " + "from data provided." + ) else: + self.data_points_to_remove = None shuffle_dimensions = [ int(number_of_data_points / number_of_new_snapshots), 1, @@ -606,7 +646,6 @@ def shuffle_snapshots( permutations = [] seeds = [] for i in range(0, number_of_new_snapshots): - # This makes the shuffling deterministic, if specified by the user. 
if self.parameters.shuffling_seed is not None: np.random.seed(i * self.parameters.shuffling_seed) From 82881e2b216a74b89c930a1079707d970ced10a8 Mon Sep 17 00:00:00 2001 From: Callow Date: Wed, 14 Aug 2024 12:57:56 +0200 Subject: [PATCH 2/2] fix errors --- mala/datahandling/data_shuffler.py | 35 +++++++++++++++++++----------- 1 file changed, 22 insertions(+), 13 deletions(-) diff --git a/mala/datahandling/data_shuffler.py b/mala/datahandling/data_shuffler.py index f12b9c62..223f51b9 100644 --- a/mala/datahandling/data_shuffler.py +++ b/mala/datahandling/data_shuffler.py @@ -137,18 +137,21 @@ def __shuffle_numpy( if self.parameters.shuffling_seed is not None: np.random.seed(idx * self.parameters.shuffling_seed) ngrid = descriptor_data[idx].shape[0] - n_desciptor = descriptor_data[idx].shape[-1] + n_descriptor = descriptor_data[idx].shape[-1] n_target = target_data[idx].shape[-1] current_target = target_data[idx].reshape(-1, n_target) - current_descriptor = target_data[idx].reshape(-1, n_descriptor) + current_descriptor = descriptor_data[idx].reshape( + -1, n_descriptor + ) indices = np.random.choice( - ngrid, size=ngrid - self.data_points_to_remove[idx] + ngrid**3, + size=ngrid**3 - self.data_points_to_remove[idx], ) descriptor_data[idx] = current_descriptor[indices] - target_data[idx] = current_target[indicies] + target_data[idx] = current_target[indices] # Do the actual shuffling. target_name_openpmd = os.path.join( @@ -535,6 +538,8 @@ def shuffle_snapshots( ] number_of_data_points = np.sum(snapshot_size_list) + self.data_points_to_remove = None + if number_of_shuffled_snapshots is None: # If the user does not tell us how many snapshots to use, # we have to check if the number of snapshots is straightforward. 
@@ -598,10 +603,10 @@ def shuffle_snapshots( del specified_number_of_new_snapshots if number_of_data_points % number_of_new_snapshots != 0: - if snapshot_type == numpy: + if snapshot_type == "numpy": self.data_points_to_remove = [] for i in range(0, self.nr_snapshots): - gridsize = self.parameters.directories_list[ + gridsize = self.parameters.snapshot_directories_list[ i ].grid_size shuffled_gridsize = int( @@ -613,14 +618,19 @@ def shuffle_snapshots( ) tot_points_missing = sum(self.data_points_to_remove) - raise Warning( - "Number of requested snapshots is not a divisor of the original grid sizes.\n" - + str(tot_points_missing) - + "/" - + str(number_of_data_points) - + " will be left out of the shuffled snapshots." + printout( + "Warning: number of requested snapshots is not a divisor of", + "the original grid sizes.\n", + f"{tot_points_missing} / {number_of_data_points} data points", + "will be left out of the shuffled snapshots." ) + shuffle_dimensions = [ + int(number_of_data_points / number_of_new_snapshots), + 1, + 1, + ] + elif snapshot_type == "openpmd": # TODO implement arbitrary grid sizes for openpmd raise Exception( @@ -628,7 +638,6 @@ def shuffle_snapshots( "from data provided." ) else: - self.data_points_to_remove = None shuffle_dimensions = [ int(number_of_data_points / number_of_new_snapshots), 1,