From 1aa8eebb7942995b1a61161f82b9ddce310961b1 Mon Sep 17 00:00:00 2001
From: Kyle Gorkowski
Date: Wed, 22 May 2024 10:53:50 -0600
Subject: [PATCH 01/10] data export fixes

---
 particula/data/loader.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/particula/data/loader.py b/particula/data/loader.py
index d295e3fea..724033ca7 100644
--- a/particula/data/loader.py
+++ b/particula/data/loader.py
@@ -2,6 +2,7 @@

 from typing import Union, List
 from typing import List, Union, Tuple, Dict, Any, Optional
+from datetime import datetime, timezone
 import warnings
 import glob
 import os
@@ -643,6 +644,10 @@
         Subfolder within path to save the CSV file. The default is 'output'.
     include_time : bool, optional
         Whether to include time data in the first column. The default is True.
+    include_iso_datetime : bool, optional
+        Whether to include ISO formatted datetime in the second column.
+        The default is True. The format is ISO 8601,
+        '2021-01-01T00:00:00Z'.
     """
     # Validate path
     if not os.path.isdir(path):
@@ -664,7 +669,7 @@
             # Prepare header
             header = stream.header
             if include_time:
-                header = ['Epoch_UTC'] + header
+                header = ['DateTime[ISO8601]'] + ['Epoch_UTC'] + header
             csv_writer.writerow(header)

             # Write data rows
@@ -672,7 +677,10 @@
                 row = stream.data[i, :].tolist()
                 if include_time and len(stream.time) == len(stream.data):
                     time_val = stream.time[i]
-                    row = [time_val] + row
+                    # Convert epoch time to a readable string (ISO 8601 format)
+                    readable_time = datetime.fromtimestamp(
+                        time_val, timezone.utc).isoformat() + 'Z'
+                    row = [readable_time] + [time_val] + row
                 csv_writer.writerow(row)
         print(f"Stream saved to CSV: {file_name}")
     except (FileNotFoundError, PermissionError, IOError, OSError) as e:

From 54053b2bd5ed2d191bc6968e847a5d9d73003cb0 Mon Sep 17 00:00:00 2001
From: Kyle Gorkowski
Date: Wed, 22 May 2024 12:18:04 -0600
Subject: [PATCH 02/10] added nan entry first

---
 particula/data/stream.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/particula/data/stream.py b/particula/data/stream.py
index dc6594c0e..4020213bd 100644
--- a/particula/data/stream.py
+++ b/particula/data/stream.py
@@ -78,7 +78,10 @@ def __setitem__(self, index: Union[int, str], value):
         if isinstance(index, str):
             if index not in self.header:
                 self.header.append(index)  # add new header element
-                self.data = np.hstack((self.data, value))
+                if value.ndim == 1:
+                    zeros_array = np.zeros_like(value) * np.nan
+                    zeros_array = zeros_array[:, np.newaxis]  # add dimension
+                self.data = np.hstack((self.data, zeros_array))
             index = self.header.index(index)
         # if index is an int, set the data at that index
         self.data[:, index] = value
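[Note on PATCH 02] A minimal sketch of the column-append pattern the change above introduces, using plain NumPy; the Stream internals beyond the `header` list and the 2-D `data` block are assumptions here:

    import numpy as np

    data = np.ones((3, 2))             # existing block: 3 time rows x 2 columns
    value = np.array([4.0, 5.0, 6.0])  # 1-D values for a new header key

    # Stage a NaN placeholder column first, then fill it,
    # as in Stream.__setitem__ above:
    nan_column = np.zeros_like(value) * np.nan  # needs a float dtype
    nan_column = nan_column[:, np.newaxis]      # shape (3, 1)
    data = np.hstack((data, nan_column))        # widen the block
    data[:, -1] = value                         # assign the new column

np.full_like(value, np.nan) would express the same NaN placeholder more directly than multiplying zeros by np.nan.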
From 95436869f581d16b647516699665a6666307b953 Mon Sep 17 00:00:00 2001
From: Kyle Gorkowski
Date: Wed, 22 May 2024 16:21:11 -0600
Subject: [PATCH 03/10] updated stats

---
 particula/data/stream_stats.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/particula/data/stream_stats.py b/particula/data/stream_stats.py
index 0f4cfb700..8a85e8af0 100644
--- a/particula/data/stream_stats.py
+++ b/particula/data/stream_stats.py
@@ -74,8 +74,8 @@
         step=average_interval
     )
     # generate empty arrays for averaged data and std to be filled in
-    average = np.zeros([len(new_time_array), len(stream.header)])
-    std = np.zeros_like(average)
+    average = np.zeros([len(new_time_array), len(stream.header)]) * np.nan
+    std = np.zeros_like(average) * np.nan
     # average data
     average, std = stats.average_to_interval(

From bbe5cf8934cfb5c18ba878f09282d78a59e01ef2 Mon Sep 17 00:00:00 2001
From: Kyle Gorkowski
Date: Thu, 23 May 2024 11:00:56 -0600
Subject: [PATCH 04/10] added filter

---
 particula/data/loader.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/particula/data/loader.py b/particula/data/loader.py
index 724033ca7..2c6b78e25 100644
--- a/particula/data/loader.py
+++ b/particula/data/loader.py
@@ -146,6 +146,9 @@ def data_format_checks(data: List[str], data_checks: dict) -> List[str]:
                 < data_checks['characters'][1]
             )
         ]
+    if 'filter' in data_checks:
+        filter_chars = data_checks['filter']
+        data = [x for x in data if not any(char in x for char in filter_chars)]
     if len(data) / length_initial < FILTER_WARNING_FRACTION:
         warnings.warn(
             f"More than {FILTER_WARNING_FRACTION} rows are filtered based on "

From 3cdc82b8e551a3bb556c61495600a07f9205d0e8 Mon Sep 17 00:00:00 2001
From: Kyle Gorkowski
Date: Thu, 23 May 2024 11:16:22 -0600
Subject: [PATCH 05/10] removed loader filter, as it can use char counts filter

---
 particula/data/loader.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/particula/data/loader.py b/particula/data/loader.py
index 2c6b78e25..724033ca7 100644
--- a/particula/data/loader.py
+++ b/particula/data/loader.py
@@ -146,9 +146,6 @@ def data_format_checks(data: List[str], data_checks: dict) -> List[str]:
                 < data_checks['characters'][1]
             )
         ]
-    if 'filter' in data_checks:
-        filter_chars = data_checks['filter']
-        data = [x for x in data if not any(char in x for char in filter_chars)]
     if len(data) / length_initial < FILTER_WARNING_FRACTION:
         warnings.warn(
             f"More than {FILTER_WARNING_FRACTION} rows are filtered based on "

From ce5be2565c77dc7ab97d724d8a002004b5284b0f Mon Sep 17 00:00:00 2001
From: Kyle Gorkowski
Date: Thu, 23 May 2024 14:24:03 -0600
Subject: [PATCH 06/10] update size dist to list

---
 particula/data/process/size_distribution.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/particula/data/process/size_distribution.py b/particula/data/process/size_distribution.py
index b073b74ab..1e0f0294a 100644
--- a/particula/data/process/size_distribution.py
+++ b/particula/data/process/size_distribution.py
@@ -496,6 +496,6 @@ def resample_distribution(

     # assemble the stream
     stream.data = new_concentration
-    stream.header = new_diameters.astype(str)
+    stream.header = new_diameters.astype(str).tolist()

     return stream
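[Note on PATCH 04/05] The standalone 'filter' key was added and then removed because, per the commit message, the existing char-counts check covers the same rows. A hypothetical data_checks dict sketching that use (keys taken from data_format_checks and filter_list; the file name is made up):

    data_checks = {
        "skip_rows": 1,           # drop a leading header line
        "skip_end": 0,
        "characters": [10, 200],  # keep rows with 10 < len(row) < 200
        "char_counts": {
            ",": 5,               # keep rows with exactly five commas
            "�": 0,               # keep only rows free of replacement chars
        },
    }
    rows = data_format_checks(data_raw_loader("example.csv"), data_checks)

filter_list keeps a row when row.count(char) equals the requested count, so a count of 0 acts as a reject-if-present filter, replacing the removed 'filter' key.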
From c664fbee1be382e25655c551d11b7ed8204ab269 Mon Sep 17 00:00:00 2001
From: Kyle Gorkowski
Date: Wed, 5 Jun 2024 10:02:59 -0600
Subject: [PATCH 07/10] save export

---
 particula/data/loader.py | 313 ++++++++++++++++++++++-----------------
 1 file changed, 180 insertions(+), 133 deletions(-)

diff --git a/particula/data/loader.py b/particula/data/loader.py
index 724033ca7..dbd3971bd 100644
--- a/particula/data/loader.py
+++ b/particula/data/loader.py
@@ -37,7 +37,7 @@ def data_raw_loader(file_path: str) -> list:
     ['line 1', 'line 2', 'line 3']
     """
     try:
-        with open(file_path, 'r', encoding='utf8', errors='replace') as file:
+        with open(file_path, "r", encoding="utf8", errors="replace") as file:
             data = [line.rstrip() for line in file]
     except FileNotFoundError:
         print(f"File not found: {file_path}")
@@ -84,11 +84,13 @@ def filter_list(data: List[str], char_counts: dict) -> List[str]:
     for char, count in char_counts.items():
         if count > -1:
             filtered_data = [
-                row for row in filtered_data if row.count(char) == count]
+                row for row in filtered_data if row.count(char) == count
+            ]
             if len(filtered_data) / len(data) < FILTER_WARNING_FRACTION:
                 warnings.warn(
-                    f"More than {FILTER_WARNING_FRACTION} of the rows have " +
-                    f"been filtered out based on the character: {char}.")
+                    f"More than {FILTER_WARNING_FRACTION} of the rows have "
+                    + f"been filtered out based on the character: {char}."
+                )
     return filtered_data
@@ -121,29 +123,24 @@ def data_format_checks(data: List[str], data_checks: dict) -> List[str]:
     if not isinstance(data, list):
         raise TypeError("data must be a list")
     length_initial = len(data)
-    if data_checks.get('skip_rows', 0) > 0:
-        data = data[data_checks['skip_rows']:]
-    if data_checks.get('skip_end', 0) > 0:
-        data = data[:-data_checks['skip_end']]
-    if len(data_checks.get('characters', [])) == 1:
+    if data_checks.get("skip_rows", 0) > 0:
+        data = data[data_checks["skip_rows"] :]
+    if data_checks.get("skip_end", 0) > 0:
+        data = data[: -data_checks["skip_end"]]
+    if len(data_checks.get("characters", [])) == 1:
         # Filter out any rows with fewer than the specified number of
         # characters.
-        data = [
-            x for x in data
-            if (
-                len(x)
-                > data_checks['characters'][0]
-            )
-        ]
-    elif len(data_checks.get('characters', [])) == 2:
+        data = [x for x in data if (len(x) > data_checks["characters"][0])]
+    elif len(data_checks.get("characters", [])) == 2:
         # Filter out any rows with fewer than the minimum or more than the
         # maximum number of characters.
         data = [
-            x for x in data
+            x
+            for x in data
             if (
-                data_checks['characters'][0]
+                data_checks["characters"][0]
                 < len(x)
-                < data_checks['characters'][1]
+                < data_checks["characters"][1]
             )
         ]
     if len(data) / length_initial < FILTER_WARNING_FRACTION:
@@ -151,13 +148,13 @@
             f"More than {FILTER_WARNING_FRACTION} rows are filtered based on "
            + f"{data_checks['characters']} or skip rows."
         )
-    if 'char_counts' in data_checks:
-        char_counts = data_checks.get('char_counts', {})
+    if "char_counts" in data_checks:
+        char_counts = data_checks.get("char_counts", {})
         data = filter_list(data, char_counts)
     if data := [x.strip() for x in data]:
         return data
     else:
-        raise ValueError('No data left in file')
+        raise ValueError("No data left in file")


 def parse_time_column(
@@ -166,7 +163,7 @@
     line: np.ndarray,
     date_offset: Optional[str] = None,
     seconds_shift: int = 0,
-    timezone_identifier: str = 'UTC'
+    timezone_identifier: str = "UTC",
 ) -> float:
     """
     Parses the time column of a data line and returns it as a timestamp.
@@ -195,7 +192,7 @@
     ValueError
         If an invalid time column or format is specified.
     """
-    if time_format == 'epoch':
+    if time_format == "epoch":
         return float(line[time_column]) + seconds_shift
     if date_offset:
         # if the time is in one column, and the date is fixed
@@ -219,7 +216,8 @@
             + seconds_shift
         )
     raise ValueError(
-        f"Invalid time column or format: {time_column}, {time_format}")
+        f"Invalid time column or format: {time_column}, {time_format}"
+    )


 def sample_data(
@@ -230,7 +228,7 @@
     delimiter: str,
     date_offset: Optional[str] = None,
     seconds_shift: int = 0,
-    timezone_identifier: str = 'UTC'
+    timezone_identifier: str = "UTC",
 ) -> Tuple[np.ndarray, np.ndarray]:
     """
     Samples the data to get the time and data streams.
@@ -288,48 +286,94 @@
             line=line_array,
             date_offset=date_offset,
             seconds_shift=seconds_shift,
-            timezone_identifier=timezone_identifier
+            timezone_identifier=timezone_identifier,
         )
         for j, col in enumerate(data_columns):
-            value = line_array[col].strip() if col < len(line_array) else ''
-            if value in ['', '.']:  # no data
+            value = line_array[col].strip() if col < len(line_array) else ""
+            if value in ["", "."]:  # no data
                 data_array[i, j] = np.nan
-            elif value.count('�') > 0:
+            elif value.count("�") > 0:
                 data_array[i, j] = np.nan
             elif value[0].isnumeric():  # if the first character is a number
                 data_array[i, j] = float(value)
             elif value[-1].isnumeric():
                 data_array[i, j] = float(value)
-            elif value[0] == '-':
+            elif value[0] == "-":
                 data_array[i, j] = float(value)
-            elif value[0] == '+':
+            elif value[0] == "+":
                 data_array[i, j] = float(value)
-            elif value[0] == '.':
+            elif value[0] == ".":
                 try:
                     data_array[i, j] = float(value)
                 except ValueError as exc:
                     print(line_array)
                     raise ValueError(
-                        f'Data is not a float: row {i}, col {j}, value {value}'
+                        f"Data is not a float: row {i}, col {j}, value {value}"
                     ) from exc
             elif value.isalpha():
                 true_match = [
-                    'ON', 'on', 'On', 'oN', '1', 'True', 'true',
-                    'TRUE', 'tRUE', 't', 'T', 'Yes', 'yes', 'YES',
-                    'yES', 'y', 'Y'
+                    "ON",
+                    "on",
+                    "On",
+                    "oN",
+                    "1",
+                    "True",
+                    "true",
+                    "TRUE",
+                    "tRUE",
+                    "t",
+                    "T",
+                    "Yes",
+                    "yes",
+                    "YES",
+                    "yES",
+                    "y",
+                    "Y",
                 ]
                 false_match = [
-                    'OFF', 'off', 'Off', 'oFF', '0',
-                    'False', 'false', 'FALSE', 'fALSE', 'f',
-                    'F', 'No', 'no', 'NO', 'nO', 'n', 'N'
+                    "OFF",
+                    "off",
+                    "Off",
+                    "oFF",
+                    "0",
+                    "False",
+                    "false",
+                    "FALSE",
+                    "fALSE",
+                    "f",
+                    "F",
+                    "No",
+                    "no",
+                    "NO",
+                    "nO",
+                    "n",
+                    "N",
                 ]
                 nan_match = [
-                    'NaN', 'nan', 'Nan', 'nAN', 'NAN', 'NaN',
-                    'nAn', 'naN', 'NA', 'Na', 'nA', 'na',
-                    'N', 'n', '', 'aN', 'null', 'NULL', 'Null',
-                    '-99999', '-9999', '.'
+                    "NaN",
+                    "nan",
+                    "Nan",
+                    "nAN",
+                    "NAN",
+                    "NaN",
+                    "nAn",
+                    "naN",
+                    "NA",
+                    "Na",
+                    "nA",
+                    "na",
+                    "N",
+                    "n",
+                    "",
+                    "aN",
+                    "null",
+                    "NULL",
+                    "Null",
+                    "-99999",
+                    "-9999",
+                    ".",
                 ]
                 if value in true_match:
                     data_array[i, j] = 1
@@ -339,8 +383,8 @@
                     data_array[i, j] = np.nan
                 else:
                     raise ValueError(
-                        f'No match for data value: row {i}, \
-                            col {j}, value {value}'
+                        f"No match for data value: row {i}, \
+                            col {j}, value {value}"
                     )
     return epoch_time, data_array


 def general_data_formatter(
@@ -352,11 +396,11 @@
     data_column: list,
     time_column: Union[int, List[int]],
     time_format: str,
-    delimiter: str = ',',
+    delimiter: str = ",",
     header_row: int = 0,
     date_offset: Optional[str] = None,
     seconds_shift: int = 0,
-    timezone_identifier: str = 'UTC'
+    timezone_identifier: str = "UTC",
 ) -> Tuple[np.ndarray, np.ndarray]:
     """
     Formats and samples the data to get the time and data streams.
@@ -391,8 +435,7 @@
     if isinstance(data_column[0], str):
         data_header = data[header_row].split(delimiter)
         # Get data column indices
-        data_column = [data_header.index(x)
-                       for x in data_column]
+        data_column = [data_header.index(x) for x in data_column]

     # Check the data format
     data = data_format_checks(data, data_checks)
@@ -406,7 +449,7 @@
         delimiter,
         date_offset,
         seconds_shift,
-        timezone_identifier
+        timezone_identifier,
     )

     return epoch_time, data_array
@@ -438,7 +481,8 @@ def keyword_to_index(keyword: Union[str, int], header: List[str]) -> int:
     if isinstance(keyword, int):
         if keyword < 0 or keyword >= len(header):
             raise ValueError(
-                f"Index {keyword} is out of range for the header.")
+                f"Index {keyword} is out of range for the header."
+            )
         return keyword
     elif keyword in header:
         return header.index(keyword)
@@ -452,11 +496,11 @@ def sizer_data_formatter(
     data_sizer_reader: Dict[str, str],
     time_column: Union[int, List[int]],
     time_format: str,
-    delimiter: str = ',',
+    delimiter: str = ",",
     header_row: int = 0,
     date_offset: Optional[str] = None,
     seconds_shift: int = 0,
-    timezone_identifier: str = 'UTC'
+    timezone_identifier: str = "UTC",
 ) -> Tuple[np.ndarray, np.ndarray, list]:
     """
     Formats data from a particle sizer.
@@ -491,18 +535,21 @@
     data_header = data[header_row].split(delimiter)
     # Convert start and end keywords to indices
     dp_start_index = keyword_to_index(
-        data_sizer_reader["Dp_start_keyword"], data_header)
+        data_sizer_reader["Dp_start_keyword"], data_header
+    )
     dp_end_index = keyword_to_index(
-        data_sizer_reader["Dp_end_keyword"], data_header)
+        data_sizer_reader["Dp_end_keyword"], data_header
+    )

     # Ensure dp_start_index and dp_end_index are within valid range
     if dp_start_index > dp_end_index:
         raise ValueError(
-            "Dp_start_keyword must come before Dp_end_keyword in the header")
+            "Dp_start_keyword must come before Dp_end_keyword in the header"
+        )

     # Generate the range of column indices to include
     dp_columns = list(
         range(dp_start_index, dp_end_index + 1)
-    ) # +1 to include the end index
+    )  # +1 to include the end index

     # Extract headers for the specified range
     header = [data_header[i] for i in dp_columns]
@@ -518,7 +565,7 @@
         delimiter,
         date_offset,
         seconds_shift=seconds_shift,
-        timezone_identifier=timezone_identifier
+        timezone_identifier=timezone_identifier,
     )

     if "convert_scale_from" in data_sizer_reader:
@@ -527,16 +574,13 @@
             data_2d[i, :] = convert.convert_sizer_dn(
                 diameter=np.array(header).astype(float),
                 dn_dlogdp=data_2d[i, :],
-                inverse=True
+                inverse=True,
             )

     return epoch_time, data_2d, header


-def non_standard_date_location(
-    data: list,
-    date_location: dict
-) -> str:
+def non_standard_date_location(data: list, date_location: dict) -> str:
     """
     Extracts the date from a non-standard location in the data.
@@ -562,12 +606,12 @@
     ValueError
         If an unsupported or invalid method is specified in date_location.
     """
-    if date_location['method'] != 'file_header_block':
-        raise ValueError('Invalid date location method specified')
+    if date_location["method"] != "file_header_block":
+        raise ValueError("Invalid date location method specified")

-    row_index = date_location['row']
-    delimiter = date_location['delimiter']
-    index = date_location['index']
+    row_index = date_location["row"]
+    delimiter = date_location["delimiter"]
+    index = date_location["index"]
     return data[row_index].split(delimiter)[index].strip()
@@ -610,13 +654,13 @@

     # filter the files by size
     full_path = [
-        file for file in file_list
+        file
+        for file in file_list
         if os.path.getsize(os.path.join(search_path, file)) > min_size
     ]

     # get the file names only
-    file_list = [os.path.split(path)[-1]
-                 for path in full_path]
+    file_list = [os.path.split(path)[-1] for path in full_path]
     file_size_in_bytes = [os.path.getsize(path) for path in full_path]

     return file_list, full_path, file_size_in_bytes
@@ -626,12 +670,12 @@ def save_stream_to_csv(
     stream: Stream,
     path: str,
     suffix_name: Optional[str] = None,
-    folder: Optional[str] = 'output',
+    folder: Optional[str] = "output",
     include_time: bool = True,
 ) -> None:
     """
     Save stream object as a CSV file, with an option to include formatted time.
-
+
     Args:
     ----------
     stream : Stream
@@ -657,29 +701,34 @@
     os.makedirs(output_folder, exist_ok=True)

     # Add suffix to file name if present
-    file_name = f'data{suffix_name}.csv' \
-        if suffix_name is not None else 'data.csv'
+    file_name = (
+        f"data{suffix_name}.csv" if suffix_name is not None else "data.csv"
+    )
     file_path = os.path.join(output_folder, file_name)

     try:
         # Save stream data to CSV
-        with open(file_path, mode='w', newline='') as csv_file:
+        with open(file_path, mode="w", newline="") as csv_file:
             csv_writer = csv.writer(csv_file)
-
+
             # Prepare header
             header = stream.header
             if include_time:
-                header = ['DateTime[ISO8601]'] + ['Epoch_UTC'] + header
+                header = ["DateTime[ISO8601]"] + ["Epoch_UTC"] + header
             csv_writer.writerow(header)
-
+
             # Write data rows
             for i in range(len(stream.data)):
                 row = stream.data[i, :].tolist()
                 if include_time and len(stream.time) == len(stream.data):
                     time_val = stream.time[i]
                     # Convert epoch time to a readable string (ISO 8601 format)
-                    readable_time = datetime.fromtimestamp(
-                        time_val, timezone.utc).isoformat() + 'Z'
+                    readable_time = (
+                        datetime.fromtimestamp(
+                            time_val, timezone.utc
+                        ).isoformat()
+                        + "Z"
+                    )
                     row = [readable_time] + [time_val] + row
                 csv_writer.writerow(row)
         print(f"Stream saved to CSV: {file_name}")
@@ -697,11 +746,11 @@ def save_stream(
     path: str,
     stream: Stream,
     sufix_name: Optional[str] = None,
-    folder: Optional[str] = 'output'
+    folder: Optional[str] = "output",
 ) -> None:
     """
     Save stream object as a pickle file.
-
+
     Args
     ----------
     stream : Stream
@@ -720,14 +769,15 @@
     os.makedirs(output_folder, exist_ok=True)

     # add suffix to file name if present
-    file_name = f'stream{sufix_name}.pk' \
-        if sufix_name is not None else 'stream.pk'
+    file_name = (
+        f"stream{sufix_name}.pk" if sufix_name is not None else "stream.pk"
+    )

     # path to save pickle file
     file_path = os.path.join(output_folder, file_name)
     try:
         # Attempt to save the stream
-        with open(file_path, 'wb') as file:
+        with open(file_path, "wb") as file:
             pickle.dump(stream, file)
         print(f"Stream saved: {file_name}")
     except IOError as e:
@@ -744,11 +794,11 @@ def load_stream(
     path: str,
     sufix_name: Optional[str] = None,
-    folder: Optional[str] = 'output'
+    folder: Optional[str] = "output",
 ) -> Stream:
     """
     Load stream object from a pickle file.
-
+
     Args
     ----------
     path : str
@@ -757,7 +807,7 @@
         Suffix to add to pickle file name. The default is None.
     folder : str, optional
         Folder to load pickle file from. The default is 'output'.
-
+
     Returns
     -------
     Stream
@@ -767,13 +817,14 @@
     if not os.path.isdir(path):
         raise ValueError(f"Provided path '{path}' is not a directory.")
     # add suffix to file name if present
-    file_name = f'stream{sufix_name}.pk' \
-        if sufix_name is not None else 'stream.pk'
+    file_name = (
+        f"stream{sufix_name}.pk" if sufix_name is not None else "stream.pk"
+    )

     # path to load pickle file
     file_path = os.path.join(path, folder, file_name)

     # load stream
-    with open(file_path, 'rb') as file:
+    with open(file_path, "rb") as file:
         stream = pickle.load(file)

     return stream
@@ -783,7 +834,7 @@ def save_lake(
     path: str,
     lake: Lake,
     sufix_name: Optional[str] = None,
-    folder: Optional[str] = 'output'
+    folder: Optional[str] = "output",
 ) -> None:
     """
     Save lake object as a pickle file.
@@ -797,7 +848,7 @@
     sufix_name : str, optional
         Suffix to add to pickle file name. The default is None.
     """
-    print('Saving lake...')
+    print("Saving lake...")
     # Validate path
     if not os.path.isdir(path):
         raise ValueError(f"Provided path '{path}' is not a directory.")
@@ -806,14 +857,13 @@
     os.makedirs(output_folder, exist_ok=True)

     # add suffix to file name if present
-    file_name = f'lake{sufix_name}.pk' \
-        if sufix_name is not None else 'lake.pk'
+    file_name = f"lake{sufix_name}.pk" if sufix_name is not None else "lake.pk"

     # path to save pickle file
     file_path = os.path.join(output_folder, file_name)
     try:
         # Attempt to save the datalake
-        with open(file_path, 'wb') as file:
+        with open(file_path, "wb") as file:
             pickle.dump(lake, file)
         print(f"Lake saved: {file_name}")
     except IOError as e:
@@ -829,7 +879,8 @@ def load_lake(
     path: str,
-    sufix_name: Optional[str] = None
+    sufix_name: Optional[str] = None,
+    folder: str = "output",
 ) -> Lake:
     """
     Load datalake object from a pickle file.
@@ -843,23 +894,23 @@
     -------
     data_lake : DataLake
         Loaded DataLake object.
+
+    Notes:
+        spell correction sufix -> suffix
     """
-    file_name = f'lake{sufix_name}.pk' \
-        if sufix_name is not None else 'lake.pk'
+    file_name = f"lake{sufix_name}.pk" if sufix_name is not None else "lake.pk"

     # path to load pickle file
-    file_path = os.path.join(path, 'output', file_name)
+    load_folder = os.path.join(path, folder)
+    file_path = os.path.join(load_folder, file_name)

     # load datalake
-    with open(file_path, 'rb') as file:
+    with open(file_path, "rb") as file:
         lake = pickle.load(file)

     return lake


-def netcdf_get_epoch_time(
-    file_path: str,
-    settings: dict
-) -> np.ndarray:
+def netcdf_get_epoch_time(file_path: str, settings: dict) -> np.ndarray:
     """
     Given a netCDF file path and settings, returns an array of epoch times
     in seconds as a float.
@@ -877,9 +928,9 @@
     """
     nc_file = nc.Dataset(file_path)  # type: ignore

-    epoch_time = np.zeros(nc_file.dimensions['time'].size)
+    epoch_time = np.zeros(nc_file.dimensions["time"].size)

-    for time_col in settings['time_column']:
+    for time_col in settings["time_column"]:
         epoch_time += nc_file.variables.get(time_col)[:]
     epoch_time = np.array(epoch_time.astype(float))
     nc_file.close()
@@ -888,8 +939,7 @@

 def netcdf_data_1d_load(
-    file_path: str,
-    settings: dict
+    file_path: str, settings: dict
 ) -> Tuple[np.ndarray, list, np.ndarray]:
     """
     Given a netCDF file path and settings, returns a tuple containing the
@@ -911,11 +961,11 @@
         KeyError: If the settings dictionary does not contain 'data_1d'.
     """
     # check if data_1d is in the settings dic
-    if 'data_1d' not in settings['netcdf_reader']:
+    if "data_1d" not in settings["netcdf_reader"]:
         raise KeyError("data_1d not in settings['netcdf_reader']")

     # get header
-    header_1d = settings['netcdf_reader']['header_1d']
+    header_1d = settings["netcdf_reader"]["header_1d"]

     nc_file = nc.Dataset(file_path)  # type: ignore
     # get epoch time
@@ -923,11 +973,13 @@

     # empty array to store data
     data_1d = np.zeros(
-        (len(settings['netcdf_reader']['data_1d']),
-         nc_file.dimensions['time'].size)
+        (
+            len(settings["netcdf_reader"]["data_1d"]),
+            nc_file.dimensions["time"].size,
+        )
     )
     # select and fill masked array with nan
-    for i, data_col in enumerate(settings['netcdf_reader']['data_1d']):
+    for i, data_col in enumerate(settings["netcdf_reader"]["data_1d"]):
         try:
             data = nc_file.variables.get(data_col)[:]
             data_1d[i, :] = np.ma.filled(data.astype(float), np.nan)
@@ -938,17 +990,15 @@

     # check data shape, transpose if necessary so that time is last dimension
     data_1d = convert.data_shape_check(
-        time=epoch_time,
-        data=data_1d,
-        header=header_1d)
+        time=epoch_time, data=data_1d, header=header_1d
+    )

     return epoch_time, header_1d, data_1d


 # pylint: disable-all
 def netcdf_data_2d_load(
-    file_path: str,
-    settings: dict
+    file_path: str, settings: dict
 ) -> Tuple[np.ndarray, list, np.ndarray]:
     """
     Given a netCDF file path and settings, returns a tuple containing the
@@ -970,7 +1020,7 @@
         KeyError: If the settings dictionary does not contain 'data_2d'.
     """
     # check if data_1d is in the settings dic
-    if 'data_2d' not in settings['netcdf_reader']:
+    if "data_2d" not in settings["netcdf_reader"]:
         raise KeyError("data_2d not in settings['netcdf_reader']")

     # get epoch time
@@ -979,13 +1029,13 @@

     nc_file = nc.Dataset(file_path)  # type: ignore
     # select data_2d
-    data_2d = nc_file.variables.get(settings['netcdf_reader']['data_2d'])[:]
+    data_2d = nc_file.variables.get(settings["netcdf_reader"]["data_2d"])[:]
     # convert masked array to numpy array
     data_2d = np.ma.filled(data_2d.astype(float), np.nan)
     # get header
-    header_2d = nc_file.variables.get(
-        settings['netcdf_reader']['header_2d']
-    )[:]
+    header_2d = nc_file.variables.get(settings["netcdf_reader"]["header_2d"])[
+        :
+    ]
     nc_file.close()

     # convert header to list of strings
@@ -993,9 +1043,8 @@

     # check data shape, transpose if necessary so that time is last dimension
     data_2d = convert.data_shape_check(
-        time=epoch_time,
-        data=data_2d,
-        header=header_2d)
+        time=epoch_time, data=data_2d, header=header_2d
+    )

     return epoch_time, header_2d, data_2d
@@ -1022,9 +1071,7 @@ def netcdf_info_print(file_path, file_return=False):
         print(dim, len(nc_file.dimensions[dim]))
     print("\nVariables:")
     for var in nc_file.variables:
-        print(var,
-              nc_file.variables[var].shape,
-              nc_file.variables[var].dtype)
+        print(var, nc_file.variables[var].shape, nc_file.variables[var].dtype)
     print("\nHeaders:")
     for attr in nc_file.ncattrs():
         print(attr, "=", getattr(nc_file, attr))
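[Note on the ISO 8601 column, PATCH 01/07] An aware datetime's isoformat() already ends in '+00:00', so the appended 'Z' above produces a doubled UTC designator ('...+00:00Z') rather than the '2021-01-01T00:00:00Z' form the docstring promises. A small sketch of the difference, with one possible alternative (not what the patch currently does):

    from datetime import datetime, timezone

    time_val = 1609459200.0  # 2021-01-01T00:00:00 UTC in epoch seconds

    datetime.fromtimestamp(time_val, timezone.utc).isoformat() + "Z"
    # -> '2021-01-01T00:00:00+00:00Z'

    datetime.fromtimestamp(time_val, timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
    # -> '2021-01-01T00:00:00Z'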
""" # check if data_1d is in the settings dic - if 'data_2d' not in settings['netcdf_reader']: + if "data_2d" not in settings["netcdf_reader"]: raise KeyError("data_2d not in settings['netcdf_reader']") # get epoch time @@ -979,13 +1029,13 @@ def netcdf_data_2d_load( nc_file = nc.Dataset(file_path) # type: ignore # select data_2d - data_2d = nc_file.variables.get(settings['netcdf_reader']['data_2d'])[:] + data_2d = nc_file.variables.get(settings["netcdf_reader"]["data_2d"])[:] # convert masked array to numpy array data_2d = np.ma.filled(data_2d.astype(float), np.nan) # get header - header_2d = nc_file.variables.get( - settings['netcdf_reader']['header_2d'] - )[:] + header_2d = nc_file.variables.get(settings["netcdf_reader"]["header_2d"])[ + : + ] nc_file.close() # convert header to list of strings @@ -993,9 +1043,8 @@ def netcdf_data_2d_load( # check data shape, transpose if necessary so that time is last dimension data_2d = convert.data_shape_check( - time=epoch_time, - data=data_2d, - header=header_2d) + time=epoch_time, data=data_2d, header=header_2d + ) return epoch_time, header_2d, data_2d @@ -1022,9 +1071,7 @@ def netcdf_info_print(file_path, file_return=False): print(dim, len(nc_file.dimensions[dim])) print("\nVariables:") for var in nc_file.variables: - print(var, - nc_file.variables[var].shape, - nc_file.variables[var].dtype) + print(var, nc_file.variables[var].shape, nc_file.variables[var].dtype) print("\nHeaders:") for attr in nc_file.ncattrs(): print(attr, "=", getattr(nc_file, attr)) From 15cfabc0bea5edac28627da9522097879909f5ae Mon Sep 17 00:00:00 2001 From: Kyle Gorkowski Date: Wed, 5 Jun 2024 10:55:36 -0600 Subject: [PATCH 08/10] update load more flexibility --- particula/data/loader.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/particula/data/loader.py b/particula/data/loader.py index dbd3971bd..ed39df9b0 100644 --- a/particula/data/loader.py +++ b/particula/data/loader.py @@ -880,7 +880,7 @@ def save_lake( def load_lake( path: str, sufix_name: Optional[str] = None, - folder: str = "output", + folder: Optional[str] = None, ) -> Lake: """ Load datalake object from a pickle file. @@ -900,7 +900,10 @@ def load_lake( """ file_name = f"lake{sufix_name}.pk" if sufix_name is not None else "lake.pk" # path to load pickle file - load_folder = os.path.join(path, folder) + if folder is not None: + load_folder = os.path.join(path, folder) + else: + load_folder = path file_path = os.path.join(load_folder, file_name) # load datalake From a381a08ad3712dad9e719c1c4016d7c303e546ec Mon Sep 17 00:00:00 2001 From: Kyle Gorkowski Date: Wed, 5 Jun 2024 12:05:18 -0600 Subject: [PATCH 09/10] updated header --- particula/data/loader.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/particula/data/loader.py b/particula/data/loader.py index ed39df9b0..989d91f0e 100644 --- a/particula/data/loader.py +++ b/particula/data/loader.py @@ -1,4 +1,4 @@ -"""File readers and loaders for datacula.""" +"""File readers and loaders.""" from typing import Union, List from typing import List, Union, Tuple, Dict, Any, Optional @@ -20,7 +20,8 @@ def data_raw_loader(file_path: str) -> list: - """ + """Loads raw data from file. + Load raw data from a file at the specified file path and return it as a list of strings. 
From 6475eac74134573eefdb0af03021b728ae59de66 Mon Sep 17 00:00:00 2001
From: Kyle Gorkowski
Date: Wed, 5 Jun 2024 12:21:55 -0600
Subject: [PATCH 10/10] Update particula/data/loader.py

Co-authored-by: sourcery-ai[bot] <58596630+sourcery-ai[bot]@users.noreply.github.com>
---
 particula/data/loader.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/particula/data/loader.py b/particula/data/loader.py
index 989d91f0e..2b1a4299e 100644
--- a/particula/data/loader.py
+++ b/particula/data/loader.py
@@ -901,10 +901,7 @@ def load_lake(
     """
     file_name = f"lake{sufix_name}.pk" if sufix_name is not None else "lake.pk"
     # path to load pickle file
-    if folder is not None:
-        load_folder = os.path.join(path, folder)
-    else:
-        load_folder = path
+    load_folder = os.path.join(path, folder) if folder is not None else path
     file_path = os.path.join(load_folder, file_name)

     # load datalake
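[Usage sketch] A hypothetical save/load round trip against the final save_lake and load_lake signatures; `lake` and `data_dir` are assumed to already exist:

    import os

    save_lake(path=data_dir, lake=lake, sufix_name="_v1")
    # writes <data_dir>/output/lake_v1.pk

    lake_a = load_lake(path=data_dir, sufix_name="_v1", folder="output")
    lake_b = load_lake(path=os.path.join(data_dir, "output"), sufix_name="_v1")
    # with folder=None (the default), path itself is used as the load folder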