diff --git a/jlab_datascience_toolkit/data_parser/parser_to_dataframe.py b/jlab_datascience_toolkit/data_parser/parser_to_dataframe.py
index c39d1c8..7441060 100644
--- a/jlab_datascience_toolkit/data_parser/parser_to_dataframe.py
+++ b/jlab_datascience_toolkit/data_parser/parser_to_dataframe.py
@@ -1,4 +1,5 @@
 from jlab_datascience_toolkit.core.jdst_data_parser import JDSTDataParser
+from jlab_datascience_toolkit.utils.io import save_yaml_config, load_yaml_config
 from pathlib import Path
 import pandas as pd
 import logging
@@ -119,11 +120,7 @@ def load(self, path: str):
             path (str): Path to folder containing module files.
         """
         base_path = Path(path)
-        with open(base_path.joinpath('config.yaml'), 'r') as f:
-            loaded_config = yaml.safe_load(f)
-
-        self.config.update(loaded_config)
-        self.setup()
+        self.load_config(base_path)
 
     def save(self, path: str):
         """Save the entire module state to a folder at `path`
@@ -133,8 +130,7 @@ def save(self, path: str):
         """
         save_dir = Path(path)
         os.makedirs(save_dir)
-        with open(save_dir.joinpath('config.yaml'), 'w') as f:
-            yaml.safe_dump(self.config, f)
+        self.save_config(save_dir)
 
     def load_data(self) -> pd.DataFrame:
         """ Loads all files listed in `config['filepaths']`
@@ -166,13 +162,19 @@ def load_data(self) -> pd.DataFrame:
 
         return output
 
-    def load_config(self, path: str):
-        parser_log.debug('Calling load()...')
-        return self.load(path)
+    def load_config(self, path: Path | str):
+        self.config.update(load_yaml_config(path))
+        self.setup()
+
+    def save_config(self, path: Path | str, overwrite=False):
+        """ Saves this module's configuration to the file specified by `path`.
+        If `path` is a directory, the configuration is saved as config.yaml.
 
-    def save_config(self, path: str):
-        parser_log.debug('Calling save()...')
-        return self.save(path)
+        Args:
+            path (Path | str): Location for the saved configuration. Either a
+                filename or a directory is acceptable.
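The refactored config round-trip now behaves as sketched below. This is a hedged illustration: `parser` stands in for an already-constructed parser module instance, whose class and registry id are not shown in this diff.

```python
from pathlib import Path

# `parser` is an assumed, already-constructed parser module instance.
parser.save_config('./configs')               # existing directory -> ./configs/config.yaml
parser.save_config(Path('./configs/parser.yaml'),
                   overwrite=True)            # an explicit filename also works
parser.load_config('./configs/parser.yaml')   # merges into self.config, re-runs setup()
```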
+ """ + save_yaml_config(self.config, path, overwrite) def save_data(self): return super().save_data() diff --git a/jlab_datascience_toolkit/data_prep/__init__.py b/jlab_datascience_toolkit/data_prep/__init__.py index 0f8b07c..f641563 100644 --- a/jlab_datascience_toolkit/data_prep/__init__.py +++ b/jlab_datascience_toolkit/data_prep/__init__.py @@ -7,3 +7,7 @@ from jlab_datascience_toolkit.data_prep.numpy_minmax_scaler import NumpyMinMaxScaler +register( + id = "PandasStandardScaler_v0", + entry_point="jlab_datascience_toolkit.data_prep.pandas_standard_scaler:PandasStandardScaler" +) \ No newline at end of file diff --git a/jlab_datascience_toolkit/data_prep/pandas_standard_scaler.py b/jlab_datascience_toolkit/data_prep/pandas_standard_scaler.py new file mode 100644 index 0000000..aaab014 --- /dev/null +++ b/jlab_datascience_toolkit/data_prep/pandas_standard_scaler.py @@ -0,0 +1,226 @@ +from jlab_datascience_toolkit.core.jdst_data_prep import JDSTDataPrep +from jlab_datascience_toolkit.utils.io import save_yaml_config, load_yaml_config +from pathlib import Path +import pandas as pd +import numpy as np +import logging +import inspect +import yaml +import os + +prep_log = logging.getLogger("Prep Logger") + + +def _fix_small_scales(scale, epsilon): + """Updates scale parameters below epsilon to 1 to prevent issues with small divisors + + Args: + scale (array_like): Scale parameters to (potentially) fix + epsilon (float): Smallest allowable value for scale parameters + + Returns: + array_like: Updated scale parameters + """ + return np.where(scale < epsilon, 1, scale) + + +class PandasStandardScaler(JDSTDataPrep): + """Module performs standard scaling on Pandas DataFrames. + + Intialization arguments: + config: dict + + Optional configuration keys: + axis: int = 0 + Axis to perform scaling on. Accepts 0,1 or None. Defaults to 0. + epsilon: float = 1e-7 + Smallest allowable value for the standard deviation. Defaults to 1e-7. + If smaller than epsilon, the output variance will not be modified. + This avoids exploding small noise variance values. + inplace: bool = False + If True, operations modify the original DataFrame. Defaults to False. + + Attributes + ---------- + name : str + Name of the module + config: dict + Configuration information + + Methods + ------- + get_info() + Prints this docstring + load(path) + Loads this module (including fit scaler parameters) from `path` + save(path) + Saves this module (including fit scaler parameters) to `path` + load_config(path) + Loads a configuration file. Scaler parameters will be fit to new data. + save_config(path) + Calls `save(path)` + run(data) + Performs standard scaling on `data`. If the scaler has not been previously + fit, the scaler parameters will be fit to `data`. Otherwise, the scaling + will utilize mean and variance information from the most recent `fit()` call. + fit(data) + Sets scaler parameters for mean and variance based on `data` + reverse(data) + Performs inverse scaling on `data`. + save_data(path) + Does nothing. 
+ + """ + + def __init__(self, config: dict = None, registry_config: dict = None): + # Set default config + self.config = dict(axis=0, epsilon=1e-7, inplace=False) + + if registry_config is not None: + self.config.update(registry_config) + if config is not None: + self.config.update(config) + + self.setup() + + @property + def name(self): + return "PandasStandardScaler_v0" + + def setup(self): + self.mean = None + self.var = None + self.scale = None + self.n_samples = 0 + + def get_info(self): + """Prints this module's docstring.""" + print(inspect.getdoc(self)) + + def save(self, path: str): + """Save entire module to a folder at `path` + + Args: + path (str): Location to save the module. This path must not currently exist. + """ + os.makedirs(path) + self.save_config(path) + self.save_internal_state(path) + + def load(self, path: str): + """Load entire saved module from `path` + + Args: + path (str): Directory to load module from. Should include a config.yaml + and scaler_state.npz files. + """ + self.load_config(path) + self.load_internal_state(path) + + def save_config(self, path: str, overwrite: bool = False): + """Save the module configuration to a folder at `path` + + Args: + path (str): Location to save the module config yaml file + overwrite (bool, optional): If True, overwrites file at path if it exists. + Defaults to False. + """ + save_dir = Path(path) + save_yaml_config(self.config, save_dir, overwrite) + + def load_config(self, path: str): + """Load the entire module state from `path` + + Args: + path (str): Path to folder containing module files. + """ + base_path = Path(path) + self.config.update(load_yaml_config(base_path)) + self.setup() + + def save_internal_state(self, path: str): + internal_state = dict( + mean=self.mean, var=self.var, scale=self.scale, n_samples=self.n_samples + ) + save_dir = Path(path) + if not save_dir.exists(): + os.makedirs(save_dir) + np.savez(save_dir.joinpath("scaler_state.npz"), **internal_state) + + def load_internal_state(self, path: str): + save_dir = Path(path) + internal_state = np.load(save_dir.joinpath("scaler_state.npz")) + self.mean = internal_state["mean"] + self.var = internal_state["var"] + self.scale = internal_state["scale"] + self.n_samples = internal_state["n_samples"] + + def run(self, data: pd.DataFrame): + if self.mean is None: + prep_log.debug("Fitting new data on run()") + self.fit(data) + + return self.transform(data) + + def reverse(self, data: pd.DataFrame): + """Performs inverse scaling on `data` + + Args: + data (pd.DataFrame): Data to perform inverse scaling on. 
diff --git a/jlab_datascience_toolkit/utils/io.py b/jlab_datascience_toolkit/utils/io.py
new file mode 100644
index 0000000..6a824d2
--- /dev/null
+++ b/jlab_datascience_toolkit/utils/io.py
@@ -0,0 +1,50 @@
+import yaml
+from pathlib import Path
+import logging
+
+io_log = logging.getLogger('io_log')
+
+def save_yaml_config(config: dict, path: str | Path, overwrite: bool = False):
+    """ Saves a configuration dictionary to a yaml file
+
+    Args:
+        config (dict): Dictionary to save
+        path (str | Path): Location to save the configuration.
+            If the parent directory of `path` does not exist, it will be created.
+            If `path` is an existing directory, the configuration will be saved to config.yaml
+            If `path` is a filename, the configuration will be saved to that filename
+        overwrite (bool, optional): If True, the passed configuration will overwrite any
+            existing file at the same `path`. Defaults to False.
+
+    Raises:
+        FileExistsError: If `path` exists and `overwrite == False`.
+    """
+    path = Path(path)
+
+    if path.is_dir():
+        io_log.info(f'{path} is a directory; saving configuration as config.yaml inside it')
+        path = path.joinpath('config.yaml')
+
+    path.parent.mkdir(parents=True, exist_ok=True)
+
+    if path.exists() and not overwrite:
+        io_log.error(f'File {path} exists without overwrite flag set')
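The helpers behave as follows (a short sketch; the ./run_configs path is illustrative):

```python
from pathlib import Path
from jlab_datascience_toolkit.utils.io import save_yaml_config, load_yaml_config

config = {'name': 'example', 'scale': 1}
Path('./run_configs').mkdir(exist_ok=True)  # only existing dirs are treated as directories
save_yaml_config(config, './run_configs')   # -> ./run_configs/config.yaml
save_yaml_config(config, './run_configs', overwrite=True)  # re-saving requires overwrite
assert load_yaml_config('./run_configs') == config
```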
+        raise FileExistsError('File already exists. Set overwrite=True if you would like to overwrite it.')
+
+    with open(path, 'w') as f:
+        io_log.info(f'Writing config to {path}')
+        yaml.safe_dump(config, f)
+
+def load_yaml_config(path: str | Path):
+    """ Loads a configuration dictionary from a yaml file.
+
+    If `path` is a directory, the configuration is read from config.yaml inside it.
+
+    Raises:
+        FileNotFoundError: If no configuration file exists at `path`.
+    """
+    path = Path(path)
+    if path.is_dir():
+        path = path.joinpath('config.yaml')
+
+    if not path.exists():
+        io_log.error(f'Configuration file {path} not found.')
+        raise FileNotFoundError(f'Configuration file {path} not found.')
+
+    with open(path, 'r') as f:
+        return yaml.safe_load(f)
diff --git a/utests/utest_io_utils.py b/utests/utest_io_utils.py
new file mode 100644
index 0000000..17acfb6
--- /dev/null
+++ b/utests/utest_io_utils.py
@@ -0,0 +1,85 @@
+from jlab_datascience_toolkit.utils.io import save_yaml_config, load_yaml_config
+from pathlib import Path
+import unittest
+import logging
+import random
+import string
+import shutil
+import sys
+import os
+
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+
+def generate_random_string(length):
+    alphanumeric = string.ascii_letters + string.digits
+    return ''.join(random.choice(alphanumeric) for _ in range(length))
+
+class TestIOUtils(unittest.TestCase):
+
+    # Initialize:
+    # *****************************************
+    def __init__(self, *args, **kwargs):
+        super(TestIOUtils, self).__init__(*args, **kwargs)
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        print('Setting up all tests...')
+        cls.config = {'name': 'test', 'scale': 1, 'list_example': [0.1, 1.2, 2.3]}
+        cls.test_path = Path('./temp_dir_' + generate_random_string(6))
+        cls.existing_file = cls.test_path.joinpath('existing_file.yaml')
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        print('\nHave a good day!')
+
+    def setUp(self) -> None:
+        print('\n----------------------------------------------------------------------')
+        os.makedirs(self.test_path)
+        with open(self.existing_file, 'w'):
+            pass
+        return super().setUp()
+
+    def tearDown(self) -> None:
+        print('----------------------------------------------------------------------')
+        shutil.rmtree(self.test_path)
+        return super().tearDown()
+
+    def test_save_load_with_dir(self):
+        save_yaml_config(self.config, self.test_path)
+        self.assertTrue(self.test_path.joinpath('config.yaml').exists())
+        config = load_yaml_config(self.test_path)
+        for k in self.config:
+            self.assertEqual(self.config[k], config[k])
+
+    def test_save_existing_no_overwrite(self):
+        with self.assertRaises(FileExistsError):
+            save_yaml_config(self.config, self.existing_file)
+
+    def test_load_not_existing(self):
+        with self.assertRaises(FileNotFoundError):
+            load_yaml_config(self.test_path.joinpath('no_file_exists_here.yaml'))
+
+    def test_save_load_filename(self):
+        new_filename = self.test_path.joinpath('new_config.yaml')
+        save_yaml_config(self.config, new_filename)
+        load_yaml_config(new_filename)
+
+    def test_overwrite_filename(self):
+        # Save to the same path three times: the first should succeed, the second
+        # should fail without overwrite, and the third should succeed with overwrite=True.
+        new_filename = self.test_path.joinpath('new_config.yaml')
+        save_yaml_config(self.config, new_filename)
+        with self.assertRaises(FileExistsError):
+            save_yaml_config(self.config, new_filename)
+        config = self.config.copy()
+        config['name'] = 'train'
+        save_yaml_config(config, new_filename, overwrite=True)
+        loaded_config = load_yaml_config(new_filename)
+        for k in config:
+            self.assertEqual(config[k], loaded_config[k])
+
+# Run this file via: python utest_io_utils.py
+if __name__ == "__main__":
+    unittest.main()
diff --git a/utests/utest_pandas_standard_scaler.py b/utests/utest_pandas_standard_scaler.py
new file mode 100644
index 0000000..562d580
--- /dev/null
+++ b/utests/utest_pandas_standard_scaler.py
@@ -0,0 +1,123 @@
+from jlab_datascience_toolkit.data_prep import make
+import unittest
+import logging
+import pandas as pd
+import numpy as np
+import shutil
+import sys
+
+logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
+rng = np.random.default_rng(seed=42)
+prep_id = 'PandasStandardScaler_v0'
+
+
+class TestPandasStandardScalerv0(unittest.TestCase):
+
+    # Initialize:
+    # *****************************************
+    def __init__(self, *args, **kwargs):
+        super(TestPandasStandardScalerv0, self).__init__(*args, **kwargs)
+
+    @classmethod
+    def setUpClass(cls) -> None:
+        print('Setting up all tests...')
+
+    @classmethod
+    def tearDownClass(cls) -> None:
+        print('\nHave a good day!')
+
+    def setUp(self) -> None:
+        x1 = pd.Series(rng.normal(loc=1, scale=2, size=(100,)), name='X1')
+        x2 = pd.Series(rng.normal(loc=3, scale=4, size=(100,)), name='X2')
+        x3 = pd.Series(rng.uniform(low=3, high=10, size=(100,)), name='X3')
+        x4 = pd.Series(rng.uniform(low=-4, high=1, size=(100,)), name='X4')
+        self.data = pd.concat([x1, x2, x3, x4], axis=1)
+        print('\n----------------------------------------------------------------------')
+        return super().setUp()
+
+    def tearDown(self) -> None:
+        print('----------------------------------------------------------------------')
+        return super().tearDown()
+
+    def test_output_types(self):
+        prep = make(prep_id, config={'inplace': True})
+        output = prep.run(self.data)
+        self.assertIsNone(output)
+
+        prep = make(prep_id, config={'inplace': False})
+        output = prep.run(self.data)
+        self.assertEqual(type(output), pd.DataFrame, msg='Output not DataFrame when inplace==False')
+
+    def test_axis_zero(self):
+        prep = make(prep_id, config={'axis': 0})
+        scaled_data = prep.run(self.data)
+        mean = scaled_data.mean(axis=0)
+        var = scaled_data.var(axis=0, ddof=0)
+        self.assertTrue(np.allclose(mean, np.zeros_like(mean)), msg='Column mean not equal to zero')
+        self.assertTrue(np.allclose(var, np.ones_like(var)), msg='Column variance not equal to one')
+
+    def test_inplace_run(self):
+        prep = make(prep_id, config={'inplace': True})
+        prep.run(self.data)
+        mean = self.data.mean(axis=0)
+        var = self.data.var(axis=0, ddof=0)
+        self.assertTrue(np.allclose(mean, np.zeros_like(mean)), msg='Column mean not equal to zero')
+        self.assertTrue(np.allclose(var, np.ones_like(var)), msg='Column variance not equal to one')
+
+    def test_zero_variance(self):
+        self.data['X5'] = pd.Series(4*np.ones(shape=(100,)), name='X5')
+        prep = make(prep_id)
+        scaled_data = prep.run(self.data)
+        mean = scaled_data.mean(axis=0)
+        var = scaled_data.var(axis=0, ddof=0)
+        self.assertTrue(np.allclose(mean, np.zeros_like(mean)), msg='Column mean not equal to zero')
+
+        # The constant column is left unscaled, so its variance stays zero
+        theory_var = np.ones_like(var)
+        theory_var[-1] = 0
+        self.assertTrue(np.allclose(var, theory_var), msg='Scaled variance is incorrect')
+
+    def test_multi_run(self):
+        # Should set mean and scale based only on the first dataset passed to run()
+        prep = make(prep_id)
+        scaled_data = prep.run(self.data)
+        saved_mean = prep.mean
+        saved_scale = prep.scale
+
+        scaled_data2 = prep.run(self.data + 5)
+        self.assertTrue((saved_mean == prep.mean).all(), msg='Mean has changed after second run()')
+        self.assertTrue((saved_scale == prep.scale).all(), msg='Scale has changed after second run()')
+
+        # Mean of data+5 after scaling should be 5 / scale
+        self.assertTrue(np.allclose(scaled_data2.mean(), 5/prep.scale), msg='Mean of second run() is incorrect.')
+
+    def test_save_load(self):
+        prep = make(prep_id)
+        scaled_data = prep.run(self.data)
+        save_path = './test_saved_prep'
+        try:
+            prep.save(save_path)
+            new_prep = make(prep_id)
+            new_prep.load(save_path)
+            new_scaled_data = new_prep.run(self.data)
+            self.assertTrue(np.allclose(new_scaled_data, scaled_data), msg='Scaled data after load() does not match')
+        finally:
+            shutil.rmtree(save_path)
+
+    def test_reverse_scaling(self):
+        prep = make(prep_id)
+        scaled_data = prep.run(self.data)
+        unscaled_data = prep.reverse(scaled_data)
+
+        self.assertTrue(np.allclose(self.data, unscaled_data))
+
+# Run this file via: python utest_pandas_standard_scaler.py
+if __name__ == "__main__":
+    unittest.main()
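The persistence round-trip exercised by test_save_load above can be summarized in a short sketch (paths and data are illustrative):

```python
import numpy as np
import pandas as pd
from jlab_datascience_toolkit.data_prep import make

df = pd.DataFrame({'X1': np.arange(10.0), 'X2': np.linspace(0.0, 1.0, 10)})
scaler = make('PandasStandardScaler_v0')
scaled = scaler.run(df)

scaler.save('./saved_scaler')        # writes config.yaml and scaler_state.npz
restored = make('PandasStandardScaler_v0')
restored.load('./saved_scaler')      # restores mean/var/scale without refitting
assert np.allclose(restored.run(df), scaled)
```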