From 7f6cd6473700d5188bfb7d51029e4e16fa0ff778 Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Thu, 31 Aug 2023 15:32:27 -0400 Subject: [PATCH 1/3] updates to `generate_regression_results_dict`, see extended description - clarified in the README that this is only intended for regression (we should add an equivalent for classification, with different default metrics, that would perhaps involve creating a new abstract function that both this function and the new one would call) - make more general by allowing users to pass in other metrics, or to include their own --- README.md | 19 +++++++++++++++---- astartes/utils/user_utils.py | 8 ++++++++ 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 41806c9..d062096 100644 --- a/README.md +++ b/README.md @@ -86,18 +86,19 @@ X_train, X_val, X_test = train_val_test_split(X, sampler = 'sphere_exclusion') ``` You can now train your model with `X_train`, optimize your model with `X_val`, and measure its performance with `X_test`. -### Evaluate the Impact of Splitting Algorithms +### Evaluate the Impact of Splitting Algorithms on Regression Models For data with many features it can be difficult to visualize how different sampling algorithms change the distribution of data into training, validation, and testing like we do in some of the demo notebooks. To aid in analyzing the impact of the algorithms, `astartes` provides `generate_regression_results_dict`. -This function allows users to quickly evaluate the impact of different splitting techniques on any model supported by `sklearn`. All results are stored in a dictionary format and can be displayed in a neatly formatted table using the optional `print_results` argument. +This function allows users to quickly evaluate the impact of different splitting techniques on any model supported by `sklearn`. +All results are stored in a dictionary format and can be displayed in a neatly formatted table using the optional `print_results` argument. 
```python from sklearn.svm import LinearSVR -from astartes.utils import generate_regression_results_dict +from astartes.utils import generate_regression_results_dict as grrd sklearn_model = LinearSVR() -results_dict = generate_regression_results_dict( +results_dict = grrd( sklearn_model, X, y, @@ -112,6 +113,16 @@ R2 0.90745 0.80787 0.78412 ``` +Additional metrics can be passed to `generate_regression_results_dict` via the `additional_metrics` argument, which should be a dictionary mapping the name of the metric (as a `string`) to the function itself, like this: + +```python +from sklearn.metrics import mean_absolute_percentage_error + +add_met = {"mape": mean_absolute_percentage_error} + +grrd(sklearn_model, X, y, additional_metrics=add_met) +``` + ### Access Sampling Algorithms Directly The sampling algorithms implemented in `astartes` can also be directly accessed and run if it is more useful for your applications. In the below example, we import the Kennard Stone sampler, use it to partition a simple array, and then retrieve a sample. diff --git a/astartes/utils/user_utils.py b/astartes/utils/user_utils.py index 41ff343..3343a0f 100644 --- a/astartes/utils/user_utils.py +++ b/astartes/utils/user_utils.py @@ -16,6 +16,7 @@ def generate_regression_results_dict( val_size=0.1, test_size=0.1, print_results=False, + additional_metrics={}, ): """ Helper function to train a sklearn model using the provided data @@ -32,6 +33,8 @@ def generate_regression_results_dict( the sampler and the values being another dictionary with the corresponding hyperparameters. Defaults to {}. 
print_results (bool, optional): whether to print the resulting dictionary as a neat table + additional_metrics (dict, optional): mapping of name (str) to metric (func) for additional metrics + such as those in sklearn.metrics or user-provided functions Returns: dict: nested dictionary with the format of @@ -148,6 +151,11 @@ def generate_regression_results_dict( final_dict[sampler] = error_dict + for metric_name, metric_function in additional_metrics.items(): + error_dict[metric_name]["train"] = metric_function(y_train, y_pred_train) + error_dict[metric_name]["val"] = metric_function(y_val, y_pred_val) + error_dict[metric_name]["test"] = metric_function(y_test, y_pred_test) + if print_results: print(f"\nDisplaying results for {sampler} sampler") display_results_as_table(error_dict) From d2c54fb8c9e76ad93256f987b9f5771b38eca0dc Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Fri, 22 Sep 2023 10:56:42 -0400 Subject: [PATCH 2/3] remove the unnecessary constructor overrides and MTSD (didn't get to it) --- astartes/samplers/__init__.py | 3 +-- astartes/samplers/extrapolation/dbscan.py | 3 --- astartes/samplers/extrapolation/kmeans.py | 3 --- astartes/samplers/extrapolation/optisim.py | 3 --- astartes/samplers/extrapolation/scaffold.py | 3 --- .../extrapolation/sphere_exclusion.py | 3 --- astartes/samplers/extrapolation/time_based.py | 3 --- astartes/samplers/interpolation/__init__.py | 1 - .../samplers/interpolation/kennardstone.py | 3 --- astartes/samplers/interpolation/mtsd.py | 5 ---- .../samplers/interpolation/random_split.py | 3 --- astartes/samplers/interpolation/spxy.py | 3 --- docs/_sources/astartes.samplers.rst.txt | 8 ------ test/unit/samplers/interpolative/test_MTSD.py | 26 ------------------- 14 files changed, 1 insertion(+), 69 deletions(-) delete mode 100644 astartes/samplers/interpolation/mtsd.py delete mode 100644 test/unit/samplers/interpolative/test_MTSD.py diff --git a/astartes/samplers/__init__.py b/astartes/samplers/__init__.py index 
7374b0b..eaff11c 100644 --- a/astartes/samplers/__init__.py +++ b/astartes/samplers/__init__.py @@ -3,12 +3,11 @@ # implementations from .extrapolation import DBSCAN, KMeans, OptiSim, Scaffold, SphereExclusion, TimeBased -from .interpolation import MTSD, SPXY, KennardStone, Random +from .interpolation import SPXY, KennardStone, Random IMPLEMENTED_INTERPOLATION_SAMPLERS = ( "random", "kennard_stone", - # "mtsd", "spxy", ) diff --git a/astartes/samplers/extrapolation/dbscan.py b/astartes/samplers/extrapolation/dbscan.py index db4f3b8..f400cfa 100644 --- a/astartes/samplers/extrapolation/dbscan.py +++ b/astartes/samplers/extrapolation/dbscan.py @@ -4,9 +4,6 @@ class DBSCAN(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """ Implements the DBSCAN sampler to identify clusters. diff --git a/astartes/samplers/extrapolation/kmeans.py b/astartes/samplers/extrapolation/kmeans.py index 468877d..8b262b8 100644 --- a/astartes/samplers/extrapolation/kmeans.py +++ b/astartes/samplers/extrapolation/kmeans.py @@ -6,9 +6,6 @@ class KMeans(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """Implements the K-Means sampler to identify clusters.""" # use the sklearn kmeans model diff --git a/astartes/samplers/extrapolation/optisim.py b/astartes/samplers/extrapolation/optisim.py index ea7ff67..c25f7af 100644 --- a/astartes/samplers/extrapolation/optisim.py +++ b/astartes/samplers/extrapolation/optisim.py @@ -59,9 +59,6 @@ class OptiSim(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """Implementes the OptiSim sampler""" self._init_random(self.get_config("random_state", 42)) diff --git a/astartes/samplers/extrapolation/scaffold.py b/astartes/samplers/extrapolation/scaffold.py index 27bf4e6..148f362 100644 --- a/astartes/samplers/extrapolation/scaffold.py +++ b/astartes/samplers/extrapolation/scaffold.py @@ -27,9 +27,6 @@ class Scaffold(AbstractSampler): 
- def __init__(self, *args): - super().__init__(*args) - def _before_sample(self): # ensure that X contains entries that are either a SMILES string or an RDKit Molecule if not all(isinstance(i, str) for i in self.X) and not all(isinstance(i, Chem.rdchem.Mol) for i in self.X): diff --git a/astartes/samplers/extrapolation/sphere_exclusion.py b/astartes/samplers/extrapolation/sphere_exclusion.py index 7d4ceee..4afed54 100644 --- a/astartes/samplers/extrapolation/sphere_exclusion.py +++ b/astartes/samplers/extrapolation/sphere_exclusion.py @@ -20,9 +20,6 @@ class SphereExclusion(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """Cluster X according to a Sphere Exclusion-like algorithm with arbitrary distance metrics.""" # euclidian, cosine, or city block from get_configs diff --git a/astartes/samplers/extrapolation/time_based.py b/astartes/samplers/extrapolation/time_based.py index 72c49dc..3e6112e 100644 --- a/astartes/samplers/extrapolation/time_based.py +++ b/astartes/samplers/extrapolation/time_based.py @@ -6,9 +6,6 @@ class TimeBased(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _before_sample(self): # verify that the user provided time as the labels (i.e. 
args[2]) if self.labels is None: diff --git a/astartes/samplers/interpolation/__init__.py b/astartes/samplers/interpolation/__init__.py index 6f6c1f2..b7d4c7d 100644 --- a/astartes/samplers/interpolation/__init__.py +++ b/astartes/samplers/interpolation/__init__.py @@ -1,4 +1,3 @@ from .kennardstone import KennardStone -from .mtsd import MTSD from .random_split import Random from .spxy import SPXY diff --git a/astartes/samplers/interpolation/kennardstone.py b/astartes/samplers/interpolation/kennardstone.py index 12ec473..e0134d0 100644 --- a/astartes/samplers/interpolation/kennardstone.py +++ b/astartes/samplers/interpolation/kennardstone.py @@ -5,9 +5,6 @@ class KennardStone(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """ Implements the Kennard-Stone algorithm diff --git a/astartes/samplers/interpolation/mtsd.py b/astartes/samplers/interpolation/mtsd.py deleted file mode 100644 index d9a8205..0000000 --- a/astartes/samplers/interpolation/mtsd.py +++ /dev/null @@ -1,5 +0,0 @@ -from astartes.samplers import AbstractSampler - - -class MTSD(AbstractSampler): - pass diff --git a/astartes/samplers/interpolation/random_split.py b/astartes/samplers/interpolation/random_split.py index 318bec0..69ce3d7 100644 --- a/astartes/samplers/interpolation/random_split.py +++ b/astartes/samplers/interpolation/random_split.py @@ -5,9 +5,6 @@ class Random(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """Passthrough to sklearn train_test_split""" idx_list = list(range(len(self.X))) diff --git a/astartes/samplers/interpolation/spxy.py b/astartes/samplers/interpolation/spxy.py index 69ce303..999ea90 100644 --- a/astartes/samplers/interpolation/spxy.py +++ b/astartes/samplers/interpolation/spxy.py @@ -20,9 +20,6 @@ class SPXY(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _before_sample(self): if self.y is None: raise InvalidConfigurationError("SPXY sampler 
requires both X and y arrays. Provide y or switch to kennard_stone.") diff --git a/docs/_sources/astartes.samplers.rst.txt b/docs/_sources/astartes.samplers.rst.txt index f3c0fc9..1d92b78 100644 --- a/docs/_sources/astartes.samplers.rst.txt +++ b/docs/_sources/astartes.samplers.rst.txt @@ -36,14 +36,6 @@ astartes.samplers.kennard\_stone module :undoc-members: :show-inheritance: -astartes.samplers.mtsd module ------------------------------ - -.. automodule:: astartes.samplers.mtsd - :members: - :undoc-members: - :show-inheritance: - astartes.samplers.optisim module -------------------------------- diff --git a/test/unit/samplers/interpolative/test_MTSD.py b/test/unit/samplers/interpolative/test_MTSD.py deleted file mode 100644 index ec3a8b4..0000000 --- a/test/unit/samplers/interpolative/test_MTSD.py +++ /dev/null @@ -1,26 +0,0 @@ -import os -import sys -import unittest - -import numpy as np - -from astartes.samplers.interpolation import mtsd - - -class Test_mtsd(unittest.TestCase): - """ - Test the various functionalities of mtsd. 
- """ - - @classmethod - def setUpClass(self): - """Convenience attributes for later tests.""" - return - - def test_mtsd(self): - """ """ - return - - -if __name__ == "__main__": - unittest.main() From b4148162060c425abb055f3263fb1037ec014188 Mon Sep 17 00:00:00 2001 From: Jackson Burns Date: Sun, 17 Sep 2023 23:04:34 -0400 Subject: [PATCH 3/3] single-source the package version --- astartes/__init__.py | 1 - pyproject.toml | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/astartes/__init__.py b/astartes/__init__.py index fac66be..7bef331 100644 --- a/astartes/__init__.py +++ b/astartes/__init__.py @@ -1,7 +1,6 @@ # convenience import to enable 'from astartes import train_test_split' from .main import train_test_split, train_val_test_split -# update this in pyproject.toml, too __version__ = "1.1.2" # DO NOT do this: diff --git a/pyproject.toml b/pyproject.toml index db7bc7b..db0e115 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,8 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "astartes" -# update this in astartes/__init__.py, too -version = "1.1.2" +dynamic = ["version"] authors = [ { name = "Jackson Burns", email = "jwburns@mit.edu" }, { name = "Himaghna Bhattacharjee", email = "himaghna@udel.edu" }, @@ -43,3 +42,6 @@ include-package-data = true where = ["."] include = ["astartes*"] exclude = ["docs*", "examples*", "test*"] + +[tool.setuptools.dynamic] +version = {attr = "astartes.__version__"}