diff --git a/README.md b/README.md index 31f341a2..3e21068d 100644 --- a/README.md +++ b/README.md @@ -130,7 +130,7 @@ X_train, X_val, X_test = train_val_test_split(X, sampler = 'sphere_exclusion') ``` You can now train your model with `X_train`, optimize your model with `X_val`, and measure its performance with `X_test`. -### Evaluate the Impact of Splitting Algorithms +### Evaluate the Impact of Splitting Algorithms on Regression Models For data with many features it can be difficult to visualize how different sampling algorithms change the distribution of data into training, validation, and testing like we do in some of the demo notebooks. To aid in analyzing the impact of the algorithms, `astartes` provides `generate_regression_results_dict`. This function allows users to quickly evaluate the impact of different splitting techniques on any `sklearn`-compatible model's performance. @@ -139,10 +139,10 @@ All results are stored in a nested dictionary (`{sampler:{metric:{split:score}}} ```python from sklearn.svm import LinearSVR -from astartes.utils import generate_regression_results_dict +from astartes.utils import generate_regression_results_dict as grrd sklearn_model = LinearSVR() -results_dict = generate_regression_results_dict( +results_dict = grrd( sklearn_model, X, y, diff --git a/astartes/__init__.py b/astartes/__init__.py index fac66be6..7bef331a 100644 --- a/astartes/__init__.py +++ b/astartes/__init__.py @@ -1,7 +1,6 @@ # convenience import to enable 'from astartes import train_test_split' from .main import train_test_split, train_val_test_split -# update this in pyproject.toml, too __version__ = "1.1.2" # DO NOT do this: diff --git a/astartes/samplers/__init__.py b/astartes/samplers/__init__.py index 7374b0bb..eaff11cc 100644 --- a/astartes/samplers/__init__.py +++ b/astartes/samplers/__init__.py @@ -3,12 +3,11 @@ # implementations from .extrapolation import DBSCAN, KMeans, OptiSim, Scaffold, SphereExclusion, TimeBased -from .interpolation import MTSD, SPXY, KennardStone, Random +from .interpolation import SPXY, KennardStone, Random IMPLEMENTED_INTERPOLATION_SAMPLERS = ( "random", "kennard_stone", - # "mtsd", "spxy", ) diff --git a/astartes/samplers/extrapolation/dbscan.py b/astartes/samplers/extrapolation/dbscan.py index db4f3b80..f400cfa3 100644 --- a/astartes/samplers/extrapolation/dbscan.py +++ b/astartes/samplers/extrapolation/dbscan.py @@ -4,9 +4,6 @@ class DBSCAN(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """ Implements the DBSCAN sampler to identify clusters. diff --git a/astartes/samplers/extrapolation/kmeans.py b/astartes/samplers/extrapolation/kmeans.py index 468877d5..8b262b8c 100644 --- a/astartes/samplers/extrapolation/kmeans.py +++ b/astartes/samplers/extrapolation/kmeans.py @@ -6,9 +6,6 @@ class KMeans(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """Implements the K-Means sampler to identify clusters.""" # use the sklearn kmeans model diff --git a/astartes/samplers/extrapolation/optisim.py b/astartes/samplers/extrapolation/optisim.py index ea7ff676..c25f7af0 100644 --- a/astartes/samplers/extrapolation/optisim.py +++ b/astartes/samplers/extrapolation/optisim.py @@ -59,9 +59,6 @@ class OptiSim(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """Implementes the OptiSim sampler""" self._init_random(self.get_config("random_state", 42)) diff --git a/astartes/samplers/extrapolation/scaffold.py b/astartes/samplers/extrapolation/scaffold.py index 27bf4e6c..148f362c 100644 --- a/astartes/samplers/extrapolation/scaffold.py +++ b/astartes/samplers/extrapolation/scaffold.py @@ -27,9 +27,6 @@ class Scaffold(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _before_sample(self): # ensure that X contains entries that are either a SMILES string or an RDKit Molecule if not all(isinstance(i, str) for i in self.X) and not all(isinstance(i, Chem.rdchem.Mol) for i in self.X): diff --git a/astartes/samplers/extrapolation/sphere_exclusion.py b/astartes/samplers/extrapolation/sphere_exclusion.py index 7d4ceee1..4afed548 100644 --- a/astartes/samplers/extrapolation/sphere_exclusion.py +++ b/astartes/samplers/extrapolation/sphere_exclusion.py @@ -20,9 +20,6 @@ class SphereExclusion(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """Cluster X according to a Sphere Exclusion-like algorithm with arbitrary distance metrics.""" # euclidian, cosine, or city block from get_configs diff --git a/astartes/samplers/extrapolation/time_based.py b/astartes/samplers/extrapolation/time_based.py index 72c49dc2..3e6112e4 100644 --- a/astartes/samplers/extrapolation/time_based.py +++ b/astartes/samplers/extrapolation/time_based.py @@ -6,9 +6,6 @@ class TimeBased(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _before_sample(self): # verify that the user provided time as the labels (i.e. args[2]) if self.labels is None: diff --git a/astartes/samplers/interpolation/__init__.py b/astartes/samplers/interpolation/__init__.py index 6f6c1f22..b7d4c7d4 100644 --- a/astartes/samplers/interpolation/__init__.py +++ b/astartes/samplers/interpolation/__init__.py @@ -1,4 +1,3 @@ from .kennardstone import KennardStone -from .mtsd import MTSD from .random_split import Random from .spxy import SPXY diff --git a/astartes/samplers/interpolation/kennardstone.py b/astartes/samplers/interpolation/kennardstone.py index 12ec4737..e0134d02 100644 --- a/astartes/samplers/interpolation/kennardstone.py +++ b/astartes/samplers/interpolation/kennardstone.py @@ -5,9 +5,6 @@ class KennardStone(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """ Implements the Kennard-Stone algorithm diff --git a/astartes/samplers/interpolation/mtsd.py b/astartes/samplers/interpolation/mtsd.py deleted file mode 100644 index d9a82052..00000000 --- a/astartes/samplers/interpolation/mtsd.py +++ /dev/null @@ -1,5 +0,0 @@ -from astartes.samplers import AbstractSampler - - -class MTSD(AbstractSampler): - pass diff --git a/astartes/samplers/interpolation/random_split.py b/astartes/samplers/interpolation/random_split.py index 318bec02..69ce3d76 100644 --- a/astartes/samplers/interpolation/random_split.py +++ b/astartes/samplers/interpolation/random_split.py @@ -5,9 +5,6 @@ class Random(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _sample(self): """Passthrough to sklearn train_test_split""" idx_list = list(range(len(self.X))) diff --git a/astartes/samplers/interpolation/spxy.py b/astartes/samplers/interpolation/spxy.py index 69ce3039..999ea908 100644 --- a/astartes/samplers/interpolation/spxy.py +++ b/astartes/samplers/interpolation/spxy.py @@ -20,9 +20,6 @@ class SPXY(AbstractSampler): - def __init__(self, *args): - super().__init__(*args) - def _before_sample(self): if self.y is None: raise InvalidConfigurationError("SPXY sampler requires both X and y arrays. Provide y or switch to kennard_stone.") diff --git a/astartes/utils/user_utils.py b/astartes/utils/user_utils.py index 41ff343d..3343a0f1 100644 --- a/astartes/utils/user_utils.py +++ b/astartes/utils/user_utils.py @@ -16,6 +16,7 @@ def generate_regression_results_dict( val_size=0.1, test_size=0.1, print_results=False, + additional_metrics={}, ): """ Helper function to train a sklearn model using the provided data @@ -32,6 +33,8 @@ def generate_regression_results_dict( the sampler and the values being another dictionary with the corresponding hyperparameters. Defaults to {}. print_results (bool, optional): whether to print the resulting dictionary as a neat table + additional_metrics (dict, optional): mapping of name (str) to metric (func) for additional metrics + such as those in sklearn.metrics or user-provided functions Returns: dict: nested dictionary with the format of @@ -148,6 +151,11 @@ def generate_regression_results_dict( final_dict[sampler] = error_dict + for metric_name, metric_function in additional_metrics.items(): + error_dict[metric_name]["train"] = metric_function(y_train, y_pred_train) + error_dict[metric_name]["val"] = metric_function(y_val, y_pred_val) + error_dict[metric_name]["test"] = metric_function(y_test, y_pred_test) + if print_results: print(f"\nDisplaying results for {sampler} sampler") display_results_as_table(error_dict) diff --git a/docs/_sources/astartes.samplers.rst.txt b/docs/_sources/astartes.samplers.rst.txt index f3c0fc96..1d92b78a 100644 --- a/docs/_sources/astartes.samplers.rst.txt +++ b/docs/_sources/astartes.samplers.rst.txt @@ -36,14 +36,6 @@ astartes.samplers.kennard\_stone module :undoc-members: :show-inheritance: -astartes.samplers.mtsd module ------------------------------ - -.. automodule:: astartes.samplers.mtsd - :members: - :undoc-members: - :show-inheritance: - astartes.samplers.optisim module -------------------------------- diff --git a/pyproject.toml b/pyproject.toml index db7bc7ba..db0e1156 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,8 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "astartes" -# update this in astartes/__init__.py, too -version = "1.1.2" +dynamic = ["version"] authors = [ { name = "Jackson Burns", email = "jwburns@mit.edu" }, { name = "Himaghna Bhattacharjee", email = "himaghna@udel.edu" }, @@ -43,3 +42,6 @@ include-package-data = true where = ["."] include = ["astartes*"] exclude = ["docs*", "examples*", "test*"] + +[tool.setuptools.dynamic] +version = {attr = "astartes.__version__"} diff --git a/test/unit/samplers/interpolative/test_MTSD.py b/test/unit/samplers/interpolative/test_MTSD.py deleted file mode 100644 index ec3a8b41..00000000 --- a/test/unit/samplers/interpolative/test_MTSD.py +++ /dev/null @@ -1,26 +0,0 @@ -import os -import sys -import unittest - -import numpy as np - -from astartes.samplers.interpolation import mtsd - - -class Test_mtsd(unittest.TestCase): - """ - Test the various functionalities of mtsd. - """ - - @classmethod - def setUpClass(self): - """Convenience attributes for later tests.""" - return - - def test_mtsd(self): - """ """ - return - - -if __name__ == "__main__": - unittest.main()