PyOpenSci Review - Code Updates (#155)

This pull request contains updates to the source code as part of the PyOpenSci review (see pyOpenSci/software-submission#120). Each commit message contains additional clarifying details.
JacksonBurns · Oct 11, 2023 · 8e3d1b5 · 8e3d1b5
2 parents eb39380 + 132f6b3
commit 8e3d1b5
Show file tree

Hide file tree

Showing 18 changed files with 16 additions and 75 deletions.
diff --git a/README.md b/README.md
@@ -130,7 +130,7 @@ X_train, X_val, X_test = train_val_test_split(X, sampler = 'sphere_exclusion')
 ```
 You can now train your model with `X_train`, optimize your model with `X_val`, and measure its performance with `X_test`.
 
-### Evaluate the Impact of Splitting Algorithms
+### Evaluate the Impact of Splitting Algorithms on Regression Models
 For data with many features, it can be difficult to visualize how different sampling algorithms change the distribution of data into training, validation, and testing sets, as we do in some of the demo notebooks.
 To aid in analyzing the impact of the algorithms, `astartes` provides `generate_regression_results_dict`.
 This function allows users to quickly evaluate the impact of different splitting techniques on any `sklearn`-compatible model's performance.
@@ -139,10 +139,10 @@ All results are stored in a nested dictionary (`{sampler:{metric:{split:score}}}
 ```python
 from sklearn.svm import LinearSVR
 
-from astartes.utils import generate_regression_results_dict
+from astartes.utils import generate_regression_results_dict as grrd
 
 sklearn_model = LinearSVR()
-results_dict = generate_regression_results_dict(
+results_dict = grrd(
     sklearn_model,
     X,
     y,

diff --git a/astartes/__init__.py b/astartes/__init__.py
@@ -1,7 +1,6 @@
 # convenience import to enable 'from astartes import train_test_split'
 from .main import train_test_split, train_val_test_split
 
-# update this in pyproject.toml, too
 __version__ = "1.1.2"
 
 # DO NOT do this:

diff --git a/astartes/samplers/__init__.py b/astartes/samplers/__init__.py
@@ -3,12 +3,11 @@
 
 # implementations
 from .extrapolation import DBSCAN, KMeans, OptiSim, Scaffold, SphereExclusion, TimeBased
-from .interpolation import MTSD, SPXY, KennardStone, Random
+from .interpolation import SPXY, KennardStone, Random
 
 IMPLEMENTED_INTERPOLATION_SAMPLERS = (
     "random",
     "kennard_stone",
-    # "mtsd",
     "spxy",
 )
 

diff --git a/astartes/samplers/extrapolation/dbscan.py b/astartes/samplers/extrapolation/dbscan.py
@@ -4,9 +4,6 @@
 
 
 class DBSCAN(AbstractSampler):
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def _sample(self):
         """
         Implements the DBSCAN sampler to identify clusters.

diff --git a/astartes/samplers/extrapolation/kmeans.py b/astartes/samplers/extrapolation/kmeans.py
@@ -6,9 +6,6 @@
 
 
 class KMeans(AbstractSampler):
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def _sample(self):
         """Implements the K-Means sampler to identify clusters."""
         # use the sklearn kmeans model

diff --git a/astartes/samplers/extrapolation/optisim.py b/astartes/samplers/extrapolation/optisim.py
@@ -59,9 +59,6 @@
 
 
 class OptiSim(AbstractSampler):
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def _sample(self):
         """Implements the OptiSim sampler"""
         self._init_random(self.get_config("random_state", 42))

diff --git a/astartes/samplers/extrapolation/scaffold.py b/astartes/samplers/extrapolation/scaffold.py
@@ -27,9 +27,6 @@
 
 
 class Scaffold(AbstractSampler):
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def _before_sample(self):
         # ensure that X contains entries that are either a SMILES string or an RDKit Molecule
         if not all(isinstance(i, str) for i in self.X) and not all(isinstance(i, Chem.rdchem.Mol) for i in self.X):

diff --git a/astartes/samplers/extrapolation/sphere_exclusion.py b/astartes/samplers/extrapolation/sphere_exclusion.py
@@ -20,9 +20,6 @@
 
 
 class SphereExclusion(AbstractSampler):
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def _sample(self):
         """Cluster X according to a Sphere Exclusion-like algorithm with arbitrary distance metrics."""
         # euclidean, cosine, or city block from get_configs

diff --git a/astartes/samplers/extrapolation/time_based.py b/astartes/samplers/extrapolation/time_based.py
@@ -6,9 +6,6 @@
 
 
 class TimeBased(AbstractSampler):
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def _before_sample(self):
         # verify that the user provided time as the labels (i.e. args[2])
         if self.labels is None:

diff --git a/astartes/samplers/interpolation/__init__.py b/astartes/samplers/interpolation/__init__.py
@@ -1,4 +1,3 @@
 from .kennardstone import KennardStone
-from .mtsd import MTSD
 from .random_split import Random
 from .spxy import SPXY
diff --git a/astartes/samplers/interpolation/kennardstone.py b/astartes/samplers/interpolation/kennardstone.py
@@ -5,9 +5,6 @@
 
 
 class KennardStone(AbstractSampler):
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def _sample(self):
         """
         Implements the Kennard-Stone algorithm

diff --git a/astartes/samplers/interpolation/mtsd.py b/astartes/samplers/interpolation/mtsd.py
diff --git a/astartes/samplers/interpolation/random_split.py b/astartes/samplers/interpolation/random_split.py
@@ -5,9 +5,6 @@
 
 
 class Random(AbstractSampler):
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def _sample(self):
         """Passthrough to sklearn train_test_split"""
         idx_list = list(range(len(self.X)))

diff --git a/astartes/samplers/interpolation/spxy.py b/astartes/samplers/interpolation/spxy.py
@@ -20,9 +20,6 @@
 
 
 class SPXY(AbstractSampler):
-    def __init__(self, *args):
-        super().__init__(*args)
-
     def _before_sample(self):
         if self.y is None:
             raise InvalidConfigurationError("SPXY sampler requires both X and y arrays. Provide y or switch to kennard_stone.")

diff --git a/astartes/utils/user_utils.py b/astartes/utils/user_utils.py
@@ -16,6 +16,7 @@ def generate_regression_results_dict(
     val_size=0.1,
     test_size=0.1,
     print_results=False,
+    additional_metrics={},
 ):
     """
     Helper function to train a sklearn model using the provided data
@@ -32,6 +33,8 @@ def generate_regression_results_dict(
                                          the sampler and the values being another dictionary with the
                                          corresponding hyperparameters. Defaults to {}.
         print_results (bool, optional): whether to print the resulting dictionary as a neat table
+        additional_metrics (dict, optional): mapping of name (str) to metric (func), each called as func(y_true, y_pred),
+                                             such as those in sklearn.metrics or user-provided functions. Defaults to {}.
 
     Returns:
         dict: nested dictionary with the format of
@@ -148,6 +151,11 @@ def generate_regression_results_dict(
 
         final_dict[sampler] = error_dict
 
+        for metric_name, metric_function in additional_metrics.items():
+            error_dict[metric_name]["train"] = metric_function(y_train, y_pred_train)
+            error_dict[metric_name]["val"] = metric_function(y_val, y_pred_val)
+            error_dict[metric_name]["test"] = metric_function(y_test, y_pred_test)
+
         if print_results:
             print(f"\nDisplaying results for {sampler} sampler")
             display_results_as_table(error_dict)

diff --git a/docs/_sources/astartes.samplers.rst.txt b/docs/_sources/astartes.samplers.rst.txt
@@ -36,14 +36,6 @@ astartes.samplers.kennard\_stone module
    :undoc-members:
    :show-inheritance:
 
-astartes.samplers.mtsd module
------------------------------
-
-.. automodule:: astartes.samplers.mtsd
-   :members:
-   :undoc-members:
-   :show-inheritance:
-
 astartes.samplers.optisim module
 --------------------------------
 

diff --git a/pyproject.toml b/pyproject.toml
@@ -4,8 +4,7 @@ build-backend = "setuptools.build_meta"
 
 [project]
 name = "astartes"
-# update this in astartes/__init__.py, too
-version = "1.1.2"
+dynamic = ["version"]
 authors = [
     { name = "Jackson Burns", email = "[email protected]" },
     { name = "Himaghna Bhattacharjee", email = "[email protected]" },
@@ -43,3 +42,6 @@ include-package-data = true
 where = ["."]
 include = ["astartes*"]
 exclude = ["docs*", "examples*", "test*"]
+
+[tool.setuptools.dynamic]
+version = {attr = "astartes.__version__"}
diff --git a/test/unit/samplers/interpolative/test_MTSD.py b/test/unit/samplers/interpolative/test_MTSD.py