Skip to content

Commit

Permalink
PyOpenSci Review - Code Updates (#155)
Browse files Browse the repository at this point in the history
This pull request contains updates to the source code as part of the
PyOpenSci review (see
pyOpenSci/software-submission#120).

Each commit message contains additional clarifying details.
  • Loading branch information
JacksonBurns committed Oct 11, 2023
2 parents eb39380 + 132f6b3 commit 8e3d1b5
Show file tree
Hide file tree
Showing 18 changed files with 16 additions and 75 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -130,7 +130,7 @@ X_train, X_val, X_test = train_val_test_split(X, sampler = 'sphere_exclusion')
```
You can now train your model with `X_train`, optimize your model with `X_val`, and measure its performance with `X_test`.

### Evaluate the Impact of Splitting Algorithms
### Evaluate the Impact of Splitting Algorithms on Regression Models
For data with many features, it can be difficult to visualize how different sampling algorithms change the distribution of data into training, validation, and testing sets, as we do in some of the demo notebooks.
To aid in analyzing the impact of the algorithms, `astartes` provides `generate_regression_results_dict`.
This function allows users to quickly evaluate the impact of different splitting techniques on any `sklearn`-compatible model's performance.
Expand All @@ -139,10 +139,10 @@ All results are stored in a nested dictionary (`{sampler:{metric:{split:score}}}
```python
from sklearn.svm import LinearSVR

from astartes.utils import generate_regression_results_dict
from astartes.utils import generate_regression_results_dict as grrd

sklearn_model = LinearSVR()
results_dict = generate_regression_results_dict(
results_dict = grrd(
sklearn_model,
X,
y,
Expand Down
1 change: 0 additions & 1 deletion astartes/__init__.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# convenience import to enable 'from astartes import train_test_split'
from .main import train_test_split, train_val_test_split

# update this in pyproject.toml, too
__version__ = "1.1.2"

# DO NOT do this:
Expand Down
3 changes: 1 addition & 2 deletions astartes/samplers/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@

# implementations
from .extrapolation import DBSCAN, KMeans, OptiSim, Scaffold, SphereExclusion, TimeBased
from .interpolation import MTSD, SPXY, KennardStone, Random
from .interpolation import SPXY, KennardStone, Random

IMPLEMENTED_INTERPOLATION_SAMPLERS = (
"random",
"kennard_stone",
# "mtsd",
"spxy",
)

Expand Down
3 changes: 0 additions & 3 deletions astartes/samplers/extrapolation/dbscan.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,6 @@


class DBSCAN(AbstractSampler):
def __init__(self, *args):
super().__init__(*args)

def _sample(self):
"""
Implements the DBSCAN sampler to identify clusters.
Expand Down
3 changes: 0 additions & 3 deletions astartes/samplers/extrapolation/kmeans.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@


class KMeans(AbstractSampler):
def __init__(self, *args):
super().__init__(*args)

def _sample(self):
"""Implements the K-Means sampler to identify clusters."""
# use the sklearn kmeans model
Expand Down
3 changes: 0 additions & 3 deletions astartes/samplers/extrapolation/optisim.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,6 @@


class OptiSim(AbstractSampler):
def __init__(self, *args):
super().__init__(*args)

def _sample(self):
"""Implements the OptiSim sampler"""
self._init_random(self.get_config("random_state", 42))
Expand Down
3 changes: 0 additions & 3 deletions astartes/samplers/extrapolation/scaffold.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,9 +27,6 @@


class Scaffold(AbstractSampler):
def __init__(self, *args):
super().__init__(*args)

def _before_sample(self):
# ensure that X contains entries that are either a SMILES string or an RDKit Molecule
if not all(isinstance(i, str) for i in self.X) and not all(isinstance(i, Chem.rdchem.Mol) for i in self.X):
Expand Down
3 changes: 0 additions & 3 deletions astartes/samplers/extrapolation/sphere_exclusion.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,6 @@


class SphereExclusion(AbstractSampler):
def __init__(self, *args):
super().__init__(*args)

def _sample(self):
"""Cluster X according to a Sphere Exclusion-like algorithm with arbitrary distance metrics."""
# Euclidean, cosine, or city block from get_configs
Expand Down
3 changes: 0 additions & 3 deletions astartes/samplers/extrapolation/time_based.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,6 @@


class TimeBased(AbstractSampler):
def __init__(self, *args):
super().__init__(*args)

def _before_sample(self):
# verify that the user provided time as the labels (i.e. args[2])
if self.labels is None:
Expand Down
1 change: 0 additions & 1 deletion astartes/samplers/interpolation/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
from .kennardstone import KennardStone
from .mtsd import MTSD
from .random_split import Random
from .spxy import SPXY
3 changes: 0 additions & 3 deletions astartes/samplers/interpolation/kennardstone.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@


class KennardStone(AbstractSampler):
def __init__(self, *args):
super().__init__(*args)

def _sample(self):
"""
Implements the Kennard-Stone algorithm
Expand Down
5 changes: 0 additions & 5 deletions astartes/samplers/interpolation/mtsd.py

This file was deleted.

3 changes: 0 additions & 3 deletions astartes/samplers/interpolation/random_split.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,9 +5,6 @@


class Random(AbstractSampler):
def __init__(self, *args):
super().__init__(*args)

def _sample(self):
"""Passthrough to sklearn train_test_split"""
idx_list = list(range(len(self.X)))
Expand Down
3 changes: 0 additions & 3 deletions astartes/samplers/interpolation/spxy.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,6 @@


class SPXY(AbstractSampler):
def __init__(self, *args):
super().__init__(*args)

def _before_sample(self):
if self.y is None:
raise InvalidConfigurationError("SPXY sampler requires both X and y arrays. Provide y or switch to kennard_stone.")
Expand Down
8 changes: 8 additions & 0 deletions astartes/utils/user_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ def generate_regression_results_dict(
val_size=0.1,
test_size=0.1,
print_results=False,
additional_metrics={},
):
"""
Helper function to train a sklearn model using the provided data
Expand All @@ -32,6 +33,8 @@ def generate_regression_results_dict(
the sampler and the values being another dictionary with the
corresponding hyperparameters. Defaults to {}.
print_results (bool, optional): whether to print the resulting dictionary as a neat table
additional_metrics (dict, optional): mapping of name (str) to metric (func) for additional metrics
such as those in sklearn.metrics or user-provided functions
Returns:
dict: nested dictionary with the format of
Expand Down Expand Up @@ -148,6 +151,11 @@ def generate_regression_results_dict(

final_dict[sampler] = error_dict

for metric_name, metric_function in additional_metrics.items():
error_dict[metric_name]["train"] = metric_function(y_train, y_pred_train)
error_dict[metric_name]["val"] = metric_function(y_val, y_pred_val)
error_dict[metric_name]["test"] = metric_function(y_test, y_pred_test)

if print_results:
print(f"\nDisplaying results for {sampler} sampler")
display_results_as_table(error_dict)
Expand Down
8 changes: 0 additions & 8 deletions docs/_sources/astartes.samplers.rst.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,14 +36,6 @@ astartes.samplers.kennard\_stone module
:undoc-members:
:show-inheritance:

astartes.samplers.mtsd module
-----------------------------

.. automodule:: astartes.samplers.mtsd
:members:
:undoc-members:
:show-inheritance:

astartes.samplers.optisim module
--------------------------------

Expand Down
6 changes: 4 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "astartes"
# update this in astartes/__init__.py, too
version = "1.1.2"
dynamic = ["version"]
authors = [
{ name = "Jackson Burns", email = "[email protected]" },
{ name = "Himaghna Bhattacharjee", email = "[email protected]" },
Expand Down Expand Up @@ -43,3 +42,6 @@ include-package-data = true
where = ["."]
include = ["astartes*"]
exclude = ["docs*", "examples*", "test*"]

[tool.setuptools.dynamic]
version = {attr = "astartes.__version__"}
26 changes: 0 additions & 26 deletions test/unit/samplers/interpolative/test_MTSD.py

This file was deleted.

0 comments on commit 8e3d1b5

Please sign in to comment.