Skip to content

Commit

Permalink
Merge pull request #85 from VIDA-NYU/simplify_classes
Browse files Browse the repository at this point in the history
  • Loading branch information
roquelopez authored Nov 20, 2023
2 parents 91c5d59 + 6a5fef9 commit 5349343
Show file tree
Hide file tree
Showing 2 changed files with 350 additions and 313 deletions.
90 changes: 45 additions & 45 deletions alpha_automl/automl_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,19 +27,19 @@ def __init__(self, time_bound=15, metric=None, split_strategy='holdout', time_bo
"""
Create/instantiate a BaseAutoML object.
:param time_bound: Limit time in minutes to perform the search
:param time_bound: Limit time in minutes to perform the search.
:param metric: A str (see in the documentation the list of available metrics) or a callable object/function
:param split_strategy: Method to score the pipeline: `holdout`, `cross_validation` or an instance of
BaseCrossValidator, BaseShuffleSplit, RepeatedSplits
:param time_bound_run: Limit time in minutes to score a pipeline
:param task: The task to be solved
BaseCrossValidator, BaseShuffleSplit, RepeatedSplits.
:param time_bound_run: Limit time in minutes to score a pipeline.
:param task: The task to be solved.
:param score_sorting: The sort used to order the scores. It could be `auto`, `ascending` or `descending`.
`auto` is used for the built-in metrics. For the user-defined metrics, this param must be passed.
:param metric_kwargs: Additional arguments for metric
:param metric_kwargs: Additional arguments for metric.
:param split_strategy_kwargs: Additional arguments for splitting_strategy.
:param output_folder: Path to the output directory. If it is None, create a temp folder automatically.
:param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`.
:param verbose: Whether or not to show additional logs
:param verbose: The logs level.
"""

hide_logs(verbose)
Expand Down Expand Up @@ -295,9 +295,9 @@ def get_serialized_pipeline(self, pipeline_id=None):
return serialized_pipeline


class AutoMLClassifier(BaseAutoML):
class ClassifierBaseAutoML(BaseAutoML):

def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdout', time_bound_run=5,
def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdout', time_bound_run=5, task=None,
score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None,
start_mode='auto', verbose=logging.INFO):
"""
Expand All @@ -308,16 +308,16 @@ def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdo
:param split_strategy: Method to score the pipeline: `holdout`, `cross_validation` or an instance of
BaseCrossValidator, BaseShuffleSplit, RepeatedSplits.
:param time_bound_run: Limit time in minutes to score a pipeline.
:param task: The task to be solved.
:param score_sorting: The sort used to order the scores. It could be `auto`, `ascending` or `descending`.
`auto` is used for the built-in metrics. For the user-defined metrics, this param must be passed.
:param metric_kwargs: Additional arguments for metric.
:param split_strategy_kwargs: Additional arguments for splitting_strategy.
:param output_folder: Path to the output directory. If it is None, create a temp folder automatically.
:param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`.
:param verbose: Whether or not to show additional logs.
:param verbose: The logs level.
"""

task = 'CLASSIFICATION'
super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs,
split_strategy_kwargs, output_folder, start_mode, verbose)

Expand Down Expand Up @@ -351,6 +351,33 @@ def score_pipeline(self, X, y, pipeline_id):
return super().score_pipeline(X, y, pipeline_id)


class AutoMLClassifier(ClassifierBaseAutoML):

    def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdout', time_bound_run=5,
                 score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None,
                 start_mode='auto', verbose=logging.INFO):
        """
        Build an AutoMLClassifier, i.e. a ClassifierBaseAutoML whose task is fixed to `CLASSIFICATION`.

        :param time_bound: Time limit, in minutes, for the whole search.
        :param metric: Either a str naming one of the available metrics (see the documentation) or a callable
            object/function.
        :param split_strategy: How pipelines are scored: `holdout`, `cross_validation` or an instance of
            BaseCrossValidator, BaseShuffleSplit, RepeatedSplits.
        :param time_bound_run: Time limit, in minutes, for scoring a single pipeline.
        :param score_sorting: Order applied to the scores: `auto`, `ascending` or `descending`.
            `auto` is used for the built-in metrics. For the user-defined metrics, this param must be passed.
        :param metric_kwargs: Extra arguments for the metric.
        :param split_strategy_kwargs: Extra arguments for the splitting strategy.
        :param output_folder: Path to the output directory. When None, a temp folder is created automatically.
        :param start_mode: Start mode for the multiprocessing library: `auto`, `fork` or `spawn`.
        :param verbose: The logs level.
        """

        # The task is not configurable here; every other option is forwarded unchanged to the parent.
        super().__init__(
            time_bound=time_bound,
            metric=metric,
            split_strategy=split_strategy,
            time_bound_run=time_bound_run,
            task='CLASSIFICATION',
            score_sorting=score_sorting,
            metric_kwargs=metric_kwargs,
            split_strategy_kwargs=split_strategy_kwargs,
            output_folder=output_folder,
            start_mode=start_mode,
            verbose=verbose,
        )


class AutoMLRegressor(BaseAutoML):

def __init__(self, time_bound=15, metric='mean_absolute_error', split_strategy='holdout', time_bound_run=5,
Expand All @@ -370,7 +397,7 @@ def __init__(self, time_bound=15, metric='mean_absolute_error', split_strategy='
:param split_strategy_kwargs: Additional arguments for splitting_strategy.
:param output_folder: Path to the output directory. If it is None, create a temp folder automatically.
:param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`.
:param verbose: Whether or not to show additional logs.
:param verbose: The logs level.
"""

task = 'REGRESSION'
Expand All @@ -396,7 +423,7 @@ def __init__(self, time_bound=15, metric='mean_squared_error', split_strategy='t
:param split_strategy_kwargs: Additional arguments for TimeSeriesSplit, E.g. n_splits and test_size(int).
:param output_folder: Path to the output directory. If it is None, create a temp folder automatically.
:param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`.
:param verbose: Whether or not to show additional logs.
:param verbose: The logs level.
"""

task = 'TIME_SERIES_FORECAST'
Expand All @@ -419,11 +446,11 @@ def fit(self, X, y=None):
super().fit(X, y)


class AutoMLSemiSupervisedClassifier(BaseAutoML):
class AutoMLSemiSupervisedClassifier(ClassifierBaseAutoML):

def __init__(self, time_bound=15, metric='f1_score', split_strategy='holdout', time_bound_run=5,
score_sorting='auto', metric_kwargs={'average': 'micro'}, split_strategy_kwargs=None,
output_folder=None, start_mode='auto', verbose=logging.INFO):
def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdout', time_bound_run=5,
score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None,
start_mode='auto', verbose=logging.INFO):
"""
Create/instantiate an AutoMLSemiSupervisedClassifier object.
Expand All @@ -439,42 +466,15 @@ def __init__(self, time_bound=15, metric='f1_score', split_strategy='holdout', t
and `test_size` (test proportion from 0 to 1) can be passed to the splitter.
:param output_folder: Path to the output directory. If it is None, create a temp folder automatically.
:param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`.
:param verbose: Whether or not to show additional logs.
:param verbose: The logs level.
"""

task = 'SEMISUPERVISED'
super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs,
split_strategy_kwargs, output_folder, start_mode, verbose)

if split_strategy_kwargs is None:
split_strategy_kwargs = {'test_size': 0.2}
split_strategy_kwargs = {'test_size': 0.25}

self.splitter = SemiSupervisedSplitter(**split_strategy_kwargs)
self.label_encoder = SemiSupervisedLabelEncoder()

def fit(self, X, y):
y = self.label_encoder.fit_transform(y)
super().fit(X, y)

def predict(self, X):
predictions = super().predict(X)

return self.label_encoder.inverse_transform(predictions)

def score(self, X, y):
y = self.label_encoder.transform(y)

return super().score(X, y)

def fit_pipeline(self, pipeline_id):
super().fit_pipeline(pipeline_id)

def predict_pipeline(self, X, pipeline_id):
predictions = super().predict_pipeline(X, pipeline_id)

return self.label_encoder.inverse_transform(predictions)

def score_pipeline(self, X, y, pipeline_id):
y = self.label_encoder.transform(y)

return super().score_pipeline(X, y, pipeline_id)
Loading

0 comments on commit 5349343

Please sign in to comment.