
Commit d49ed68
[FIX] Tests after rebase of reg_cocktails (#359)
* update requirements

* update requirements

* resolve remaining conflicts and fix flake and mypy

* Fix remaining tests and examples

* fix failing checks

* fix flake
ravinkohli committed Jan 24, 2022
1 parent ff6e8c4 commit d49ed68
Showing 40 changed files with 329 additions and 1,057 deletions.
90 changes: 49 additions & 41 deletions autoPyTorch/api/base_task.py
@@ -27,7 +27,7 @@

 import pandas as pd

-from smac.runhistory.runhistory import DataOrigin, RunHistory, RunInfo, RunValue
+from smac.runhistory.runhistory import DataOrigin, RunHistory
 from smac.stats.stats import Stats
 from smac.tae import StatusType

@@ -593,11 +593,16 @@ def _load_models(self) -> bool:
             raise ValueError("Resampling strategy is needed to determine what models to load")
         self.ensemble_ = self._backend.load_ensemble(self.seed)

-        if isinstance(self._disable_file_output, List):
-            disabled_file_outputs = self._disable_file_output
+        # TODO: remove this code after `fit_pipeline` is rebased.
+        if hasattr(self, '_disable_file_output'):
+            if isinstance(self._disable_file_output, List):
+                disabled_file_outputs = self._disable_file_output
+                disable_file_output = False
+            elif isinstance(self._disable_file_output, bool):
+                disable_file_output = self._disable_file_output
+                disabled_file_outputs = []
+        else:
             disable_file_output = False
-        elif isinstance(self._disable_file_output, bool):
-            disable_file_output = self._disable_file_output
             disabled_file_outputs = []

         # If no ensemble is loaded, try to get the best performing model
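Aside: the normalization this hunk introduces can be expressed as a small standalone helper. This is an illustrative sketch only (the helper name is hypothetical, not part of the commit); it assumes a bare bool toggles all file output while a list names the individual outputs to skip:

    from typing import List, Tuple, Union

    def normalize_disable_file_output(
        value: Union[bool, List[str]]
    ) -> Tuple[bool, List[str]]:
        # A list selects specific outputs to skip; a bare bool
        # toggles all file output at once.
        if isinstance(value, list):
            return False, value
        return bool(value), []

    assert normalize_disable_file_output(True) == (True, [])
    assert normalize_disable_file_output(['y_optimization']) == (False, ['y_optimization'])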
@@ -901,18 +906,15 @@ def run_traditional_ml(
             learning algorithm runs over the time limit.
         """
         assert self._logger is not None  # for mypy compliancy
-        if STRING_TO_TASK_TYPES[self.task_type] in REGRESSION_TASKS:
-            self._logger.warning("Traditional Pipeline is not enabled for regression. Skipping...")
-        else:
-            traditional_task_name = 'runTraditional'
-            self._stopwatch.start_task(traditional_task_name)
-            elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
-            time_for_traditional = int(runtime_limit - elapsed_time)
-            self._do_traditional_prediction(
-                func_eval_time_limit_secs=func_eval_time_limit_secs,
-                time_left=time_for_traditional,
-            )
-            self._stopwatch.stop_task(traditional_task_name)
+        traditional_task_name = 'runTraditional'
+        self._stopwatch.start_task(traditional_task_name)
+        elapsed_time = self._stopwatch.wall_elapsed(current_task_name)
+        time_for_traditional = int(runtime_limit - elapsed_time)
+        self._do_traditional_prediction(
+            func_eval_time_limit_secs=func_eval_time_limit_secs,
+            time_left=time_for_traditional,
+        )
+        self._stopwatch.stop_task(traditional_task_name)

     def _search(
         self,
@@ -1282,22 +1284,7 @@ def _search(
         self._logger.info("Starting Shutdown")

         if proc_ensemble is not None:
-            self._results_manager.ensemble_performance_history = list(proc_ensemble.history)
-
-            if len(proc_ensemble.futures) > 0:
-                # Also add ensemble runs that did not finish within smac time
-                # and add them into the ensemble history
-                self._logger.info("Ensemble script still running, waiting for it to finish.")
-                result = proc_ensemble.futures.pop().result()
-                if result:
-                    ensemble_history, _, _, _ = result
-                    self._results_manager.ensemble_performance_history.extend(ensemble_history)
-                self._logger.info("Ensemble script finished, continue shutdown.")
-
-            # save the ensemble performance history file
-            if len(self.ensemble_performance_history) > 0:
-                pd.DataFrame(self.ensemble_performance_history).to_json(
-                    os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
+            self._collect_results_ensemble(proc_ensemble)

         if load_models:
             self._logger.info("Loading models...")
@@ -1557,7 +1544,7 @@ def fit_pipeline(
             exclude=self.exclude_components,
             search_space_updates=self.search_space_updates)
         dataset_properties = dataset.get_dataset_properties(dataset_requirements)
-        self._backend.replace_datamanager(dataset)
+        self._backend.save_datamanager(dataset)

         if self._logger is None:
             self._logger = self._get_logger(dataset.dataset_name)
@@ -1747,7 +1734,7 @@ def fit_ensemble(
         ensemble_fit_task_name = 'EnsembleFit'
         self._stopwatch.start_task(ensemble_fit_task_name)
         if enable_traditional_pipeline:
-            if func_eval_time_limit_secs is None or func_eval_time_limit_secs > time_for_task:
+            if func_eval_time_limit_secs > time_for_task:
                 self._logger.warning(
                     'Time limit for a single run is higher than total time '
                     'limit. Capping the limit for a single run to the total '
@@ -1788,12 +1775,8 @@ def fit_ensemble(
         )

         manager.build_ensemble(self._dask_client)
-        future = manager.futures.pop()
-        result = future.result()
-        if result is None:
-            raise ValueError("Errors occurred while building the ensemble - please"
-                             " check the log file and command line output for error messages.")
-        self.ensemble_performance_history, _, _, _ = result
+        if manager is not None:
+            self._collect_results_ensemble(manager)

         if load_models:
             self._load_models()
@@ -1871,6 +1854,31 @@ def _init_ensemble_builder(

         return proc_ensemble

+    def _collect_results_ensemble(
+        self,
+        manager: EnsembleBuilderManager
+    ) -> None:
+
+        if self._logger is None:
+            raise ValueError("logger should be initialized to fit ensemble")
+
+        self._results_manager.ensemble_performance_history = list(manager.history)
+
+        if len(manager.futures) > 0:
+            # Also add ensemble runs that did not finish within smac time
+            # and add them into the ensemble history
+            self._logger.info("Ensemble script still running, waiting for it to finish.")
+            result = manager.futures.pop().result()
+            if result:
+                ensemble_history, _, _, _ = result
+                self._results_manager.ensemble_performance_history.extend(ensemble_history)
+            self._logger.info("Ensemble script finished, continue shutdown.")
+
+        # save the ensemble performance history file
+        if len(self.ensemble_performance_history) > 0:
+            pd.DataFrame(self.ensemble_performance_history).to_json(
+                os.path.join(self._backend.internals_directory, 'ensemble_history.json'))
+
     def predict(
         self,
         X_test: np.ndarray,
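Both call sites above (the shutdown path in _search and fit_ensemble) now delegate to the new _collect_results_ensemble helper, which boils down to one pattern: drain a still-pending builder future, fold its partial history into the accumulated one, then persist. A self-contained sketch of that pattern using only the standard library (function and variable names hypothetical):

    import json
    from concurrent.futures import Future
    from typing import Any, Dict, List, Optional, Tuple

    History = List[Dict[str, Any]]

    def drain_and_persist(futures: List[Future],
                          history: History,
                          path: str) -> None:
        # Block on the last pending future and fold its partial
        # history into the accumulated one.
        if futures:
            result: Optional[Tuple[History, Any, Any, Any]] = futures.pop().result()
            if result:
                extra, _, _, _ = result
                history.extend(extra)
        # Persist only when there is something worth saving.
        if history:
            with open(path, 'w') as f:
                json.dump(history, f)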
22 changes: 17 additions & 5 deletions autoPyTorch/api/tabular_classification.py
@@ -371,6 +371,18 @@ def search(
             self
         """
         if dataset_name is None:
             dataset_name = str(uuid.uuid1(clock_seq=os.getpid()))
+
+        # we have to create a logger for at this point for the validator
+        self._logger = self._get_logger(dataset_name)
+
+        # Create a validator object to make sure that the data provided by
+        # the user matches the autopytorch requirements
+        self.InputValidator = TabularInputValidator(
+            is_classification=True,
+            logger_port=self._logger_port,
+        )
+
         self.dataset, self.InputValidator = self._get_dataset_input_validator(
             X_train=X_train,
@@ -389,9 +401,9 @@ def search(
                 '(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
             )

-
         if self.dataset is None:
             raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
+
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
@@ -431,23 +443,23 @@ def predict(
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")

-        X_test = self.input_validator.feature_validator.transform(X_test)
+        X_test = self.InputValidator.feature_validator.transform(X_test)
         predicted_probabilities = super().predict(X_test, batch_size=batch_size,
                                                   n_jobs=n_jobs)

-        if self.input_validator.target_validator.is_single_column_target():
+        if self.InputValidator.target_validator.is_single_column_target():
             predicted_indexes = np.argmax(predicted_probabilities, axis=1)
         else:
             predicted_indexes = (predicted_probabilities > 0.5).astype(int)

         # Allow to predict in the original domain -- that is, the user is not interested
         # in our encoded values
-        return self.input_validator.target_validator.inverse_transform(predicted_indexes)
+        return self.InputValidator.target_validator.inverse_transform(predicted_indexes)

     def predict_proba(self,
                       X_test: Union[np.ndarray, pd.DataFrame, List],
                       batch_size: Optional[int] = None, n_jobs: int = 1) -> np.ndarray:
-        if self.input_validator is None or not self.input_validator._is_fitted:
+        if self.InputValidator is None or not self.InputValidator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")
         X_test = self.InputValidator.feature_validator.transform(X_test)
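With the validator now created inside search() and the attribute settled on self.InputValidator, the public workflow remains a two-step search-then-predict call. A minimal usage sketch (toy data; the argument values are illustrative, not taken from this commit):

    import numpy as np
    from autoPyTorch.api.tabular_classification import TabularClassificationTask

    X = np.random.rand(200, 4)
    y = np.random.randint(0, 2, size=200)

    api = TabularClassificationTask(seed=42)
    api.search(X_train=X, y_train=y,
               optimize_metric='accuracy',
               total_walltime_limit=300,
               func_eval_time_limit_secs=50)

    # predict() routes X through the fitted self.InputValidator, then
    # inverse-transforms the predicted indexes back to the original labels.
    y_pred = api.predict(X)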
14 changes: 7 additions & 7 deletions autoPyTorch/api/tabular_regression.py
@@ -84,9 +84,9 @@ def __init__(
         delete_output_folder_after_terminate: bool = True,
         include_components: Optional[Dict] = None,
         exclude_components: Optional[Dict] = None,
-        resampling_strategy:Union[CrossValTypes,
-                                  HoldoutValTypes,
-                                  NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
+        resampling_strategy: Union[CrossValTypes,
+                                   HoldoutValTypes,
+                                   NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
         resampling_strategy_args: Optional[Dict[str, Any]] = None,
         backend: Optional[Backend] = None,
         search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
@@ -386,9 +386,9 @@ def search(
                 '(CrossValTypes, HoldoutValTypes), but got {}'.format(self.resampling_strategy)
             )

-
         if self.dataset is None:
             raise ValueError("`dataset` in {} must be initialized, but got None".format(self.__class__.__name__))
+
         return self._search(
             dataset=self.dataset,
             optimize_metric=optimize_metric,
@@ -414,14 +414,14 @@ def predict(
             batch_size: Optional[int] = None,
             n_jobs: int = 1
     ) -> np.ndarray:
-        if self.input_validator is None or not self.input_validator._is_fitted:
+        if self.InputValidator is None or not self.InputValidator._is_fitted:
             raise ValueError("predict() is only supported after calling search. Kindly call first "
                              "the estimator search() method.")

-        X_test = self.input_validator.feature_validator.transform(X_test)
+        X_test = self.InputValidator.feature_validator.transform(X_test)
         predicted_values = super().predict(X_test, batch_size=batch_size,
                                            n_jobs=n_jobs)

         # Allow to predict in the original domain -- that is, the user is not interested
         # in our encoded values
-        return self.input_validator.target_validator.inverse_transform(predicted_values)
+        return self.InputValidator.target_validator.inverse_transform(predicted_values)
1 change: 0 additions & 1 deletion autoPyTorch/data/base_target_validator.py
@@ -98,7 +98,6 @@ def fit(
                     np.shape(y_test)
                 ))
             if isinstance(y_train, pd.DataFrame):
-                y_train = cast(pd.DataFrame, y_train)
                 y_test = cast(pd.DataFrame, y_test)
                 if y_train.columns.tolist() != y_test.columns.tolist():
                     raise ValueError(
7 changes: 2 additions & 5 deletions autoPyTorch/data/tabular_feature_validator.py
@@ -1,5 +1,5 @@
 import functools
-from typing import Dict, List, Optional, Tuple, Union, cast
+from typing import Dict, List, Optional, Tuple, Type, Union, cast

 import numpy as np

@@ -263,7 +263,7 @@ def transform(
             X = self.numpy_to_pandas(X)

         if hasattr(X, "iloc") and not scipy.sparse.issparse(X):
-            X = cast(pd.DataFrame, X)
+            X = cast(Type[pd.DataFrame], X)

         # Check the data here so we catch problems on new test data
         self._check_data(X)
@@ -391,9 +391,6 @@ def _get_columns_info(
                 Type of each column numerical/categorical
         """

-        if len(self.transformed_columns) > 0 and self.feat_type is not None:
-            return self.transformed_columns, self.feat_type
-
         # Register if a column needs encoding
         numerical_columns = []
         categorical_columns = []
2 changes: 1 addition & 1 deletion autoPyTorch/data/tabular_target_validator.py
@@ -1,4 +1,4 @@
-from typing import List, Optional, Union, cast
+from typing import List, Optional, cast

 import numpy as np

7 changes: 5 additions & 2 deletions autoPyTorch/datasets/resampling_strategy.py
@@ -92,7 +92,10 @@ class NoResamplingStrategyTypes(IntEnum):
 RESAMPLING_STRATEGIES = [CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]


-DEFAULT_RESAMPLING_PARAMETERS: Dict[Union[HoldoutValTypes, CrossValTypes], Dict[str, Any]] = {
+DEFAULT_RESAMPLING_PARAMETERS: Dict[Union[CrossValTypes,
+                                          HoldoutValTypes,
+                                          NoResamplingStrategyTypes],
+                                    Dict[str, Any]] = {
     HoldoutValTypes.holdout_validation: {
         'val_share': 0.33,
     },
@@ -117,7 +120,7 @@ class NoResamplingStrategyTypes(IntEnum):
     NoResamplingStrategyTypes.shuffle_no_resampling: {
         'shuffle': True
     }
-}  # type: Dict[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes], Dict[str, Any]]
+}


 class HoldOutFuncs():
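The widened key type matters to callers that index the mapping with any of the three strategy enums. A short lookup sketch, using only values visible in this diff:

    from autoPyTorch.datasets.resampling_strategy import (
        DEFAULT_RESAMPLING_PARAMETERS,
        HoldoutValTypes,
        NoResamplingStrategyTypes,
    )

    # Both lookups now type-check against the same mapping.
    val_share = DEFAULT_RESAMPLING_PARAMETERS[HoldoutValTypes.holdout_validation]['val_share']  # 0.33
    shuffle = DEFAULT_RESAMPLING_PARAMETERS[NoResamplingStrategyTypes.shuffle_no_resampling]['shuffle']  # True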
2 changes: 1 addition & 1 deletion autoPyTorch/evaluation/fit_evaluator.py
@@ -10,13 +10,13 @@

 from smac.tae import StatusType

+from autoPyTorch.automl_common.common.utils.backend import Backend
 from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes
 from autoPyTorch.evaluation.abstract_evaluator import (
     AbstractEvaluator,
     fit_and_suppress_warnings
 )
 from autoPyTorch.pipeline.components.training.metrics.base import autoPyTorchMetric
-from autoPyTorch.utils.backend import Backend
 from autoPyTorch.utils.common import subsampler
 from autoPyTorch.utils.hyperparameter_search_space_update import HyperparameterSearchSpaceUpdates

1 change: 1 addition & 0 deletions autoPyTorch/evaluation/tae.py
@@ -24,6 +24,7 @@

 import autoPyTorch.evaluation.fit_evaluator
 import autoPyTorch.evaluation.train_evaluator
+from autoPyTorch.automl_common.common.utils.backend import Backend
 from autoPyTorch.datasets.resampling_strategy import (
     CrossValTypes,
     HoldoutValTypes,
11 changes: 9 additions & 2 deletions autoPyTorch/optimizer/smbo.py
@@ -23,6 +23,7 @@
     CrossValTypes,
     DEFAULT_RESAMPLING_PARAMETERS,
     HoldoutValTypes,
+    NoResamplingStrategyTypes
 )
 from autoPyTorch.ensemble.ensemble_builder import EnsembleBuilderManager
 from autoPyTorch.evaluation.tae import ExecuteTaFuncWithQueue, get_cost_of_crash
@@ -98,11 +99,13 @@ def __init__(self,
                  pipeline_config: Dict[str, Any],
                  start_num_run: int = 1,
                  seed: int = 1,
-                 resampling_strategy: Union[HoldoutValTypes, CrossValTypes] = HoldoutValTypes.holdout_validation,
+                 resampling_strategy: Union[HoldoutValTypes,
+                                            CrossValTypes,
+                                            NoResamplingStrategyTypes] = HoldoutValTypes.holdout_validation,
                  resampling_strategy_args: Optional[Dict[str, Any]] = None,
                  include: Optional[Dict[str, Any]] = None,
                  exclude: Optional[Dict[str, Any]] = None,
-                 disable_file_output: List = [],
+                 disable_file_output: Union[bool, List[str]] = False,
                  smac_scenario_args: Optional[Dict[str, Any]] = None,
                  get_smac_object_callback: Optional[Callable] = None,
                  all_supported_metrics: bool = True,
@@ -245,6 +248,10 @@ def __init__(self,
         if portfolio_selection is not None:
             self.initial_configurations = read_return_initial_configurations(config_space=config_space,
                                                                              portfolio_selection=portfolio_selection)
+            if len(self.initial_configurations) == 0:
+                self.initial_configurations = None
+                self.logger.warning("None of the portfolio configurations are compatible"
+                                    " with the current search space. Skipping initial configuration...")

     def reset_data_manager(self) -> None:
         if self.datamanager is not None:
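Per the warning text, the guard added at the end of __init__ replaces an empty portfolio with None so the optimizer falls back to its default initial design rather than receiving an empty warm-start list. In isolation, the fallback reduces to this sketch (function name hypothetical):

    from typing import List, Optional

    def initial_configs_or_none(candidates: List[object]) -> Optional[List[object]]:
        # An empty portfolio must become None so the optimizer uses its
        # default initial design instead of an empty warm-start.
        return candidates if len(candidates) > 0 else None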