[rebase] Rebase to the latest version and merge test_evaluator to train_evaluator

Since test_evaluator can be merged into train_evaluator, I merged it.

* [rebase] Rebase and merge the changes in non-test files without issues
* [refactor] Merge test- and train-evaluator
* [fix] Fix the import error due to the change xxx_evaluator --> evaluator
* [test] Fix errors in tests
* [fix] Fix the handling of test predictions in no resampling
* [refactor] Move save_y_opt=False for no resampling deeper for simplicity
* [test] Increase the budget size for no-resampling tests
* [test] [fix] Rebase, modify tests, and increase the coverage
nabenabe0928 committed Feb 23, 2022
1 parent a7be464 commit 8d9c132
Showing 16 changed files with 465 additions and 1,012 deletions.
10 changes: 5 additions & 5 deletions autoPyTorch/api/base_task.py
@@ -315,7 +315,7 @@ def _get_dataset_input_validator(
Testing feature set
y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]):
Testing target set
-resampling_strategy (Optional[RESAMPLING_STRATEGIES]):
+resampling_strategy (Optional[ResamplingStrategies]):
Strategy to split the training data. If None, uses
HoldoutValTypes.holdout_validation.
resampling_strategy_args (Optional[Dict[str, Any]]):
@@ -355,7 +355,7 @@ def get_dataset(
Testing feature set
y_test (Optional[Union[List, pd.DataFrame, np.ndarray]]):
Testing target set
-resampling_strategy (Optional[RESAMPLING_STRATEGIES]):
+resampling_strategy (Optional[ResamplingStrategies]):
Strategy to split the training data. If None, uses
HoldoutValTypes.holdout_validation.
resampling_strategy_args (Optional[Dict[str, Any]]):
@@ -973,7 +973,7 @@ def _search(
`SMAC <https://automl.github.io/SMAC3/master/index.html>`_.
tae_func (Optional[Callable]):
TargetAlgorithm to be optimised. If None, `eval_function`
-available in autoPyTorch/evaluation/train_evaluator is used.
+available in autoPyTorch/evaluation/evaluator is used.
Must be child class of AbstractEvaluator.
all_supported_metrics (bool: default=True):
If True, all metrics supporting current task will be calculated
@@ -1380,7 +1380,7 @@ def fit_pipeline(
X_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
y_test: Optional[Union[List, pd.DataFrame, np.ndarray]] = None,
dataset_name: Optional[str] = None,
-resampling_strategy: Optional[Union[HoldoutValTypes, CrossValTypes, NoResamplingStrategyTypes]] = None,
+resampling_strategy: Optional[ResamplingStrategies] = None,
resampling_strategy_args: Optional[Dict[str, Any]] = None,
run_time_limit_secs: int = 60,
memory_limit: Optional[int] = None,
@@ -1415,7 +1415,7 @@ def fit_pipeline(
be provided to track the generalization performance of each stage.
dataset_name (Optional[str]):
Name of the dataset, if None, random value is used.
-resampling_strategy (Optional[RESAMPLING_STRATEGIES]):
+resampling_strategy (Optional[ResamplingStrategies]):
Strategy to split the training data. If None, uses
HoldoutValTypes.holdout_validation.
resampling_strategy_args (Optional[Dict[str, Any]]):
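To make the annotation change concrete, here is a minimal sketch of a call site that exercises the new ResamplingStrategies union (hypothetical usage; X_train and y_train are placeholders, not part of the diff):

    from autoPyTorch.api.tabular_classification import TabularClassificationTask
    from autoPyTorch.datasets.resampling_strategy import NoResamplingStrategyTypes

    # Any CrossValTypes, HoldoutValTypes, or NoResamplingStrategyTypes member is accepted.
    api = TabularClassificationTask(resampling_strategy=NoResamplingStrategyTypes.no_resampling)
    api.search(X_train=X_train, y_train=y_train, optimize_metric='accuracy', total_walltime_limit=300)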
2 changes: 1 addition & 1 deletion autoPyTorch/api/tabular_classification.py
@@ -330,7 +330,7 @@ def search(
`SMAC <https://automl.github.io/SMAC3/master/index.html>`_.
tae_func (Optional[Callable]):
TargetAlgorithm to be optimised. If None, `eval_function`
-available in autoPyTorch/evaluation/train_evaluator is used.
+available in autoPyTorch/evaluation/evaluator is used.
Must be child class of AbstractEvaluator.
all_supported_metrics (bool: default=True):
If True, all metrics supporting current task will be calculated
2 changes: 1 addition & 1 deletion autoPyTorch/api/tabular_regression.py
@@ -331,7 +331,7 @@ def search(
`SMAC <https://automl.github.io/SMAC3/master/index.html>`_.
tae_func (Optional[Callable]):
TargetAlgorithm to be optimised. If None, `eval_function`
-available in autoPyTorch/evaluation/train_evaluator is used.
+available in autoPyTorch/evaluation/evaluator is used.
Must be child class of AbstractEvaluator.
all_supported_metrics (bool: default=True):
If True, all metrics supporting current task will be calculated
8 changes: 8 additions & 0 deletions autoPyTorch/datasets/resampling_strategy.py
@@ -93,6 +93,14 @@ def is_stratified(self) -> bool:
# TODO: replace it with another way
ResamplingStrategies = Union[CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes]


+def check_resampling_strategy(resampling_strategy: Optional[ResamplingStrategies]) -> None:
+    choices = (CrossValTypes, HoldoutValTypes, NoResamplingStrategyTypes)
+    if not isinstance(resampling_strategy, choices):
+        rs_names = [rs.__name__ for rs in choices]
+        raise ValueError(f'resampling_strategy must be in {rs_names}, but got {resampling_strategy}')


DEFAULT_RESAMPLING_PARAMETERS: Dict[
    ResamplingStrategies,
    Dict[str, Any]
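As a quick illustration, a sketch of how the new check_resampling_strategy helper behaves (the calls are hypothetical, not part of the diff):

    from autoPyTorch.datasets.resampling_strategy import (
        HoldoutValTypes,
        check_resampling_strategy,
    )

    check_resampling_strategy(HoldoutValTypes.holdout_validation)  # valid strategy: returns None
    check_resampling_strategy(None)  # raises ValueError listing the accepted strategy types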
246 changes: 108 additions & 138 deletions autoPyTorch/evaluation/abstract_evaluator.py
@@ -167,47 +167,87 @@ class FixedPipelineParams(NamedTuple):
        search_space_updates (Optional[HyperparameterSearchSpaceUpdates]):
            An object used to fine tune the hyperparameter search space of the pipeline
    """
-    def __init__(self, backend: Backend,
-                 queue: Queue,
-                 metric: autoPyTorchMetric,
-                 budget: float,
-                 configuration: Union[int, str, Configuration],
-                 budget_type: str = None,
-                 pipeline_config: Optional[Dict[str, Any]] = None,
-                 seed: int = 1,
-                 output_y_hat_optimization: bool = True,
-                 num_run: Optional[int] = None,
-                 include: Optional[Dict[str, Any]] = None,
-                 exclude: Optional[Dict[str, Any]] = None,
-                 disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None,
-                 init_params: Optional[Dict[str, Any]] = None,
-                 logger_port: Optional[int] = None,
-                 all_supported_metrics: bool = True,
-                 search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None
-                 ) -> None:
-
-        self.starttime = time.time()
-
-        self.configuration = configuration
-        self.backend: Backend = backend
-        self.queue = queue
-
-        self.include = include
-        self.exclude = exclude
-        self.search_space_updates = search_space_updates
-
-        self.metric = metric
-
-
-        self._init_datamanager_info()
-
-        # Flag to save target for ensemble
-        self.output_y_hat_optimization = output_y_hat_optimization
+    backend: Backend
+    seed: int
+    metric: autoPyTorchMetric
+    budget_type: str  # Literal['epochs', 'runtime']
+    pipeline_config: Dict[str, Any]
+    save_y_opt: bool = True
+    include: Optional[Dict[str, Any]] = None
+    exclude: Optional[Dict[str, Any]] = None
+    disable_file_output: Optional[List[Union[str, DisableFileOutputParameters]]] = None
+    logger_port: Optional[int] = None
+    all_supported_metrics: bool = True
+    search_space_updates: Optional[HyperparameterSearchSpaceUpdates] = None

+    @classmethod
+    def with_default_pipeline_config(
+        cls,
+        pipeline_config: Optional[Dict[str, Any]] = None,
+        choice: str = 'default',
+        **kwargs: Any
+    ) -> 'FixedPipelineParams':
+
+        if 'budget_type' in kwargs:
+            raise TypeError(
+                f'{cls.__name__}.with_default_pipeline_config() got multiple values for argument `budget_type`'
+            )
+
+        budget_type_choices = ('epochs', 'runtime')
+        if pipeline_config is None:
+            pipeline_config = get_default_pipeline_config(choice=choice)
+        if 'budget_type' not in pipeline_config:
+            raise ValueError('pipeline_config must have `budget_type`')
+
+        budget_type = pipeline_config['budget_type']
+        if pipeline_config['budget_type'] not in budget_type_choices:
+            raise ValueError(f"budget_type must be in {budget_type_choices}, but got {budget_type}")
+
+        kwargs.update(pipeline_config=pipeline_config, budget_type=budget_type)
+        return cls(**kwargs)


+class EvaluatorParams(NamedTuple):
+    """
+    Attributes:
+        configuration (Union[int, str, Configuration]):
+            Determines the pipeline to be constructed. A dummy estimator is created for
+            integer configurations, a traditional machine learning pipeline is created
+            for string based configuration, and NAS is performed when a configuration
+            object is passed.
+        num_run (Optional[int]):
+            An identifier of the current configuration being fit. This number is unique per
+            configuration.
+        init_params (Optional[Dict[str, Any]]):
+            Optional argument that is passed to each pipeline step. It is the equivalent of
+            kwargs for the pipeline steps.
+    """
+    budget: float
+    configuration: Union[int, str, Configuration]
+    num_run: Optional[int] = None
+    init_params: Optional[Dict[str, Any]] = None
+
+    @classmethod
+    def with_default_budget(
+        cls,
+        budget: float = 0,
+        choice: str = 'default',
+        **kwargs: Any
+    ) -> 'EvaluatorParams':
+        budget = get_default_budget(choice=choice) if budget == 0 else budget
+        kwargs.update(budget=budget)
+        return cls(**kwargs)


class AbstractEvaluator(object):
"""
This method defines the interface that pipeline evaluators should follow, when
interacting with SMAC through TargetAlgorithmQuery.
An evaluator is an object that:
+ constructs a pipeline (i.e. a classification or regression estimator) for a given
pipeline_config and run settings (budget, seed)
+ Fits and trains this pipeline (TrainEvaluator) or tests a given
+ Fits and trains this pipeline (Evaluator) or tests a given
configuration (TestEvaluator)
The provided configuration determines the type of pipeline created. For more
@@ -244,21 +284,33 @@ def _init_miscellaneous(self) -> None:
            DisableFileOutputParameters.check_compatibility(disable_file_output)
            self.disable_file_output = disable_file_output
        else:
-        if isinstance(self.configuration, int):
-            self.pipeline_class = DummyClassificationPipeline
-        elif isinstance(self.configuration, str):
-            if self.task_type in TABULAR_TASKS:
-                self.pipeline_class = MyTraditionalTabularClassificationPipeline
-            else:
-                raise ValueError("Only tabular tasks are currently supported with traditional methods")
-        elif isinstance(self.configuration, Configuration):
-            if self.task_type in TABULAR_TASKS:
-                self.pipeline_class = autoPyTorch.pipeline.tabular_classification.TabularClassificationPipeline
-            elif self.task_type in IMAGE_TASKS:
-                self.pipeline_class = autoPyTorch.pipeline.image_classification.ImageClassificationPipeline
-            else:
-                raise ValueError('task {} not available'.format(self.task_type))
-        self.predict_function = self._predict_proba
            self.disable_file_output = []

+        if self.num_folds == 1:  # not save cv model when we perform holdout
+            self.disable_file_output.append('cv_model')

+    def _init_dataset_properties(self) -> None:
+        datamanager: BaseDataset = self.fixed_pipeline_params.backend.load_datamanager()
+        if datamanager.task_type is None:
+            raise ValueError(f"Expected dataset {datamanager.__class__.__name__} to have task_type got None")
+        if datamanager.splits is None:
+            raise ValueError(f"cannot fit pipeline {self.__class__.__name__} with datamanager.splits None")
+
+        self.splits = datamanager.splits
+        self.num_folds: int = len(self.splits)
+        # Since cv might not finish in time, we take self.pipelines as None by default
+        self.pipelines: List[Optional[BaseEstimator]] = [None] * self.num_folds
+        self.task_type = STRING_TO_TASK_TYPES[datamanager.task_type]
+        self.num_classes = getattr(datamanager, 'num_classes', 1)
+        self.output_type = datamanager.output_type
+
+        search_space_updates = self.fixed_pipeline_params.search_space_updates
+        self.dataset_properties = datamanager.get_dataset_properties(
+            get_dataset_requirements(info=datamanager.get_required_dataset_info(),
+                                     include=self.fixed_pipeline_params.include,
+                                     exclude=self.fixed_pipeline_params.exclude,
+                                     search_space_updates=search_space_updates
+                                     ))
+
+        self.X_train, self.y_train = datamanager.train_tensors
+        self.unique_train_labels = [
@@ -271,6 +323,8 @@ def _init_miscellaneous(self) -> None:
        if datamanager.test_tensors is not None:
            self.X_test, self.y_test = datamanager.test_tensors

+        del datamanager  # Delete datamanager to release the memory

    def _init_additional_metrics(self) -> None:
        all_supported_metrics = self.fixed_pipeline_params.all_supported_metrics
        metric = self.fixed_pipeline_params.metric
Expand All @@ -282,59 +336,7 @@ def _init_additional_metrics(self) -> None:
                                              all_supported_metrics=all_supported_metrics)
        self.metrics_dict = {'additional_metrics': [m.name for m in [metric] + self.additional_metrics]}

-    def _init_datamanager_info(
-        self,
-    ) -> None:
-        """
-        Initialises instance attributes that come from the datamanager.
-        For example,
-        X_train, y_train, etc.
-        """
-
-        datamanager: BaseDataset = self.backend.load_datamanager()
-
-        assert datamanager.task_type is not None, \
-            "Expected dataset {} to have task_type got None".format(datamanager.__class__.__name__)
-        self.task_type = STRING_TO_TASK_TYPES[datamanager.task_type]
-        self.output_type = STRING_TO_OUTPUT_TYPES[datamanager.output_type]
-        self.issparse = datamanager.issparse
-
-        self.X_train, self.y_train = datamanager.train_tensors
-
-        if datamanager.val_tensors is not None:
-            self.X_valid, self.y_valid = datamanager.val_tensors
-        else:
-            self.X_valid, self.y_valid = None, None
-
-        if datamanager.test_tensors is not None:
-            self.X_test, self.y_test = datamanager.test_tensors
-        else:
-            self.X_test, self.y_test = None, None
-
-        self.resampling_strategy = datamanager.resampling_strategy
-
-        self.num_classes: Optional[int] = getattr(datamanager, "num_classes", None)
-
-        self.dataset_properties = datamanager.get_dataset_properties(
-            get_dataset_requirements(info=datamanager.get_required_dataset_info(),
-                                     include=self.include,
-                                     exclude=self.exclude,
-                                     search_space_updates=self.search_space_updates
-                                     ))
-        self.splits = datamanager.splits
-        if self.splits is None:
-            raise AttributeError(f"create_splits on {datamanager.__class__.__name__} must be called "
-                                 f"before the instantiation of {self.__class__.__name__}")
-
-        # delete datamanager from memory
-        del datamanager

-    def _init_fit_dictionary(
-        self,
-        logger_port: int,
-        pipeline_config: Dict[str, Any],
-        metrics_dict: Optional[Dict[str, List[str]]] = None,
-    ) -> None:
+    def _init_fit_dictionary(self) -> None:
"""
Initialises the fit dictionary
@@ -617,36 +619,4 @@ def _is_output_possible(
        if y is not None and not np.all(np.isfinite(y)):
            return False  # Model predictions contain NaNs

-        Args:
-            prediction (np.ndarray):
-                The un-formatted predictions of a pipeline
-            Y_train (np.ndarray):
-                The labels from the dataset to give an intuition of the expected
-                predictions dimensionality
-        Returns:
-            (np.ndarray):
-                The formatted prediction
-        """
-        assert self.num_classes is not None, "Called function on wrong task"
-
-        if self.output_type == MULTICLASS and \
-                prediction.shape[1] < self.num_classes:
-            if Y_train is None:
-                raise ValueError('Y_train must not be None!')
-            classes = list(np.unique(Y_train))
-
-            mapping = dict()
-            for class_number in range(self.num_classes):
-                if class_number in classes:
-                    index = classes.index(class_number)
-                    mapping[index] = class_number
-            new_predictions = np.zeros((prediction.shape[0], self.num_classes),
-                                       dtype=np.float32)
-
-            for index in mapping:
-                class_index = mapping[index]
-                new_predictions[:, class_index] = prediction[:, index]
-
-            return new_predictions
-
-        return prediction
+        return True
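The block removed above padded multiclass probability predictions back to the full class count when a resampling split was missing some classes. A standalone sketch of that remapping (a hypothetical helper assuming numpy inputs, not the commit's own code):

    import numpy as np

    def pad_multiclass_predictions(prediction: np.ndarray, y_train: np.ndarray, num_classes: int) -> np.ndarray:
        # Column i of `prediction` corresponds to the i-th class actually present in y_train.
        if prediction.shape[1] >= num_classes:
            return prediction
        seen_classes = list(np.unique(y_train))
        padded = np.zeros((prediction.shape[0], num_classes), dtype=np.float32)
        for col, cls in enumerate(seen_classes):
            padded[:, cls] = prediction[:, col]  # scatter each column back to its true class index
        return padded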
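And a sketch of how the two NamedTuples introduced in this file might be built through their factory methods (hypothetical usage; backend, accuracy, and config stand in for a real Backend, autoPyTorchMetric, and Configuration):

    # Run-independent settings; the factory injects pipeline_config and budget_type.
    fixed_params = FixedPipelineParams.with_default_pipeline_config(
        backend=backend,
        seed=1,
        metric=accuracy,
    )

    # Per-configuration settings; budget=0 falls back to the default budget.
    eval_params = EvaluatorParams.with_default_budget(
        configuration=config,  # int -> dummy, str -> traditional ML, Configuration -> NAS
        num_run=1,
    )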
