diff --git a/.gitignore b/.gitignore index 190eb0dc..2d498ce2 100644 --- a/.gitignore +++ b/.gitignore @@ -60,3 +60,6 @@ lightning_logs .cache **/smac3_output /venv + +.vscode +**/.vscode \ No newline at end of file diff --git a/alpha_automl/automl_api.py b/alpha_automl/automl_api.py index d0ff15e5..a261f89d 100644 --- a/alpha_automl/automl_api.py +++ b/alpha_automl/automl_api.py @@ -24,7 +24,7 @@ class BaseAutoML(): def __init__(self, time_bound=15, metric=None, split_strategy='holdout', time_bound_run=5, task=None, score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None, - num_cpus=None, start_mode='auto', verbose=logging.INFO): + checkpoints_folder=None, num_cpus=None, start_mode='auto', verbose=logging.INFO): """ Create/instantiate an BaseAutoML object. @@ -39,6 +39,8 @@ def __init__(self, time_bound=15, metric=None, split_strategy='holdout', time_bo :param metric_kwargs: Additional arguments for metric. :param split_strategy_kwargs: Additional arguments for splitting_strategy. :param output_folder: Path to the output directory. If it is None, create a temp folder automatically. + :param checkpoints_folder: Path to the directory to load and save the checkpoints. If it is None, + it will use the default checkpoints and save the new checkpoints in output_folder. :param num_cpus: Number of CPUs to be used. :param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`. :param verbose: The logs level. @@ -60,11 +62,11 @@ def __init__(self, time_bound=15, metric=None, split_strategy='holdout', time_bo self.X = None self.y = None self.leaderboard = None - self.automl_manager = AutoMLManager(self.output_folder, time_bound, time_bound_run, task, num_cpus, verbose) - self._start_method = get_start_method(start_mode) - set_start_method(self._start_method, force=True) - check_input_for_multiprocessing(self._start_method, self.scorer._score_func, 'metric') - check_input_for_multiprocessing(self._start_method, self.splitter, 'split strategy') + self.automl_manager = AutoMLManager(self.output_folder, checkpoints_folder, time_bound, time_bound_run, task, num_cpus, verbose) + #self._start_method = get_start_method(start_mode) + #set_start_method(self._start_method, force=True) + #check_input_for_multiprocessing(self._start_method, self.scorer._score_func, 'metric') + #check_input_for_multiprocessing(self._start_method, self.splitter, 'split strategy') self.label_encoder = None self.task_type = task @@ -194,7 +196,7 @@ def add_primitives(self, new_primitives): :param new_primitives: Set of new primitives, tuples of name and object primitive """ for primitive_object, primitive_type in new_primitives: - check_input_for_multiprocessing(self._start_method, primitive_object, 'primitive') + #check_input_for_multiprocessing(self._start_method, primitive_object, 'primitive') primitive_name = f'{primitive_object.__module__}.{primitive_object.__class__.__name__}' primitive_name = primitive_name.replace('__', '') # Sklearn restriction on estimator names self.new_primitives[primitive_name] = {'primitive_object': primitive_object, @@ -297,7 +299,7 @@ class ClassifierBaseAutoML(BaseAutoML): def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdout', time_bound_run=5, task=None, score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None, - num_cpus=None, start_mode='auto', verbose=logging.INFO): + checkpoints_folder=None, num_cpus=None, start_mode='auto', verbose=logging.INFO): """ 
Create/instantiate an AutoMLClassifier object. @@ -312,13 +314,15 @@ def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdo :param metric_kwargs: Additional arguments for metric. :param split_strategy_kwargs: Additional arguments for splitting_strategy. :param output_folder: Path to the output directory. If it is None, create a temp folder automatically. + :param checkpoints_folder: Path to the directory to load and save the checkpoints. If it is None, + it will use the default checkpoints and save the new checkpoints in output_folder. :param num_cpus: Number of CPUs to be used. :param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`. :param verbose: The logs level. """ super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs, - split_strategy_kwargs, output_folder, num_cpus, start_mode, verbose) + split_strategy_kwargs, output_folder, checkpoints_folder, num_cpus, start_mode, verbose) self.label_encoder = LabelEncoder() @@ -354,7 +358,7 @@ class AutoMLClassifier(ClassifierBaseAutoML): def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdout', time_bound_run=5, score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None, - num_cpus=None, start_mode='auto', verbose=logging.INFO): + checkpoints_folder=None, num_cpus=None, start_mode='auto', verbose=logging.INFO): """ Create/instantiate an AutoMLClassifier object. @@ -368,6 +372,8 @@ def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdo :param metric_kwargs: Additional arguments for metric. :param split_strategy_kwargs: Additional arguments for splitting_strategy. :param output_folder: Path to the output directory. If it is None, create a temp folder automatically. + :param checkpoints_folder: Path to the directory to load and save the checkpoints. If it is None, + it will use the default checkpoints and save the new checkpoints in output_folder. :param num_cpus: Number of CPUs to be used. :param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`. :param verbose: The logs level. @@ -375,14 +381,14 @@ def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdo task = 'CLASSIFICATION' super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs, - split_strategy_kwargs, output_folder, num_cpus, start_mode, verbose) + split_strategy_kwargs, output_folder, checkpoints_folder, num_cpus, start_mode, verbose) class AutoMLRegressor(BaseAutoML): def __init__(self, time_bound=15, metric='mean_absolute_error', split_strategy='holdout', time_bound_run=5, score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None, - num_cpus=None, start_mode='auto', verbose=logging.INFO): + checkpoints_folder=None, num_cpus=None, start_mode='auto', verbose=logging.INFO): """ Create/instantiate an AutoMLRegressor object. @@ -396,6 +402,8 @@ def __init__(self, time_bound=15, metric='mean_absolute_error', split_strategy=' :param metric_kwargs: Additional arguments for metric. :param split_strategy_kwargs: Additional arguments for splitting_strategy. :param output_folder: Path to the output directory. If it is None, create a temp folder automatically. + :param checkpoints_folder: Path to the directory to load and save the checkpoints. If it is None, + it will use the default checkpoints and save the new checkpoints in output_folder. 
:param num_cpus: Number of CPUs to be used. :param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`. :param verbose: The logs level. @@ -403,13 +411,14 @@ def __init__(self, time_bound=15, metric='mean_absolute_error', split_strategy=' task = 'REGRESSION' super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs, - split_strategy_kwargs, output_folder, num_cpus, start_mode, verbose) + split_strategy_kwargs, output_folder, checkpoints_folder, num_cpus, start_mode, verbose) class AutoMLTimeSeries(BaseAutoML): def __init__(self, time_bound=15, metric='mean_squared_error', split_strategy='timeseries', time_bound_run=5, score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None, - num_cpus=None, start_mode='auto', verbose=logging.INFO, date_column=None, target_column=None): + checkpoints_folder=None, num_cpus=None, start_mode='auto', verbose=logging.INFO, date_column=None, + target_column=None): """ Create/instantiate an AutoMLTimeSeries object. @@ -423,6 +432,8 @@ def __init__(self, time_bound=15, metric='mean_squared_error', split_strategy='t :param metric_kwargs: Additional arguments for metric. :param split_strategy_kwargs: Additional arguments for TimeSeriesSplit, E.g. n_splits and test_size(int). :param output_folder: Path to the output directory. If it is None, create a temp folder automatically. + :param checkpoints_folder: Path to the directory to load and save the checkpoints. If it is None, + it will use the default checkpoints and save the new checkpoints in output_folder. :param num_cpus: Number of CPUs to be used. :param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`. :param verbose: The logs level. @@ -433,7 +444,7 @@ def __init__(self, time_bound=15, metric='mean_squared_error', split_strategy='t self.target_column = target_column super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs, - split_strategy_kwargs, output_folder, num_cpus, start_mode, verbose) + split_strategy_kwargs, output_folder, checkpoints_folder, num_cpus, start_mode, verbose) def _column_parser(self, X): cols = list(X.columns.values) @@ -452,7 +463,7 @@ class AutoMLSemiSupervisedClassifier(ClassifierBaseAutoML): def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdout', time_bound_run=5, score_sorting='auto', metric_kwargs=None, split_strategy_kwargs=None, output_folder=None, - num_cpus=None, start_mode='auto', verbose=logging.INFO): + checkpoints_folder=None, num_cpus=None, start_mode='auto', verbose=logging.INFO): """ Create/instantiate an AutoMLSemiSupervisedClassifier object. @@ -467,6 +478,8 @@ def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdo :param split_strategy_kwargs: Additional arguments for splitting_strategy. In SemiSupervised case, `n_splits` and `test_size`(test proportion from 0 to 1) can be pass to the splitter. :param output_folder: Path to the output directory. If it is None, create a temp folder automatically. + :param checkpoints_folder: Path to the directory to load and save the checkpoints. If it is None, + it will use the default checkpoints and save the new checkpoints in output_folder. :param num_cpus: Number of CPUs to be used. :param start_mode: The mode to start the multiprocessing library. It could be `auto`, `fork` or `spawn`. :param verbose: The logs level. 
@@ -474,7 +487,7 @@ def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdo task = 'SEMISUPERVISED' super().__init__(time_bound, metric, split_strategy, time_bound_run, task, score_sorting, metric_kwargs, - split_strategy_kwargs, output_folder, num_cpus, start_mode, verbose) + split_strategy_kwargs, output_folder, checkpoints_folder, num_cpus, start_mode, verbose) if split_strategy_kwargs is None: split_strategy_kwargs = {'test_size': 0.25} diff --git a/alpha_automl/automl_manager.py b/alpha_automl/automl_manager.py index c0d2a448..c950c29c 100644 --- a/alpha_automl/automl_manager.py +++ b/alpha_automl/automl_manager.py @@ -1,7 +1,7 @@ import logging import time import multiprocessing -import queue as Q +from alpha_automl.pipeline import Pipeline from alpha_automl.data_profiler import profile_data from alpha_automl.scorer import make_splitter, score_pipeline from alpha_automl.utils import sample_dataset, is_equal_splitting @@ -22,8 +22,9 @@ class AutoMLManager(): - def __init__(self, output_folder, time_bound, time_bound_run, task, num_cpus, verbose): + def __init__(self, output_folder, checkpoints_folder, time_bound, time_bound_run, task, num_cpus, verbose): self.output_folder = output_folder + self.checkpoints_folder = checkpoints_folder self.time_bound = time_bound * 60 self.time_bound_run = time_bound_run * 60 self.task = task @@ -60,86 +61,33 @@ def _search_pipelines(self, automl_hyperparams): if not is_sample and is_equal_splitting(internal_splitting_strategy, self.splitting_strategy): need_rescoring = False - queue = multiprocessing.Queue() - search_process = multiprocessing.Process(target=search_pipelines_proc, - args=(X, y, self.scoring, internal_splitting_strategy, self.task, - automl_hyperparams, metadata, self.output_folder, self.verbose, - queue - ) - ) - - search_process.start() - self.running_processes += 1 - num_processes = self.num_cpus - 2 # Exclude the main process and search process - scoring_pool = multiprocessing.Pool(max(1, num_processes)) - pipelines_to_score = [] - scoring_results = [] - - while True: + pipelines = search_pipelines_proc(X, y, self.scoring, internal_splitting_strategy, self.task, + self.time_bound, automl_hyperparams, metadata, self.output_folder, + self.checkpoints_folder) + + found_pipelines = 0 + + pipeline_threshold = 20 + X, y, _ = sample_dataset(self.X, self.y, SAMPLE_SIZE, self.task) + while pipelines and found_pipelines < pipeline_threshold: + pipeline = pipelines.pop() try: - result = queue.get(timeout=10) - except Q.Empty: - logger.debug('Reached timeout getting new pipelines') - result = None - - if result == 'DONE': - search_process.terminate() - search_process.join(10) - scoring_pool.terminate() - scoring_pool.join() - logger.debug(f'Found {self.found_pipelines} pipelines') - logger.debug('Search done') - break - - elif result is not None: - pipeline = result - logger.debug('Found new pipeline') - yield {'pipeline': pipeline, 'message': 'FOUND'} - - if need_rescoring: - pipelines_to_score.append(pipeline) - else: - logger.debug(f'Pipeline scored successfully, score={pipeline.get_score()}') - self.found_pipelines += 1 - yield {'pipeline': pipeline, 'message': 'SCORED'} - - if len(pipelines_to_score) > 0: - if self.running_processes < MAX_RUNNING_PROCESSES: - pipeline = pipelines_to_score.pop(0).get_pipeline() - scoring_result = scoring_pool.apply_async( - score_pipeline, - args=(pipeline, self.X, self.y, self.scoring, self.splitting_strategy, self.task, self.verbose) - ) - scoring_results.append(scoring_result) - 
self.running_processes += 1 - - tmp_scoring_results = [] - for scoring_result in scoring_results: - if scoring_result.ready(): - self.running_processes -= 1 - pipeline = scoring_result.get() - if pipeline is not None: - logger.debug(f'Pipeline scored successfully, score={pipeline.get_score()}') - self.found_pipelines += 1 - yield {'pipeline': pipeline, 'message': 'SCORED'} - else: - tmp_scoring_results.append(scoring_result) - - scoring_results = tmp_scoring_results - - if time.time() > search_start_time + self.time_bound: - logger.debug('Reached search timeout') - search_process.terminate() - search_process.join(10) - scoring_pool.terminate() - scoring_pool.join() - logger.debug(f'Found {self.found_pipelines} pipelines') - break + alphaautoml_pipeline = score_pipeline(pipeline, X, y, self.scoring, self.splitting_strategy, self.task) + + if alphaautoml_pipeline is not None: + score = alphaautoml_pipeline.get_score() + logger.debug(f'Pipeline scored successfully, score={score}') + found_pipelines += 1 + yield {'pipeline': alphaautoml_pipeline, 'message': 'SCORED'} + except: + logger.debug(f'Pipeline scoring error!') + continue + + logger.debug(f'Found {found_pipelines} pipelines') + logger.debug('Search done') - def check_automl_hyperparams(self, automl_hyperparams): - if 'use_automatic_grammar' not in automl_hyperparams: - automl_hyperparams['use_automatic_grammar'] = USE_AUTOMATIC_GRAMMAR + def check_automl_hyperparams(self, automl_hyperparams): if 'prioritize_primitives' not in automl_hyperparams: automl_hyperparams['prioritize_primitives'] = PRIORITIZE_PRIMITIVES diff --git a/alpha_automl/grammar_loader.py b/alpha_automl/grammar_loader.py index dfe645f0..eb6b2a17 100644 --- a/alpha_automl/grammar_loader.py +++ b/alpha_automl/grammar_loader.py @@ -48,8 +48,9 @@ def create_task_grammar(global_grammar, task): for production in global_grammar.productions(): for new_production in new_productions: - if production.lhs() in new_production.rhs() and production not in new_productions: - new_productions.append(production) + if production not in new_productions: + if str(production.lhs()) != "S" and "_TASK" not in str(production.lhs()): + new_productions.append(production) task_grammar = CFG(start_token, new_productions) logger.debug(f'Task grammar: {task_grammar}') diff --git a/alpha_automl/hyperparameter_tuning/bayesian.py b/alpha_automl/hyperparameter_tuning/bayesian.py deleted file mode 100644 index 4919f4af..00000000 --- a/alpha_automl/hyperparameter_tuning/bayesian.py +++ /dev/null @@ -1,87 +0,0 @@ -import logging -import numpy as np -from smac.configspace import ConfigurationSpace -from ConfigSpace.hyperparameters import IntegerHyperparameter, FloatHyperparameter, CategoricalHyperparameter, \ - OrdinalHyperparameter -from smac.facade.smac_ac_facade import SMAC4AC -from smac.scenario.scenario import Scenario - -MAX_RUNS = 100 -logger = logging.getLogger(__name__) - - -def build_configspace(primitives): - # Build Configuration Space which defines all parameters and their ranges - configspace = ConfigurationSpace() - for primitive in primitives: - pass # load_primitive_configspace(configspace, primitive) - - return configspace - - -def get_new_hyperparameters(primitive_name, configspace): - hyperparameters = {} # load_hyperparameters(primitive_name) - new_hyperparameters = {} - - for hyperparameter_name in hyperparameters: - hyperparameter_config_name = primitive_name + '|' + hyperparameter_name - hyperparameter_config_name_case = hyperparameter_config_name + '|case' - if 
hyperparameter_config_name in configspace: - value = None if configspace[hyperparameter_config_name] == 'None' \ - else configspace[hyperparameter_config_name] - new_hyperparameters[hyperparameter_name] = value - logger.debug('New value for %s=%s', hyperparameter_config_name, new_hyperparameters[hyperparameter_name]) - elif hyperparameter_config_name_case in configspace: - case = configspace[hyperparameter_config_name_case] - value = None if configspace[hyperparameter_config_name + '|' + case] == 'None' \ - else configspace[hyperparameter_config_name + '|' + case] - new_hyperparameters[hyperparameter_name] = {'case': case, - 'value': value} - logger.debug('New value for %s=%s', hyperparameter_config_name, new_hyperparameters[hyperparameter_name]) - - return new_hyperparameters - - -class HyperparameterTuning(object): - def __init__(self, primitives): - self.configspace = build_configspace(primitives) - # Avoiding too many iterations - self.runcount = 1 - - for param in self.configspace.get_hyperparameters(): - if isinstance(param, IntegerHyperparameter): - self.runcount *= (param.upper - param.lower) - elif isinstance(param, CategoricalHyperparameter): - self.runcount *= len(param.choices) - elif isinstance(param, OrdinalHyperparameter): - self.runcount *= len(param.sequence) - elif isinstance(param, FloatHyperparameter): - self.runcount = MAX_RUNS - break - - self.runcount = min(self.runcount, MAX_RUNS) - - def tune(self, runner, wallclock, output_dir): - # Scenario object - # Allow long pipelines to try to execute one fourth of the iterations limit - cutoff = wallclock / (self.runcount / 10) - scenario = Scenario({'run_obj': 'quality', # We optimize quality (alternatively runtime) - 'runcount-limit': self.runcount, # Maximum function evaluations - 'wallclock-limit': wallclock, - 'cutoff_time': cutoff, - 'cs': self.configspace, # Configuration space - 'deterministic': 'true', - 'output_dir': output_dir, - 'abort_on_first_run_crash': False - }) - smac = SMAC4AC(scenario=scenario, rng=np.random.RandomState(0), tae_runner=runner) - best_configuration = smac.optimize() - min_cost = float('inf') - best_scores = {} - - for _, run_data in smac.get_runhistory().data.items(): - if run_data.cost < min_cost: - min_cost = run_data.cost - best_scores = run_data.additional_info - - return best_configuration, best_scores diff --git a/alpha_automl/metalearning/__init__.py b/alpha_automl/metalearning/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/alpha_automl/metalearning/dataset_profiler.py b/alpha_automl/metalearning/dataset_profiler.py deleted file mode 100644 index a33c31d8..00000000 --- a/alpha_automl/metalearning/dataset_profiler.py +++ /dev/null @@ -1,38 +0,0 @@ -import logging -import pandas as pd -import datamart_profiler -from metalearn import Metafeatures - -logger = logging.getLogger(__name__) - -DEFAULT_METAFEATURES = [] - - -def extract_metafeatures(dataset_path, target_column): - data = pd.read_csv(dataset_path) - Y = data[target_column] - Y = pd.Series([str(i) for i in Y], name=target_column) # Cast to string to get metalearn lib working correctly - X = data.drop(columns=[target_column]) - metafeatures = Metafeatures() - mfs = metafeatures.compute(X, Y, metafeature_ids=DEFAULT_METAFEATURES, seed=0, timeout=300) - - return mfs - - -def extract_dataprofiles(dataset_path, target_column, ignore_target_column=False): - metadata = datamart_profiler.process_dataset(dataset_path, coverage=False) - feature_types = set() - missing_values = False - for item in 
metadata['columns']: - if ignore_target_column and item['name'] == target_column: - continue - identified_types = item['semantic_types'] if len(item['semantic_types']) > 0 else [item['structural_type']] - for feature_type in identified_types: - feature_types.add(feature_type) - - if 'missing_values_ratio' in item and item['name'] != target_column: - missing_values = True - - dps = {'feature_types': sorted(feature_types), 'missing_values': missing_values} - - return dps diff --git a/alpha_automl/metalearning/dataset_similarity.py b/alpha_automl/metalearning/dataset_similarity.py deleted file mode 100644 index 5348ae33..00000000 --- a/alpha_automl/metalearning/dataset_similarity.py +++ /dev/null @@ -1,161 +0,0 @@ -import logging -import math -import hashlib -import datamart_profiler -from metalearn import Metafeatures -from alphad3m.metalearning.resource_builder import load_precalculated_data -from alphad3m.metalearning.dataset_profiler import extract_dataprofiles, extract_metafeatures -from sklearn.metrics.pairwise import cosine_similarity -from d3m.metadata.problem import TaskKeywordBase - -logger = logging.getLogger(__name__) - - -def create_metafeatures_vector(metafeatures, metafeature_indices): - vector = [] - - for metafeatures_id in metafeature_indices: - value = metafeatures[metafeatures_id]['value'] - if isinstance(value, str): - value = int(hashlib.sha256(value.encode('utf-8')).hexdigest(), 16) % 256 - elif math.isnan(value) or math.isinf(value): - value = 0 - vector.append(value) - - return vector - - -def create_dataprofiles_vector(dataprofiles, dataprofile_indices): - vector = [] - - for dataprofile_id in dataprofile_indices: - if dataprofile_id in dataprofiles['feature_types']: - value = 1 - else: - value = 0 - vector.append(value) - - value = 1 if dataprofiles['missing_values'] else 0 - vector.append(value) # Add an extra value corresponding to the missing values data - - return vector - - -def create_taskkeywords_vector(task_keywords, taskkeyword_indices): - vector = [] - - for taskkeyword_id in taskkeyword_indices: - if taskkeyword_id in task_keywords['task_keywords']: - value = 1 - else: - value = 0 - vector.append(value) - - return vector - - -def create_metafeatures_vectors_mldb(metafeature_indices): - vectors = {} - metafeature_datasets = load_precalculated_data('metafeatures') - - for id_dataset, metafeatures in metafeature_datasets.items(): - vector = create_metafeatures_vector(metafeatures, metafeature_indices) - vectors[id_dataset] = vector - - return vectors - - -def create_dataprofiles_vectors_mldb(dataprofile_indices): - vectors = {} - dataprofile_datasets = load_precalculated_data('dataprofiles') - - for id_dataset, dataprofiles in dataprofile_datasets.items(): - vector = create_dataprofiles_vector(dataprofiles, dataprofile_indices) - vectors[id_dataset] = vector - - return vectors - - -def create_taskkeywords_vectors_mldb(taskkeyword_indices): - vectors = {} - taskkeyword_datasets = load_precalculated_data('task_keywords') - - for id_dataset, task_keywords in taskkeyword_datasets.items(): - vector = create_taskkeywords_vector(task_keywords, taskkeyword_indices) - vectors[id_dataset] = vector - - return vectors - - -def load_metafeatures_vectors(dataset_path, target_column): - mfs = extract_metafeatures(dataset_path, target_column) - metafeature_indices = Metafeatures.list_metafeatures(group='all') - target_metafeatures_vector = create_metafeatures_vector(mfs, metafeature_indices) - metalearningdb_vectors = create_metafeatures_vectors_mldb(metafeature_indices) - 
- return metalearningdb_vectors, target_metafeatures_vector - - -def load_profiles_vectors(dataset_path, target_column): - dps = extract_dataprofiles(dataset_path, target_column) - dataprofile_indices = [v for k, v in datamart_profiler.types.__dict__.items() if not k.startswith('_')] - target_dataprofile_vector = create_dataprofiles_vector(dps, dataprofile_indices) - metalearningdb_vectors = create_dataprofiles_vectors_mldb(dataprofile_indices) - - return metalearningdb_vectors, target_dataprofile_vector - - -def load_taskkeyword_vectors(task_keywords): - taskkeyword_indices = sorted([keyword for keyword in TaskKeywordBase.get_map().keys() if keyword is not None]) - target_dataprofile_vector = create_taskkeywords_vector({'task_keywords': task_keywords}, taskkeyword_indices) - metalearningdb_vectors = create_taskkeywords_vectors_mldb(taskkeyword_indices) - - return metalearningdb_vectors, target_dataprofile_vector - - -def calculate_similarity(metalearningdb_vectors, target_vector, threshold): - similar_datasets = {} - for id_dataset, vector in metalearningdb_vectors.items(): - similarity = round(cosine_similarity([target_vector], [vector]).flat[0], 5) - if similarity >= threshold: - similar_datasets[id_dataset] = similarity - - return similar_datasets - - -def similarity_repr(dataset_similarities): - similarity_string = [] - - for dataset_similarity in sorted(dataset_similarities.items(), key=lambda x: x[1], reverse=True): - pretty_string = '%s=%.2f' % dataset_similarity - similarity_string.append(pretty_string) - - return ', '.join(similarity_string) - - -def get_similar_datasets(mode, dataset_path, target_column, task_keywords, threshold=0.8, combined=False): - vectors_taskkeywords, target_vector_taskkeywords = load_taskkeyword_vectors(task_keywords) - - if mode == 'metafeatures': - vectors_dataset, target_vector_dataset = load_metafeatures_vectors(dataset_path, target_column) - elif mode == 'dataprofiles': - vectors_dataset, target_vector_dataset = load_profiles_vectors(dataset_path, target_column) - else: - raise ValueError('Unknown mode "%s" to load data' % mode) - - if combined: - # Concatenate the vectors of the dataset and task keywords - for id_dataset in vectors_dataset: - vectors_dataset[id_dataset] += vectors_taskkeywords[id_dataset] - target_vector_dataset += target_vector_taskkeywords - similar_datasets = calculate_similarity(vectors_dataset, target_vector_dataset, threshold) - logger.debug('Similar datasets found using both information:\n%s', similarity_repr(similar_datasets)) - else: - # Use threshold=1.0 to get datasets with the same task keywords - similar_datasets = calculate_similarity(vectors_taskkeywords, target_vector_taskkeywords, 1.0) - logger.debug('Similar datasets found using task_keywords features:\n%s', similarity_repr(similar_datasets)) - vectors_dataset = {k: vectors_dataset[k] for k in similar_datasets} # Use only the similar datasets - similar_datasets = calculate_similarity(vectors_dataset, target_vector_dataset, threshold) - logger.debug('Similar datasets found using %s features:\n%s', mode, similarity_repr(similar_datasets)) - - return similar_datasets diff --git a/alpha_automl/metalearning/grammar_builder.py b/alpha_automl/metalearning/grammar_builder.py deleted file mode 100644 index 33fc70c9..00000000 --- a/alpha_automl/metalearning/grammar_builder.py +++ /dev/null @@ -1,329 +0,0 @@ -import logging -import numpy as np -from scipy import stats -from collections import OrderedDict -from alphad3m.metalearning.resource_builder import load_metalearningdb 
-from alphad3m.metalearning.dataset_similarity import get_similar_datasets -from alphad3m.primitive_loader import load_primitives_by_name, load_primitives_by_id - -logger = logging.getLogger(__name__) - - -def load_related_pipelines(dataset_path, target_column, task_keywords): - available_primitives = load_primitives_by_id() - all_pipelines = load_metalearningdb() - similar_datasets = get_similar_datasets('dataprofiles', dataset_path, target_column, task_keywords) - task_pipelines = [] - - for similar_dataset in similar_datasets.keys(): - if similar_dataset not in all_pipelines['pipeline_performances']: - continue - - for pipeline_id, pipeline_performances in all_pipelines['pipeline_performances'][similar_dataset].items(): - primitive_ids = all_pipelines['pipeline_structure'][pipeline_id] - if is_available_primitive(primitive_ids, available_primitives): - for index in range(len(pipeline_performances['score'])): - primitives = [available_primitives[p] for p in primitive_ids] # Use the current names of primitives - score = pipeline_performances['score'][index] - metric = pipeline_performances['metric'][index] - task_pipelines.append({'pipeline': primitives, 'score': score, 'metric': metric, - 'dataset': similar_dataset, 'pipeline_repr': '_'.join(primitives)}) - - logger.debug('Found %d related pipelines', len(task_pipelines)) - - return task_pipelines - - -def create_metalearningdb_grammar(task_name, dataset_path, target_column, task_keywords, merge=False): - pipelines = load_related_pipelines(dataset_path, target_column, task_keywords) - patterns, primitives = extract_patterns(pipelines) - empty_elements = [] - - if merge: - patterns, empty_elements = merge_patterns(patterns) - - grammar = format_grammar(task_name, patterns, empty_elements) - - return grammar, primitives - - -def format_grammar(task_name, patterns, empty_elements): - if len(patterns) == 0: - logger.warning('Empty patterns, no grammar have been generated') - return None - - grammar = 'S -> %s\n' % task_name - grammar += task_name + ' -> ' + ' | '.join([' '.join(p) for p in patterns]) - - for element in sorted(set([e for sublist in patterns for e in sublist])): # Sort to have a deterministic grammar - production_rule = element + " -> 'primitive_terminal'" - if element in empty_elements: - production_rule += " | 'E'" - - grammar += '\n' + production_rule - logger.debug('Grammar obtained:\n%s', grammar) - - return grammar - - -def extract_patterns(pipelines, max_nro_patterns=15, min_frequency=3, adtm_threshold=0.5, mean_score_threshold=0.5, - ratio_datasets=0.2): - available_primitives = load_primitives_by_name() - pipelines = calculate_adtm(pipelines) - patterns = {} - - for pipeline_data in pipelines: - if pipeline_data['adtm'] > adtm_threshold: - # Skip pipelines with average distance to the minimum higher than the threshold - continue - - primitive_types = [available_primitives[p]['type'] for p in pipeline_data['pipeline']] - pattern_id = ' '.join(primitive_types) - if pattern_id not in patterns: - patterns[pattern_id] = {'structure': primitive_types, 'primitives': set(), 'datasets': set(), - 'pipelines': [], 'scores': [], 'adtms': [], 'frequency': 0} - patterns[pattern_id]['primitives'].update(pipeline_data['pipeline']) - patterns[pattern_id]['datasets'].add(pipeline_data['dataset']) - patterns[pattern_id]['pipelines'].append(pipeline_data['pipeline']) - patterns[pattern_id]['scores'].append(pipeline_data['score']) - patterns[pattern_id]['adtms'].append(pipeline_data['adtm']) - patterns[pattern_id]['frequency'] += 1 - - 
logger.debug('Found %d different patterns, after creating the portfolio', len(patterns)) - # TODO: Group these removing conditions into a single loop - # Remove patterns with fewer elements than the minimum frequency - patterns = {k: v for k, v in patterns.items() if v['frequency'] >= min_frequency} - logger.debug('Found %d different patterns, after removing uncommon patterns', len(patterns)) - - # Remove patterns with undesirable primitives (AlphaD3M doesn't have support to handle some of these primitives) - blacklist_primitives = {'d3m.primitives.data_transformation.dataframe_to_ndarray.Common', - 'd3m.primitives.data_transformation.list_to_dataframe.DistilListEncoder', - 'd3m.primitives.data_transformation.ndarray_to_dataframe.Common', - 'd3m.primitives.data_transformation.horizontal_concat.DSBOX', - 'd3m.primitives.data_transformation.horizontal_concat.DataFrameCommon', - 'd3m.primitives.data_transformation.multi_horizontal_concat.Common', - 'd3m.primitives.data_transformation.conditioner.Conditioner', - 'd3m.primitives.data_transformation.remove_semantic_types.Common', - 'd3m.primitives.data_transformation.replace_semantic_types.Common', - 'd3m.primitives.data_transformation.remove_columns.Common', - 'd3m.primitives.operator.dataset_map.DataFrameCommon', - 'd3m.primitives.data_transformation.i_vector_extractor.IVectorExtractor'} - patterns = {k: v for k, v in patterns.items() if not blacklist_primitives & v['primitives']} - logger.debug('Found %d different patterns, after blacklisting primitives', len(patterns)) - - unique_datasets = set() - for pattern_id in patterns: - scores = patterns[pattern_id]['scores'] - adtms = patterns[pattern_id]['adtms'] - patterns[pattern_id]['mean_score'] = np.mean(scores) - patterns[pattern_id]['mean_adtm'] = np.mean(adtms) - unique_datasets.update(patterns[pattern_id]['datasets']) - # Remove patterns with low performances - patterns = {k: v for k, v in patterns.items() if v['mean_score'] >= mean_score_threshold} - logger.debug('Found %d different patterns, after removing low-performance patterns', len(patterns)) - - # Remove patterns with low variability - patterns = {k: v for k, v in patterns.items() if len(v['datasets']) >= len(unique_datasets) * ratio_datasets} - logger.debug('Found %d different patterns, after removing low-variability patterns', len(patterns)) - - if len(patterns) > max_nro_patterns: - logger.debug('Found many patterns, selecting top %d (max_nro_patterns)' % max_nro_patterns) - sorted_patterns = sorted(patterns.items(), key=lambda x: x[1]['mean_score'], reverse=True) - patterns = {k: v for k, v in sorted_patterns[:max_nro_patterns]} - - primitive_info = add_correlations(patterns, available_primitives) - # Make deterministic the order of the patterns - patterns = sorted(patterns.values(), key=lambda x: x['mean_score'], reverse=True) - logger.debug('Patterns:\n%s', patterns_repr(patterns)) - logger.debug('Hierarchy:\n%s', '\n'.join(['%s:\n%s' % (k, ', '.join(v)) for k, v in - primitive_info['hierarchy'].items()])) - patterns = [p['structure'] for p in patterns] - - return patterns, primitive_info - - -def add_correlations(patterns, available_primitives): - primitive_hierarchy = {} - all_pipelines = [] - all_performances = [] - all_primitives = [] - - # Add local correlations - local_probabilities = {} - for pattern_id, pattern in patterns.items(): - for primitive in pattern['primitives']: - primitive_type = available_primitives[primitive]['type'] - if primitive_type not in primitive_hierarchy: - primitive_hierarchy[primitive_type]
= set() - primitive_hierarchy[primitive_type].add(primitive) - performances = [1 - x for x in pattern['adtms']] # Use adtms as performances because their are scaled - all_pipelines += pattern['pipelines'] - all_primitives += pattern['primitives'] - all_performances += performances - correlations = calculate_correlations(pattern['primitives'], pattern['pipelines'], performances) - local_probabilities[pattern_id] = {} - for primitive, correlation in correlations.items(): - primitive_type = available_primitives[primitive]['type'] - if primitive_type not in local_probabilities[pattern_id]: - local_probabilities[pattern_id][primitive_type] = {} - local_probabilities[pattern_id][primitive_type][primitive] = correlation - - # Add global correlations - global_probabilities = {} - correlations = calculate_correlations(set(all_primitives), all_pipelines, all_performances) - - for primitive, correlation in correlations.items(): - primitive_type = available_primitives[primitive]['type'] - if primitive_type not in global_probabilities: - global_probabilities[primitive_type] = {} - global_probabilities[primitive_type][primitive] = correlation - - global_probabilities['S'] = {} - for pattern, pattern_data in patterns.items(): # Use the mean adtm values as probabilities for the patterns - global_probabilities['S'][pattern] = 1 - pattern_data['mean_adtm'] - - primitive_probabilities = {'global': global_probabilities, 'local': local_probabilities, - 'types': available_primitives} - # Make deterministic the order of the hierarchy - primitive_hierarchy = OrderedDict({k: sorted(v) for k, v in - sorted(primitive_hierarchy.items(), key=lambda x: x[0])}) - primitive_info = {'hierarchy': primitive_hierarchy, 'probabilities': primitive_probabilities} - - return primitive_info - - -def calculate_correlations(primitives, pipelines, scores, normalize=True): - correlations = {} - - for primitive in primitives: - occurrences = [1 if primitive in pipeline else 0 for pipeline in pipelines] - correlation_coefficient, p_value = stats.pointbiserialr(occurrences, scores) - if np.isnan(correlation_coefficient): # Assign a positive correlation (1) to NaN values - correlation_coefficient = 1 - if normalize: # Normalize the Pearson values, from [-1, 1] to [0, 1] range - correlation_coefficient = (correlation_coefficient - (-1)) / 2 # xi − min(x) / max(x) − min(x) - correlations[primitive] = round(correlation_coefficient, 4) - - return correlations - - -def calculate_adtm(pipelines): - dataset_performaces = {} - pipeline_performances = {} - - for pipeline_data in pipelines: - # Even the same dataset can be run under different metrics. 
So, use the metric to create the id of the dataset - id_dataset = pipeline_data['dataset'] + '_' + pipeline_data['metric'] - - if id_dataset not in dataset_performaces: - dataset_performaces[id_dataset] = {'min': float('inf'), 'max': float('-inf')} - performance = pipeline_data['score'] - - if performance > dataset_performaces[id_dataset]['max']: - dataset_performaces[id_dataset]['max'] = performance - - if performance < dataset_performaces[id_dataset]['min']: - dataset_performaces[id_dataset]['min'] = performance - - id_pipeline = pipeline_data['pipeline_repr'] - - if id_pipeline not in pipeline_performances: - pipeline_performances[id_pipeline] = {} - - if id_dataset not in pipeline_performances[id_pipeline]: - pipeline_performances[id_pipeline][id_dataset] = pipeline_data['score'] - else: - # A pipeline can have different performances for a given dataset, choose the best one - if pipeline_data['score'] > pipeline_performances[id_pipeline][id_dataset]: - pipeline_performances[id_pipeline][id_dataset] = pipeline_data['score'] - - add_adtm(pipelines, pipeline_performances, dataset_performaces) - - return pipelines - - -def add_adtm(pipelines, pipeline_performances, dataset_performaces): - for pipeline_data in pipelines: - id_pipeline = pipeline_data['pipeline_repr'] - id_dataset_pipeline = pipeline_data['dataset'] + '_' + pipeline_data['metric'] - dtm = 0 - - for id_dataset in pipeline_performances[id_pipeline]: # Iterate over the datasets where the pipeline was used - minimum = dataset_performaces[id_dataset]['min'] - maximum = dataset_performaces[id_dataset]['max'] - - if id_dataset_pipeline == id_dataset: - score = pipeline_data['score'] - else: - score = pipeline_performances[id_pipeline][id_dataset] - - if minimum != maximum: - dtm += (maximum - score) / (maximum - minimum) - - adtm = dtm / len(pipeline_performances[id_pipeline]) - pipeline_data['adtm'] = adtm - - -def merge_patterns(grammar_patterns): - patterns = sorted(grammar_patterns, key=lambda x: len(x), reverse=True) - empty_elements = set() - skip_patterns = [] - - for pattern in patterns: - for element in pattern: - modified_pattern = [e for e in pattern if e != element] - for current_pattern in patterns: - if modified_pattern == current_pattern: - empty_elements.add(element) - skip_patterns.append(modified_pattern) - - for skip_pattern in skip_patterns: - if skip_pattern in patterns: - patterns.remove(skip_pattern) - - return patterns, empty_elements - - -def is_available_primitive(pipeline_primitives, available_primitives, verbose=False): - for primitive in pipeline_primitives: - if primitive not in available_primitives: - if verbose: - logger.warning('Primitive %s is not longer available' % primitive) - return False - return True - - -def patterns_repr(patterns): - patterns_string = [] - - for pattern in patterns: - pretty_string = '' - pretty_string += 'structure: [%s]' % ', '.join([i for i in pattern['structure']]) - pretty_string += ', frequency: %d' % pattern['frequency'] - if 'mean_score' in pattern: - pretty_string += ', mean_score: %.3f' % pattern['mean_score'] - if 'mean_adtm' in pattern: - pretty_string += ', mean_adtm: %.3f' % pattern['mean_adtm'] - patterns_string.append(pretty_string) - - return '\n'.join(patterns_string) - - -def test_dataset(dataset_folder_path, task_name='TASK'): - from os.path import join - import json - dataset_path = join(dataset_folder_path, 'TRAIN/dataset_TRAIN/tables/learningData.csv') - problem_path = join(dataset_folder_path, 'TRAIN/problem_TRAIN/problemDoc.json') - - with 
open(problem_path) as fin: - problem_doc = json.load(fin) - task_keywords = problem_doc['about']['taskKeywords'] - target_column = problem_doc['inputs']['data'][0]['targets'][0]['colName'] - logger.debug('Evaluating dataset "%s" with task keywords=%s' % (dataset_folder_path, str(task_keywords))) - create_metalearningdb_grammar(task_name, dataset_path, target_column, task_keywords) - - -if __name__ == '__main__': - dataset_id = '185_baseball_MIN_METADATA' - dataset_folder_path = '/Users/rlopez/D3M/datasets/seed_datasets_current/%s' % dataset_id - test_dataset(dataset_folder_path) diff --git a/alpha_automl/metalearning/nnet_trainer.py b/alpha_automl/metalearning/nnet_trainer.py deleted file mode 100644 index 7ca8675f..00000000 --- a/alpha_automl/metalearning/nnet_trainer.py +++ /dev/null @@ -1,102 +0,0 @@ -import pickle -import logging -import copy -import pandas as pd -from alphad3m.primitive_loader import load_primitives_by_name, load_primitives_by_id, load_primitives_types -from alphad3m.metalearning.resource_builder import get_dataset_id, filter_primitives, load_precalculated_data -from alphad3m.metalearning.dataset_profiler import DEFAULT_METAFEATURES - -logger = logging.getLogger(__name__) - - -IGNORE_PRIMITIVES = { - # These primitives are static elements in the pipelines, not considered as part of the pattern - 'd3m.primitives.data_transformation.construct_predictions.Common', - 'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common', - 'd3m.primitives.data_transformation.dataset_to_dataframe.Common', - 'd3m.primitives.data_transformation.denormalize.Common', - 'd3m.primitives.data_transformation.flatten.DataFrameCommon', - 'd3m.primitives.data_transformation.column_parser.Common', - 'd3m.primitives.data_transformation.simple_column_parser.DataFrameCommon', - 'd3m.primitives.data_transformation.do_nothing.DSBOX', - 'd3m.primitives.data_transformation.do_nothing_for_dataset.DSBOX', - 'd3m.primitives.data_transformation.add_semantic_types.Common', - 'd3m.primitives.schema_discovery.profiler.Common', - 'd3m.primitives.schema_discovery.profiler.DSBOX', - 'd3m.primitives.data_cleaning.column_type_profiler.Simon', - 'd3m.primitives.operator.compute_unique_values.Common', - 'd3m.primitives.data_transformation.construct_confidence.Common', - # We add these primitives internally because they require special connections - 'd3m.primitives.data_transformation.text_reader.Common', - 'd3m.primitives.data_transformation.image_reader.Common' -} - - -def create_csv_data(metalearningdb_pickle_path, pipelines_csv_path, use_primitive_names=True): - logger.debug('Creating CSV file...') - primitives_by_name = load_primitives_by_name(only_installed_primitives=False) - primitives_by_id = load_primitives_by_id(only_installed_primitives=False) - primitives_types = load_primitives_types(only_installed_primitives=False) - dataset_task_keywords = load_precalculated_data('task_keywords') - dataset_semantic_types = load_precalculated_data('dataprofiles') - dataset_metafeatures = load_precalculated_data('metafeatures') - ignore_primitives_ids = {primitives_by_name[ignore_primitive]['id'] for ignore_primitive in IGNORE_PRIMITIVES} - train_pipelines = [] - total_pipelines = 0 - - with open(metalearningdb_pickle_path, 'rb') as fin: - all_pipelines = pickle.load(fin) - - for pipeline_run in all_pipelines: - dataset_id = get_dataset_id(pipeline_run['problem']['id']) - - if dataset_id not in dataset_task_keywords: - continue - - pipeline_primitives = pipeline_run['steps'] - pipeline_primitives = 
filter_primitives(pipeline_primitives, ignore_primitives_ids) # Get the IDs of primitives - task_keywords = ' '.join(dataset_task_keywords[dataset_id]['task_keywords']) - semantic_types = ' '.join(dataset_semantic_types[dataset_id]['feature_types']) - metafeatures = [dataset_metafeatures[dataset_id][mf]['value'] for mf in DEFAULT_METAFEATURES] - - if len(pipeline_primitives) > 0: - score = pipeline_run['scores'][0]['normalized'] - metric = pipeline_run['scores'][0]['metric']['metric'] - try: - pipeline_primitive_types = [primitives_types[primitives_by_id[p]] for p in pipeline_primitives] - if use_primitive_names: - pipeline_primitives = [primitives_by_id[p] for p in pipeline_primitives] - pipeline_stages = generate_pipeline_stages(pipeline_primitives, pipeline_primitive_types) - pipeline_stages = [[' '.join(ps), task_keywords, semantic_types, metric, score] + metafeatures - for ps in pipeline_stages] - train_pipelines += pipeline_stages - total_pipelines += 1 - except Exception: - logger.warning(f'Primitives "{str(pipeline_primitives)}" are not longer available') - - logger.debug(f'Loaded {len(all_pipelines)} pipelines') - logger.debug(f'Found {total_pipelines} pipelines') - pipelines_df = pd.DataFrame.from_records(train_pipelines, columns=['primitives', 'task_keywords', 'semantic_types', - 'metric', 'score'] + DEFAULT_METAFEATURES) - pipelines_df.to_csv(pipelines_csv_path, index=False) - - -def generate_pipeline_stages(primitive_items, primitive_types): - combinations = [] - pre_pipeline = primitive_types - combinations.append(copy.deepcopy(pre_pipeline)) - - for index, primitive_item in enumerate(primitive_items): - pre_pipeline[index] = primitive_item - combinations.append(copy.deepcopy(pre_pipeline)) - - return combinations - - -if __name__ == '__main__': - # Download the metalearningdb.pkl file from https://drive.google.com/file/d/1WjY7iKkkKMZFeoiCqzamA_iqVOwQidXS/view - # and D3M datasets from https://datasets.datadrivendiscovery.org/d3m/datasets/-/tree/master/seed_datasets_current - metalearningdb_pickle_path = '/Users/rlopez/D3M/metalearning_db/metalearningdb.pkl' - pipelines_csv_path = '/Users/rlopez/D3M/metalearning_db/marvin_pipelines.csv' - - create_csv_data(metalearningdb_pickle_path, pipelines_csv_path) diff --git a/alpha_automl/metalearning/resource_builder.py b/alpha_automl/metalearning/resource_builder.py deleted file mode 100644 index 3011f5fc..00000000 --- a/alpha_automl/metalearning/resource_builder.py +++ /dev/null @@ -1,253 +0,0 @@ -import re -import ast -import json -import gzip -import pickle -import logging -from os.path import join, dirname, exists -from alphad3m.primitive_loader import load_primitives_by_name -from alphad3m.metalearning.dataset_profiler import extract_dataprofiles, extract_metafeatures - -logger = logging.getLogger(__name__) - -METALEARNING_DB_PATH = join(dirname(__file__), '../resource/metalearning_db.json.gz') -PRECALCULATED_TASKKEYWORDS_PATH = join(dirname(__file__), '../resource/precalculated_taskkeywords.json') -PRECALCULATED_METAFEATURES_PATH = join(dirname(__file__), '../resource/precalculated_metafeatures.json') -PRECALCULATED_DATAPROFILES_PATH = join(dirname(__file__), '../resource/precalculated_dataprofiles.json') - - -IGNORE_PRIMITIVES = { - # These primitives are static elements in the pipelines, not considered as part of the pattern - 'd3m.primitives.data_transformation.construct_predictions.Common', - 'd3m.primitives.data_transformation.extract_columns_by_semantic_types.Common', - 
'd3m.primitives.data_transformation.dataset_to_dataframe.Common', - 'd3m.primitives.data_transformation.denormalize.Common', - 'd3m.primitives.data_transformation.flatten.DataFrameCommon', - 'd3m.primitives.data_transformation.column_parser.Common', - 'd3m.primitives.data_transformation.simple_column_parser.DataFrameCommon', - 'd3m.primitives.data_transformation.do_nothing.DSBOX', - 'd3m.primitives.data_transformation.do_nothing_for_dataset.DSBOX', - 'd3m.primitives.data_transformation.add_semantic_types.Common', - 'd3m.primitives.schema_discovery.profiler.Common', - 'd3m.primitives.schema_discovery.profiler.DSBOX', - 'd3m.primitives.data_cleaning.column_type_profiler.Simon', - 'd3m.primitives.operator.compute_unique_values.Common', - 'd3m.primitives.data_transformation.construct_confidence.Common', - # We add these primitives internally because they require special connections - 'd3m.primitives.data_transformation.text_reader.Common', - 'd3m.primitives.data_transformation.image_reader.Common' -} - - -def create_compressed_metalearningdb(metalearningdb_pickle_path): - logger.debug('Compressing Meta-Learning DB...') - primitives_by_name = load_primitives_by_name(only_installed_primitives=False) - available_datasets = load_precalculated_data('task_keywords') - ignore_primitives_ids = {primitives_by_name[ignore_primitive]['id'] for ignore_primitive in IGNORE_PRIMITIVES} - pipelines_by_dataset = {} - pipelines_hashing = {} - - with open(metalearningdb_pickle_path, 'rb') as fin: - all_pipelines = pickle.load(fin) - - for pipeline_run in all_pipelines: - dataset_id = get_dataset_id(pipeline_run['problem']['id']) - - if dataset_id not in available_datasets: - continue - - pipeline_primitives = pipeline_run['steps'] - pipeline_primitives = filter_primitives(pipeline_primitives, ignore_primitives_ids) # Get the IDs of primitives - - if dataset_id not in pipelines_by_dataset: - pipelines_by_dataset[dataset_id] = {} - - if len(pipeline_primitives) > 0: - score = pipeline_run['scores'][0]['normalized'] - metric = pipeline_run['scores'][0]['metric']['metric'] - pipeline_str = str(pipeline_primitives) - - if pipeline_str not in pipelines_hashing: - hashing_value = 'P' + str(len(pipelines_hashing)) - pipelines_hashing[pipeline_str] = hashing_value - - pipeline_id = pipelines_hashing[pipeline_str] - - if pipeline_id not in pipelines_by_dataset[dataset_id]: - pipelines_by_dataset[dataset_id][pipeline_id] = {'score': [], 'metric': []} - - pipelines_by_dataset[dataset_id][pipeline_id]['score'].append(score) - pipelines_by_dataset[dataset_id][pipeline_id]['metric'].append(metric) - - pipeline_structure = {} - for pipeline_str, pipeline_id in pipelines_hashing.items(): - primitives = [primitive for primitive in ast.literal_eval(pipeline_str)] # Convert str to list - pipeline_structure[pipeline_id] = primitives - - metalearning_db = {} - metalearning_db['pipeline_performances'] = pipelines_by_dataset - metalearning_db['pipeline_structure'] = pipeline_structure - - with gzip.open(METALEARNING_DB_PATH, 'wt', encoding='UTF-8') as zipfile: - json.dump(json.dumps(metalearning_db), zipfile) # Convert to str and then compress it - - logger.debug('Compressing process ended') - - -def extract_taskkeywords_metalearningdb(datasets_path): - datasets = get_unique_datasets() - task_keywords = load_precalculated_data('task_keywords') - - for dataset_id in datasets: - logger.debug('Calculating task keywords for dataset %s...', dataset_id) - if dataset_id not in task_keywords: - try: - _, _, keywords = 
load_task_info(dataset_id, datasets_path) - task_keywords[dataset_id] = {'task_keywords': keywords} - logger.debug('Task keywords successfully calculated for dataset %s', dataset_id) - with open(PRECALCULATED_TASKKEYWORDS_PATH, 'w') as fout: - json.dump(task_keywords, fout, indent=4, sort_keys=True) - except Exception as e: - logger.error(str(e)) - else: - logger.debug('Using pre-calculated task keywords for dataset %s', dataset_id) - - return task_keywords - - -def extract_metafeatures_metalearningdb(datasets_path): - datasets = get_unique_datasets() - metafeatures = load_precalculated_data('metafeatures') - - for dataset_id in datasets: - logger.debug('Calculating metafeatures for dataset %s...', dataset_id) - if dataset_id not in metafeatures: - try: - dataset_path, target_column, _ = load_task_info(dataset_id, datasets_path, 'SCORE') - mfs = extract_metafeatures(dataset_path, target_column) - metafeatures[dataset_id] = mfs - logger.debug('Metafeatures successfully calculated for dataset %s', dataset_id) - with open(PRECALCULATED_METAFEATURES_PATH, 'w') as fout: - json.dump(metafeatures, fout, indent=4, sort_keys=True) - except Exception as e: - logger.error(str(e)) - else: - logger.debug('Using pre-calculated metafeatures for dataset %s', dataset_id) - - return metafeatures - - -def extract_dataprofiles_metalearningdb(datasets_path): - datasets = get_unique_datasets() - dataprofiles = load_precalculated_data('dataprofiles') - - for dataset_id in datasets: - logger.debug('Calculating data profiles for dataset %s...', dataset_id) - if dataset_id not in dataprofiles: - try: - dataset_path, target_column, _ = load_task_info(dataset_id, datasets_path) - dps = extract_dataprofiles(dataset_path, target_column) - dataprofiles[dataset_id] = dps - logger.debug('Data profiles successfully calculated for dataset %s', dataset_id) - with open(PRECALCULATED_DATAPROFILES_PATH, 'w') as fout: - json.dump(dataprofiles, fout, indent=4, sort_keys=True) - except Exception as e: - logger.error(str(e)) - else: - logger.debug('Using pre-calculated data profiles for dataset %s', dataset_id) - - return dataprofiles - - -def load_task_info(dataset_id, datasets_path, suffix='TRAIN'): - possible_names = [join(datasets_path, dataset_id), join(datasets_path, dataset_id + '_MIN_METADATA'), - join(datasets_path, dataset_id.replace('_MIN_METADATA', ''))] - # All possible names of the datasets on disk, with/without the suffix 'MIN_METADATA' - - for dataset_folder_path in possible_names: - if exists(dataset_folder_path): - break - else: - raise FileNotFoundError('Dataset %s not found' % dataset_id) - - dataset_path = join(dataset_folder_path, suffix, 'dataset_%s/tables/learningData.csv' % suffix) - problem_path = join(dataset_folder_path, suffix, 'problem_%s/problemDoc.json' % suffix) - - with open(problem_path) as fin: - problem_doc = json.load(fin) - task_keywords = problem_doc['about']['taskKeywords'] - target_column = problem_doc['inputs']['data'][0]['targets'][0]['colName'] - - return dataset_path, target_column, task_keywords - - -def get_dataset_id(problem_id): - # Remove suffixes 'TRAIN' and 'problem' from the dataset name - dataset_id = re.sub('_TRAIN$', '', problem_id) - dataset_id = re.sub('_problem$', '', dataset_id) - - return dataset_id - - -def get_unique_datasets(): - metalearning_db = load_metalearningdb() - datasets = metalearning_db['pipeline_performances'].keys() - - return sorted(datasets) - - -def filter_primitives(pipeline_steps, ignore_primitives): - primitives = [] - - for pipeline_step in 
pipeline_steps: - if pipeline_step['primitive']['id'] not in ignore_primitives: - primitives.append(pipeline_step['primitive']['id']) - - if len(primitives) > 0 and primitives[0] == '7ddf2fd8-2f7f-4e53-96a7-0d9f5aeecf93': - # Special case: Primitive to_numeric.DSBOX - # This primitive should not be first because it only takes the numeric features, ignoring the remaining ones - primitives = primitives[1:] - - return primitives - - -def load_metalearningdb(): - all_pipelines = [] - logger.debug('Loading pipelines from metalearning database...') - - with gzip.open(METALEARNING_DB_PATH, 'rt', encoding='UTF-8') as zipfile: - all_pipelines = json.loads(json.load(zipfile)) # Uncompress as str and then convert to dict - - logger.debug('Found %d unique pipelines in metalearning database' % len(all_pipelines['pipeline_structure'])) - - return all_pipelines - - -def load_precalculated_data(mode): - if mode == 'metafeatures': - file_path = PRECALCULATED_METAFEATURES_PATH - elif mode == 'dataprofiles': - file_path = PRECALCULATED_DATAPROFILES_PATH - elif mode == 'task_keywords': - file_path = PRECALCULATED_TASKKEYWORDS_PATH - else: - raise ValueError('Unknown mode "%s" to load data' % mode) - - if exists(file_path): - with open(file_path) as fin: - return json.load(fin) - - return {} - - -if __name__ == '__main__': - # Run this to create the meta-learning DB, task keywords, data profiles, and meta-features files - # Download the metalearningdb.pkl file from https://drive.google.com/file/d/1WjY7iKkkKMZFeoiCqzamA_iqVOwQidXS/view - # and D3M datasets from https://datasets.datadrivendiscovery.org/d3m/datasets/-/tree/master/seed_datasets_current - metalearningdb_pickle_path = '/Users/rlopez/D3M/metalearning_db/metalearningdb.pkl' - datasets_root_path = '/Users/rlopez/D3M/datasets/seed_datasets_current/' - - create_compressed_metalearningdb(metalearningdb_pickle_path) - extract_taskkeywords_metalearningdb(datasets_root_path) - extract_dataprofiles_metalearningdb(datasets_root_path) - extract_metafeatures_metalearningdb(datasets_root_path) diff --git a/alpha_automl/pipeline_search/Arena.py b/alpha_automl/pipeline_search/Arena.py deleted file mode 100644 index a7709f1b..00000000 --- a/alpha_automl/pipeline_search/Arena.py +++ /dev/null @@ -1,104 +0,0 @@ -import logging - -logger = logging.getLogger(__name__) -NUM_IT = 5 - - -class Arena(): - """ - An Arena class where any 2 agents can be pit against each other. - """ - def __init__(self, player1, player2, game, display=None, logfile=None): - """ - Input: - player 1,2: two functions that takes board as input, return action - game: Game object - display: a function that takes board as input and prints it (e.g. - display in othello/OthelloGame). Is necessary for verbose - mode. - - see othello/OthelloPlayers.py for an example. See pit.py for pitting - human players/other baselines with each other. - """ - self.player1 = player1 - self.player2 = player2 - self.game = game - self.display = display - if logfile is not None: - self.f = open(logfile, 'a') - - def playGame(self, verbose=False): - """ - Executes one episode of a game. 
- - Returns: - winner: player who won the game (1 if player1, -1 if player2) - """ - players = [self.player2, None, self.player1] - curPlayer = 1 - board = self.game.getInitBoard() - it = 0 - while self.game.getGameEnded(board, curPlayer) == 0 and it <= NUM_IT: - it += 1 - if verbose: - self.f.write(','.join(self.game.get_pipeline_primitives(board)) + '\n') - assert self.display - logger.debug("Turn %s", it) - logger.debug("Player %s", curPlayer) - self.display(board) - action = players[curPlayer+1](self.game.getCanonicalForm(board, curPlayer)) - # print('ACTION ', action) - valids = self.game.getValidMoves(self.game.getCanonicalForm(board, curPlayer), 1) - - # print('VALIDS ', valids) - if valids[action] != 0: - # print(action) - assert valids[action] > 0 - board, curPlayer = self.game.getNextState(board, curPlayer, action) - if verbose: - self.f.write(','.join(self.game.get_pipeline_primitives(board)) + '\n') - assert self.display - logger.debug("Turn %s", it) - logger.debug("Player %s", curPlayer) - self.display(board) - game_ended = self.game.getGameEnded(board, 1) - if verbose: - if game_ended == 1: - self.f.write('Working Pipeline\n') - elif game_ended == 2: - self.f.write('Non-Working Pipeline\n') - # return game_ended==1 and curPlayer == 1 - return self.game.getGameEnded(board, 1) - - def playGames(self, num, verbose=False): - """ - Plays num games in which player1 starts num/2 games and player2 starts - num/2 games. - - Returns: - oneWon: games won by player1 - twoWon: games won by player2 - """ - num = int(num/2) - oneWon = 0 - twoWon = 0 - if verbose: - self.f.write('Round 1\n') - for i in range(num): - if verbose: - self.f.write('Game '+str(i)+'\n') - if self.playGame(verbose=verbose) == 1: - oneWon += 1 - else: - twoWon += 1 - self.player1, self.player2 = self.player2, self.player1 - if verbose: - self.f.write('Round 2 - Players Swapped\n') - for i in range(num): - if verbose: - self.f.write('Game '+str(i)+'\n') - if self.playGame(verbose=verbose) == -1: - oneWon += 1 - else: - twoWon += 1 - return oneWon, twoWon diff --git a/alpha_automl/pipeline_search/Coach.py b/alpha_automl/pipeline_search/Coach.py deleted file mode 100644 index c0fea9ad..00000000 --- a/alpha_automl/pipeline_search/Coach.py +++ /dev/null @@ -1,122 +0,0 @@ -import logging -import numpy as np -from collections import deque -from alpha_automl.pipeline_search.Arena import Arena -from alpha_automl.pipeline_search.MCTS import MCTS -# from alphad3m.pipeline_search.utils import Bar, AverageMeter - - -logger = logging.getLogger(__name__) -np.random.seed(0) - - -class Coach(): - """ - This class executes the self-play + learning. It uses the functions defined - in Game and NeuralNet. args are specified in main.py. - """ - def __init__(self, game, nnet, args): - self.game = game - self.board = game.getInitBoard() - self.nnet = nnet - self.args = args - self.mcts = MCTS(self.game, self.nnet, self.args) - - def executeEpisode(self): - """ - This function executes one episode of self-play, starting with player 1. - As the game is played, each turn is added as a training example to - trainExamples. The game is played till the game ends. After the game - ends, the outcome of the game is used to assign values to each example - in trainExamples. - - It uses a temp=1 if episodeStep < tempThreshold, and thereafter - uses temp=0. - - Returns: - trainExamples: a list of examples of the form (canonicalBoard,pi,v) - pi is the MCTS informed policy vector, v is +1 if - the player eventually won the game, else -1. 
- """ - trainExamples = [] - self.board = self.game.getInitBoard() - self.curPlayer = 1 - episodeStep = 0 - - while True: - episodeStep += 1 - canonicalBoard = self.game.getCanonicalForm(self.board, self.curPlayer) - temp = int(episodeStep < self.args.get('tempThreshold')) - - pi = self.mcts.getActionProb(canonicalBoard, temp=temp) - - sym = self.game.getTrainExamples(canonicalBoard, pi) - - trainExamples.append(sym) - - action = np.random.choice(len(pi), p=pi) - - logger.debug('COACH ACTION %s', action) - self.board, self.curPlayer = self.game.getNextState(self.board, self.curPlayer, action) - - r = self.game.getGameEnded(self.board, self.curPlayer) - if r != 0: - break - - return trainExamples - - def learn(self): - """ - Performs numIters iterations with numEps episodes of self-play in each - iteration. After every iteration, it retrains neural network with - examples in trainExamples (which has a maximium length of maxlenofQueue). - It then pits the new neural network against the old one and accepts it - only if it wins >= updateThreshold fraction of games. - """ - - trainExamples = deque([], maxlen=self.args.get('maxlenOfQueue')) - for i in range(self.args.get('numIters')): - # bookkeeping - logger.debug('------ITER ' + str(i+1) + '------') - # eps_time = AverageMeter() - # bar = Bar('Self Play', max=self.args.get('numEps')) - # end = time.time() - - for eps in range(self.args.get('numEps')): - self.mcts = MCTS(self.game, self.nnet, self.args) # reset search tree - trainExamples += self.executeEpisode() - - # bookkeeping + plot progress - # eps_time.update(time.time() - end) - # end = time.time() - # bar.suffix = '({eps}/{maxeps}) Eps Time: {et:.3f}s | Total: {total:} | ETA: {eta:}'.format( - # eps=eps+1, maxeps=self.args.get('numEps'), et=eps_time.avg, total=bar.elapsed_td, eta=bar.eta_td) - # bar.next() - # bar.finish() - - # training new network, keeping a copy of the old one - self.nnet.save_checkpoint(folder=self.args.get('checkpoint'), filename='temp.pth.tar') - pnet = self.nnet.__class__(self.game) - pnet.load_checkpoint(folder=self.args.get('checkpoint'), filename='temp.pth.tar') - pmcts = MCTS(self.game, pnet, self.args) - boards, pis, vs = list(zip(*trainExamples)) - # logger.debug([board[self.game.m:self.game.m+self.game.p] for board in boards]) - self.nnet.train(trainExamples) - nmcts = MCTS(self.game, self.nnet, self.args) - - logger.debug('PITTING AGAINST PREVIOUS VERSION') - arena = Arena(lambda x: np.argmax(pmcts.getActionProb(x, temp=0)), - lambda x: np.argmax(nmcts.getActionProb(x, temp=0)), self.game, self.game.display, - self.args['stepsfile']) - pwins, nwins = arena.playGames(self.args.get('arenaCompare'), verbose=self.args['verbose']) - - logger.debug('EVALUATIONS ', self.game.evaluations) - logger.debug('NEW/PREV WINS : ' + str(nwins) + '/' + str(pwins)) - if float(nwins)/(pwins+nwins) < self.args['updateThreshold']: - logger.debug('REJECTING NEW MODEL') - self.nnet = pnet - - else: - logger.debug('ACCEPTING NEW MODEL') - self.nnet.save_checkpoint(folder=self.args['checkpoint'], filename='checkpoint_' + str(i) + '.pth.tar') - self.nnet.save_checkpoint(folder=self.args['checkpoint'], filename='best.pth.tar') diff --git a/alpha_automl/pipeline_search/Game.py b/alpha_automl/pipeline_search/Game.py deleted file mode 100644 index 62963f9a..00000000 --- a/alpha_automl/pipeline_search/Game.py +++ /dev/null @@ -1,112 +0,0 @@ -class Game(): - """ - This class specifies the base Game class. To define your own game, subclass - this class and implement the functions below. 
This works when the game is - two-player, adversarial and turn-based. - - Use 1 for player1 and -1 for player2. - - See othello/OthelloGame.py for an example implementation. - """ - def __init__(self): - pass - - def getInitBoard(self): - """ - Returns: - startBoard: a representation of the board (ideally this is the form - that will be the input to your neural network) - """ - pass - - def getBoardSize(self): - """ - Returns: - (x,y): a tuple of board dimensions - """ - pass - - def getActionSize(self): - """ - Returns: - actionSize: number of all possible actions - """ - pass - - def getNextState(self, board, player, action): - """ - Input: - board: current board - player: current player (1 or -1) - action: action taken by current player - - Returns: - nextBoard: board after applying action - nextPlayer: player who plays in the next turn (should be -player) - """ - pass - - def getValidMoves(self, board, player): - """ - Input: - board: current board - player: current player - - Returns: - validMoves: a binary vector of length self.getActionSize(), 1 for - moves that are valid from the current board and player, - 0 for invalid moves - """ - pass - - def getGameEnded(self, board, player): - """ - Input: - board: current board - player: current player (1 or -1) - - Returns: - r: 0 if game has not ended. 1 if player won, -1 if player lost. - """ - pass - - def getCanonicalForm(self, board, player): - """ - Input: - board: current board - player: current player (1 or -1) - - Returns: - canonicalBoard: returns canonical form of board. The canonical form - should be independent of player. For e.g. in chess, - the canonical form can be chosen to be from the pov - of white. When the player is white, we can return - board as is. When the player is black, we can invert - the colors and return the board. - """ - pass - - def getSymmetries(self, board, pi): - """ - Input: - board: current board - pi: policy vector of size self.getActionSize() - - Returns: - symmForms: a list of [(board,pi)] where each tuple is a symmetrical - form of the board and the corresponding pi vector. This - is used when training the neural network from examples. - """ - pass - - def stringRepresentation(self, board): - """ - Input: - - board: current board - - Returns: - boardString: a quick conversion of board to a string format. - Required by MCTS for hashing. - """ - pass diff --git a/alpha_automl/pipeline_search/MCTS.py b/alpha_automl/pipeline_search/MCTS.py deleted file mode 100644 index c660b016..00000000 --- a/alpha_automl/pipeline_search/MCTS.py +++ /dev/null @@ -1,191 +0,0 @@ -import math -import numpy as np -import logging - -logger = logging.getLogger(__name__) - -np.random.seed(0) - - -class MCTS(): - """ - This class handles the MCTS tree. - """ - - def __init__(self, game, nnet, args): - self.game = game - self.nnet = nnet - self.args = args - self.Qsa = {} # stores Q values for s,a (as defined in the paper) - self.Nsa = {} # stores #times edge s,a was visited - self.Ns = {} # stores #times board s was visited - self.Ps = {} # stores initial policy (returned by neural net) - - self.Es = {} # stores game.getGameEnded ended for board s - self.Vals = {} - self.Vs = {} # stores game.getValidMoves for board s - self.count = 0 - - def getActionProb(self, canonicalBoard, temp=1): - """ - This function performs numMCTSSims simulations of MCTS starting from - canonicalBoard. 
- - Returns: - probs: a policy vector where the probability of the ith action is - proportional to Nsa[(s,a)]**(1./temp) - """ - for i in range(self.args.get('numMCTSSims')): - logger.debug('MCTS SIMULATION %s', i + 1) - self.search(canonicalBoard) - - s = self.game.stringRepresentation(canonicalBoard) - counts = [0] * self.game.getActionSize() - counts = [self.Nsa[(s, a)] if (s, a) in self.Nsa else 0 for a in range(self.game.getActionSize())] - - if temp == 0: - bestA = np.argmax(counts) - probs = [0] * len(counts) - probs[bestA] = 1 - if np.sum(probs) == 0: - logger.debug('PROB ZERO') - return probs - - counts = [x ** (1. / temp) for x in counts] - if np.sum(counts) == 0: - probs = [1 / (len(counts))] * len(counts) - else: - non_zero_args = list(np.where(np.asarray(counts) > 0)[0]) - probs = [0] * len(counts) - for index in non_zero_args: - probs[index] = counts[index] / float(sum(counts)) - if np.sum(probs) == 0: - logger.debug('PROB ZERO') - return probs - - def search(self, canonicalBoard, player=1): - """ - This function performs one iteration of MCTS. It is recursively called - till a leaf node is found. The action chosen at each node is one that - has the maximum upper confidence bound as in the paper. - - Once a leaf node is found, the neural network is called to return an - initial policy P and a value v for the state. This value is propogated - up the search path. In case the leaf node is a terminal state, the - outcome is propogated up the search path. The values of Ns, Nsa, Qsa are - updated. - - NOTE: the return values are the negative of the value of the current - state. This is done since v is in [-1,1] and if v is the value of a - state for the current player, then its value is -v for the other player. - - Returns: - v: the negative of the value of the current canonicalBoard - """ - self.game.display(canonicalBoard) - - s = self.game.stringRepresentation(canonicalBoard) - game_ended = self.game.getGameEnded(canonicalBoard, player) - # logger.debug('GAME ENDED %s', game_ended) - - if s not in self.Es: - self.Es[s] = game_ended - if self.Es[s] != 0: - # terminal node - # Clear all previous moves - return self.Vals[s] if not self.Vals.get(s) is None else 0 - - if s not in self.Ps: - # leaf node - self.Ps[s], v = self.nnet.predict(self.game.getTrainBoard(canonicalBoard)) - logger.debug('Prediction %s', v) - # logger.debug('CALLING VALID MOVES') - valids = self.game.getValidMoves(canonicalBoard, 1) - self.Ps[s] = self.Ps[s] * valids # masking invalid moves - self.Ps[s] /= np.sum(self.Ps[s]) # renormalize - self.Vals[s] = v - self.Vs[s] = valids - self.Ns[s] = 0 - return v - - valids = self.Vs[s] - - # Check if valid moves are available. 
Quit if no more legal moves are possible - if not any(valids): - return 0 - - cur_best = -float('inf') - best_act = -1 - current_primitives = self.game.get_pipeline_primitives(canonicalBoard) - - current_pattern = ' '.join( - [self.game.grammar['RULES_PROBA']['TYPES'].get(p, {'type': p})['type'] for p in current_primitives]) - # pick the action with the highest upper confidence bound - actions = [] - alpha = 0.1 - metalearning_weights = {} - - if len(self.game.grammar['RULES_PROBA']['GLOBAL']) == 0: # It's a manual grammar, so only use the NN info - alpha = 1.0 - - for a in range(self.game.getActionSize()): - if valids[a]: - # logger.debug('MCTS ACTION %s', a) - global_proba = self.game.grammar['RULES_PROBA']['GLOBAL'].get(a + 1, (0, 0))[1] - local_proba = self.game.grammar['RULES_PROBA']['LOCAL'].get(current_pattern, {}).get(a + 1, (0, 0))[1] - correlation = global_proba * local_proba - metalearning_weights[a] = correlation - - if (s, a) in self.Qsa: - u = self.Qsa[(s, a)] + \ - self.args.get('cpuct') * (alpha * self.Ps[s][a] + (1 - alpha) * correlation) * \ - math.sqrt(self.Ns[s]) / (1 + self.Nsa[(s, a)]) - # u = (global_proba + local_proba) + self.Qsa[(s, a)] + self.args.get('cpuct') * - # math.sqrt(self.Ns[s]) / (1 + self.Nsa[(s, a)]) - # u = self.Qsa[(s, a)] + self.args.get('cpuct') * self.Ps[s][a] * - # math.sqrt(self.Ns[s]) / (1 + self.Nsa[(s, a)]) - else: - u = self.args.get('cpuct') * \ - (alpha * self.Ps[s][a] + (1 - alpha) * correlation) * math.sqrt(self.Ns[s]) # Q = 0 ? - # u = (global_proba + local_proba) + self.args.get('cpuct') * math.sqrt(self.Ns[s]) # Q = 0 ? - # u = self.args.get('cpuct') * self.Ps[s][a] * math.sqrt(self.Ns[s]) # Q = 0 ? - # print('Production:', self.game.grammar['RULES_PROBA']['GLOBAL'] - # .get(a + 1, ('None', 0))[0], correlation) - # print('Value of u:', u) - if u > cur_best: - cur_best = u - best_act = a - actions = [a] - elif u == cur_best: - actions.append(a) - - if len(actions) == sum(valids): # All the available actions have the same value - a = sorted(metalearning_weights.items(), key=lambda x: x[1], reverse=True)[0][0] - elif len(actions) > 1: - a = np.random.choice(np.asarray(actions)) - else: - a = best_act - # print('>>>>>>>>>>best a=%d' % a, 'local', self.game.grammar['RULES_PROBA']['LOCAL'] - # .get(current_pattern, {}).get(a + 1, ('None', 0))) - # print('>>>>>>>>>>best a=%d' % a, 'total', self.game.grammar['RULES_PROBA']['GLOBAL'].get(a+1, (0, 0))[1] - # * self.game.grammar['RULES_PROBA']['LOCAL'].get(current_pattern, {}).get(a + 1, (0, 0))[1]) - # logger.debug('BEST ACTIONS %s', actions) - # logger.debug('MCTS BEST ACTION %s', best_act) - next_s, next_player = self.game.getNextState(canonicalBoard, player, a) - next_s = self.game.getCanonicalForm(next_s, next_player) - # self.game.display(next_s) - - # logger.debug('NEXT STATE SEARCH RECURSION') - v = self.search(next_s, next_player) - - if (s, a) in self.Qsa: - self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[(s, a)] + v) / (self.Nsa[(s, a)] + 1) - self.Nsa[(s, a)] += 1 - - else: - self.Qsa[(s, a)] = v - self.Nsa[(s, a)] = 1 - - self.Ns[s] += 1 - - return v diff --git a/alpha_automl/pipeline_search/NeuralNet.py b/alpha_automl/pipeline_search/NeuralNet.py deleted file mode 100644 index 7fcfd644..00000000 --- a/alpha_automl/pipeline_search/NeuralNet.py +++ /dev/null @@ -1,50 +0,0 @@ -class NeuralNet(): - """ - This class specifies the base NeuralNet class. To define your own neural - network, subclass this class and implement the functions below. 
The neural - network does not consider the current player, and instead only deals with - the canonical form of the board. - - See othello/NNet.py for an example implementation. - """ - - def __init__(self, game): - pass - - def train(self, examples): - """ - This function trains the neural network with examples obtained from - self-play. - - Input: - examples: a list of training examples, where each example is of form - (board, pi, v). pi is the MCTS informed policy vector for - the given board, and v is its value. The examples has - board in its canonical form. - """ - pass - - def predict(self, board): - """ - Input: - board: current board in its canonical form. - - Returns: - pi: a policy vector for the current board- a numpy array of length - game.getActionSize - v: a float in [-1,1] that gives the value of the current board - """ - pass - - def save_checkpoint(self, folder, filename): - """ - Saves the current neural network (with its parameters) in - folder/filename - """ - pass - - def load_checkpoint(self, folder, filename): - """ - Loads parameters of the neural network from folder/filename - """ - pass diff --git a/alpha_automl/pipeline_search/agent_environment.py b/alpha_automl/pipeline_search/agent_environment.py new file mode 100644 index 00000000..b99686c7 --- /dev/null +++ b/alpha_automl/pipeline_search/agent_environment.py @@ -0,0 +1,147 @@ +import logging + +import gymnasium as gym +import numpy as np +from gymnasium.spaces import Box, Dict, Discrete +from ray.rllib.env.env_context import EnvContext + +logger = logging.getLogger(__name__) + + +class AutoMLEnv(gym.Env): + """ + Customized environment for RLlib Reinforcement Learning. + reset: reset the environment to the initial state + step: take an action and return the next state, reward, done, and info + rewards in detail: + - win: + - CLASSIFICATION: 10 + (pipeline score) ^ 2 * 100 + - REGRESSION: 10 + (100 / pipeline score) + - not end: 1 + - invalid: 10 + - bad: -1 + """ + + def __init__(self, config: EnvContext): + self.game = config["game"] # PipelineGame + self.board = self.game.getInitBoard() # initial board + self.step_stack = ["S"] # stack for steps + self.metadata = self.board[: self.game.m] + self.observation_space = Dict( + { + "board": Box( + 0, 85, shape=(self.game.p + self.game.m,), dtype=np.uint8 + ), # Ray env board contains pipeline and metadata + } + ) + self.action_spaces = ( + self.generate_action_spaces() + ) # number of actions for each step + self.max_actions = max(list(self.action_spaces.values())) # max number of actions (depends on the largest step in the grammar, i.e. 
CLASSIFIER) + self.action_offsets = ( + self.generate_action_offsets() + ) # offset for each step, for translating action to PipelineGame action + self.action_space = Discrete(self.max_actions) # Ray env action space + + def reset(self, *, seed=None, options=None): + self.num_steps = 0 + self.step_stack = ["S"] + self.board = self.game.getInitBoard() + self.metadata = self.board[: self.game.m] + + return {"board": np.array(self.board).astype(np.uint8)}, {} + + def step(self, action): + curr_step = self.step_stack.pop() + offseted_action = self.action_offsets[curr_step] + action + valid_action_size = self.action_spaces[curr_step] + # Check the action is illegal + valid_moves = self.game.getValidMoves(self.board) + if action >= valid_action_size or valid_moves[offseted_action - 1] != 1: + return ( + {"board": np.array(self.board).astype(np.uint8)}, + -1, + True, + False, + {}, + ) + + # Check the action is out of order + move_type, non_terminals_moves = self.extract_action_details(offseted_action) + if move_type != curr_step: + return ( + {"board": np.array(self.board).astype(np.uint8)}, + -100, + True, + False, + {}, + ) + if ( + non_terminals_moves[0] != "E" + and non_terminals_moves[0].upper() == non_terminals_moves[0] + ): + self.step_stack.extend(non_terminals_moves[::-1]) + + # update number of steps + self.num_steps += 1 + + # update board with new action + self.board = self.game.getNextState(self.board, offseted_action - 1) + + # reward: win(1) - pipeline score, not end(0) - 1, bad(2) - -1 + reward = 0 + game_end = self.game.getGameEnded(self.board) + if game_end == 1: # pipeline score over threshold + try: + if self.game.problem == "REGRESSION": + reward = 10 + (100 / self.game.getEvaluation(self.board)) + else: + reward = 10 + (self.game.getEvaluation(self.board)) ** 2 * 100 + except Exception as e: + logger.critical(f"[PIPELINE FOUND] Error happened: {str(e)}") + elif game_end == 2: # finished but invalid + reward = 10 + else: + reward = 1 + + # done & truncated + truncated = self.num_steps >= 20 + done = game_end or truncated + + return ( + {"board": np.array(self.board).astype(np.uint8)}, + reward, + done, + truncated, + {}, + ) + + def extract_action_details(self, action): + rules = self.game.grammar["RULES"] + move_string = list(rules.keys())[list(rules.values()).index(action)] + split_move = move_string.split("->") + move_type = split_move[0].strip() + non_terminals_moves = split_move[1].strip().split(" ") + return move_type, non_terminals_moves + + def generate_action_spaces(self): + action_spaces = {} + for action in self.game.grammar["RULES"].values(): + move_type, non_terminals_moves = self.extract_action_details(action) + + if move_type not in action_spaces: + action_spaces[move_type] = 1 + else: + action_spaces[move_type] += 1 + + return action_spaces + + def generate_action_offsets(self): + action_offsets = {} + for action in self.game.grammar["RULES"].values(): + move_type, non_terminals_moves = self.extract_action_details(action) + + if move_type not in action_offsets: + action_offsets[move_type] = action + + return action_offsets diff --git a/alpha_automl/pipeline_search/agent_lab.py b/alpha_automl/pipeline_search/agent_lab.py new file mode 100644 index 00000000..a1614f5b --- /dev/null +++ b/alpha_automl/pipeline_search/agent_lab.py @@ -0,0 +1,196 @@ +import json +import logging +import os +import time +from datetime import datetime + +import ray +from alpha_automl.pipeline_search.agent_environment import AutoMLEnv +from ray.rllib.policy import Policy +from 
ray.rllib.utils.checkpoints import get_checkpoint_info +from ray.tune.logger import pretty_print +from ray.tune.registry import get_trainable_cls + +logger = logging.getLogger(__name__) + + +def pipeline_search_rllib(game, time_bound, checkpoint_load_folder, checkpoint_save_folder): + """ + Search for pipelines using Rllib + """ + ray.init(local_mode=True, num_cpus=8, logging_level=logging.CRITICAL, log_to_driver=False) + num_cpus = int(ray.available_resources()["CPU"]) + + # load checkpoint or create a new one + algo = load_rllib_checkpoint(game, checkpoint_load_folder, num_rollout_workers=7) + logger.debug("Create Algo object done") + + # train model + train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_folder) + logger.debug("Training done") + ray.shutdown() + + +def load_rllib_checkpoint(game, checkpoint_load_folder, num_rollout_workers): + config = ( + get_trainable_cls("PPO") + .get_default_config() + # or "corridor" if registered above + .environment(AutoMLEnv, env_config={"game": game}) + .framework("torch") + # Use GPUs iff `RLLIB_NUM_GPUS` env var set to > 0. + .resources( + # num_gpus=1, + # num_gpus_per_worker=1 / (num_rollout_workers + 1), + num_cpus_per_worker=1, + ) + .rollouts(num_rollout_workers=num_rollout_workers) + .training( + gamma=0.99, + clip_param=0.3, + kl_coeff=0.3, + entropy_coeff=0.05, + train_batch_size=10000, + ) + ) + config.lr = 1e-5 + config.simple_optimizer = True + logger.debug("Create Config done") + + # Checking if the list is empty or not + if not contain_checkpoints(checkpoint_load_folder): + logger.debug("Cannot read checkpoint, create a new one.") + return config.build() + else: + algo = config.build() + weights = load_rllib_policy_weights(checkpoint_load_folder) + + algo.set_weights(weights) + # Restore the old state. 
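As an illustration, the weight-only restore above can be summarized in a minimal sketch (restore_weights_only is an illustrative helper name; Ray RLlib 2.x is assumed). Copying just the policy tensors keeps the freshly built config, for example a different number of rollout workers, whereas algo.restore() would also bring back the old config, optimizer state and iteration counters.

from ray.rllib.policy import Policy

def restore_weights_only(algo, checkpoint_dir):
    # Pull only the trained policy out of an Algorithm checkpoint directory...
    policies = Policy.from_checkpoint(checkpoint_dir)   # e.g. {"default_policy": Policy}
    weights = {"default_policy": policies["default_policy"].get_weights()}
    # ...and copy its tensors into the freshly built Algorithm object.
    algo.set_weights(weights)
    return algo
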
+ # algo.restore(load_folder) + # checkpoint_info = get_checkpoint_info(load_folder) + return algo + + +def train_rllib_model(algo, time_bound, checkpoint_load_folder, checkpoint_save_folder): + timeout = time.time() + time_bound + result = algo.train() + last_best = result["episode_reward_mean"] + best_unchanged_iter = 1 + logger.debug(pretty_print(result)) + + while True: + if ( + time.time() > timeout + or (best_unchanged_iter >= 600 and result["episode_reward_mean"] >= 0) + # or result["episode_reward_mean"] >= 70 + ): + logger.debug(f"Training timeout reached") + break + + if contain_checkpoints(checkpoint_save_folder): + weights = load_rllib_policy_weights(checkpoint_save_folder) + algo.set_weights(weights) + elif contain_checkpoints(checkpoint_load_folder): + weights = load_rllib_policy_weights(checkpoint_load_folder) + algo.set_weights(weights) + result = algo.train() + logger.debug(pretty_print(result)) + # stop training of the target train steps or reward are reached + if result["episode_reward_mean"] > last_best: + last_best = result["episode_reward_mean"] + best_unchanged_iter = 1 + save_rllib_checkpoint(algo, checkpoint_save_folder) + else: + best_unchanged_iter += 1 + algo.stop() + + +def load_rllib_policy_weights(checkpoint_folder): + logger.debug(f"Synchronizing model weights...") + policy = Policy.from_checkpoint(checkpoint_folder) + policy = policy["default_policy"] + weights = policy.get_weights() + + weights = {"default_policy": weights} + + return weights + + +def save_rllib_checkpoint(algo, checkpoint_save_folder): + save_result = algo.save(checkpoint_dir=checkpoint_save_folder) + path_to_checkpoint = save_result.checkpoint.path + + logger.debug( + f"An Algorithm checkpoint has been created inside directory: '{path_to_checkpoint}'." + ) + + +def dump_result_to_json(primitives, task_start, score, output_folder=None): + output_path = generate_json_path(output_folder) + # Read JSON data from input file + if not os.path.exists(output_path) or os.path.getsize(output_path) == 0: + with open(output_path, "w") as f: + json.dump({}, f) + with open(output_path, "r") as f: + data = json.load(f) + + timestamp = str(datetime.now() - task_start) + # strftime("%Y-%m-%d %H:%M:%S") + + # Check for duplicate elements + if primitives in data.values(): + return + data[score] = primitives + + # Write unique elements to output file + with open(output_path, "w") as f: + json.dump(data, f) + + +def read_result_to_pipeline(builder, output_folder=None): + output_path = generate_json_path(output_folder) + + pipelines = [] + # Read JSON data from input file + if not os.path.exists(output_path) or os.path.getsize(output_path) == 0: + return [] + with open(output_path, "r") as f: + data = json.load(f) + + # Check for duplicate elements + for score, primitives in sorted(data.items()): + pipeline = builder.make_pipeline(primitives) + if pipeline: + pipelines.append(pipeline) + + return pipelines + + +def generate_json_path(output_folder=None): + output_path = os.path.join(output_folder, "result.json") + + return output_path + + +def contain_checkpoints(folder_path): + if folder_path is None: + return False + + file_list = os.listdir(folder_path) + + if [f for f in file_list if not f.startswith(".")] == []: + return False + + if ( + "algorithm_state.pkl" in file_list + and "policies" in file_list + and "rllib_checkpoint.json" in file_list + ): + return True + else: + logger.debug( + f"Checkpoint folder {folder_path} does not contain all necessary files, files: {file_list}." 
+ ) + + return False diff --git a/alpha_automl/pipeline_search/pipeline/PipelineGame.py b/alpha_automl/pipeline_search/game.py similarity index 67% rename from alpha_automl/pipeline_search/pipeline/PipelineGame.py rename to alpha_automl/pipeline_search/game.py index 467e8b17..dc798d42 100644 --- a/alpha_automl/pipeline_search/pipeline/PipelineGame.py +++ b/alpha_automl/pipeline_search/game.py @@ -1,11 +1,7 @@ from __future__ import print_function -import os -import pickle import math import logging -from copy import deepcopy -from alpha_automl.pipeline_search.Game import Game -from alpha_automl.pipeline_search.pipeline.PipelineLogic import Board +from alpha_automl.pipeline_search.game_logic import Board import numpy as np import traceback import time @@ -13,11 +9,10 @@ logger = logging.getLogger(__name__) -class PipelineGame(Game): +class PipelineGame(): # FIXEME: Maybe the input parameters can be in json - def __init__(self, input={}, eval_pipeline=None, args=None): + def __init__(self, input=None, eval_pipeline=None): self.steps = 0 - self.args = input['ARGS'] self.evaluations = {} self.eval_times = {} @@ -31,21 +26,7 @@ def __init__(self, input={}, eval_pipeline=None, args=None): self.data_type = input['DATA_TYPE'].upper() self.metric = input['METRIC'] self.dataset = input['DATASET'] - - self.dataset_metafeatures = input['DATASET_METAFEATURES'] - if self.dataset_metafeatures is None: - metafeatures_path = args.get('metafeatures_path') - if metafeatures_path is not None: - metafeatures_file = os.path.join(metafeatures_path, args['dataset'] + '_metafeatures.pkl') - if os.path.isfile(metafeatures_file): - m_f = open(metafeatures_file, 'rb') - self.dataset_metafeatures = pickle.load(m_f)[args['dataset']] - - if self.dataset_metafeatures is None: - logger.warning('No Dataset Metafeatures specified - Initializing to empty') - self.dataset_metafeatures = [] - else: - self.dataset_metafeatures = list(np.nan_to_num(np.asarray(self.dataset_metafeatures))) + self.dataset_metafeatures = list(np.nan_to_num(np.asarray(input['DATASET_METAFEATURES']))) self.m = len(self.dataset_metafeatures)+2 self.p = input['PIPELINE_SIZE'] @@ -69,18 +50,18 @@ def getActionSize(self): board = Board(self.m, self.grammar, self.pipeline_size, self.metric) return len(board.valid_moves) - def getNextState(self, board, player, action): - # if player takes action on board, return next (board,player) + def getNextState(self, board, action): # action must be a valid move b = Board(self.m, self.grammar, self.pipeline_size, self.metric) b.set_metafeatures(board) b.set_pipeline(board) # logger.debug('PREV STATE %s', b.pieces_p) - b.execute_move(action, player) + b.execute_move(action) # logger.debug('NEXT STATE %s', b.pieces_p) - return (b.pieces_m+b.pieces_p, -player) - def getValidMoves(self, board, player): + return b.pieces_m+b.pieces_p + + def getValidMoves(self, board): # return a fixed size binary vector b = Board(self.m, self.grammar, self.pipeline_size, self.metric) b.set_metafeatures(board) @@ -102,7 +83,7 @@ def getEvaluation(self, board): if eval_val is None: self.steps = self.steps + 1 try: - eval_val = self.eval_pipeline(pipeline, 'AlphaAutoML') + eval_val = self.eval_pipeline(pipeline) except Exception: logger.warning('Error in Pipeline Execution %s', eval_val) traceback.print_exc() @@ -113,41 +94,33 @@ def getEvaluation(self, board): return eval_val - def getGameEnded(self, board, player, eval_val=None): - # return 0 if not ended, 1 if x won, -1 if x lost - # player = 1 + def getGameEnded(self, board, 
eval_val=None): + # return 0 if not ended, 1 if x won, 2 if x lost b = Board(self.m, self.grammar, self.pipeline_size, self.metric) b.set_metafeatures(board) b.set_pipeline(board) if not b.is_terminal_pipeline(): return 0 - if len(self.evaluations) > 0: - sorted_evals = sorted([eval for eval in list(self.evaluations.values()) if eval != float('inf')]) - if len(sorted_evals) > 0: - if 'error' in self.metric.lower(): - win_threshold = sorted_evals[0] - else: - win_threshold = sorted_evals[-1] - b.win_threshold = win_threshold +# if len(self.evaluations) > 0: +# sorted_evals = sorted([eval for eval in list(self.evaluations.values()) if eval != float('inf')]) +# if len(sorted_evals) > 0: +# if 'error' in self.metric.lower(): +# win_threshold = sorted_evals[0] +# else: +# win_threshold = sorted_evals[-1] +# b.win_threshold = win_threshold eval_val = self.getEvaluation(board) - if b.findWin(player, eval_val): - logger.debug('findwin %s', player) + if b.findWin(eval_val): + logger.debug('Win') return 1 - if b.findWin(-player, eval_val): - logger.debug('findwin %', -player) - return -1 if b.has_legal_moves(): return 0 return 2 - def getCanonicalForm(self, board, player): - # return state if player==1, else return -state if player==-1 - return deepcopy(board) - def stringRepresentation(self, board): # 3x3 numpy array (canonical board) return np.asarray(board).tostring() diff --git a/alpha_automl/pipeline_search/pipeline/PipelineLogic.py b/alpha_automl/pipeline_search/game_logic.py similarity index 96% rename from alpha_automl/pipeline_search/pipeline/PipelineLogic.py rename to alpha_automl/pipeline_search/game_logic.py index 5ffa0b6d..21810cb1 100644 --- a/alpha_automl/pipeline_search/pipeline/PipelineLogic.py +++ b/alpha_automl/pipeline_search/game_logic.py @@ -16,7 +16,7 @@ class Board(): - def __init__(self, m=30, grammar={}, pipeline_size=6, metric='f1macro', win_threshold=0.6): + def __init__(self, m=30, grammar=None, pipeline_size=6, metric='accuracy', win_threshold=0.01): "Set up initial board configuration." 
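For reference, a small worked sketch of how the getGameEnded codes map to the rewards in AutoMLEnv.step (sketch_reward is an illustrative name; the constants come from agent_environment.py above).

def sketch_reward(game_end, problem, pipeline_score):
    # game_end comes from PipelineGame.getGameEnded:
    #   0 = pipeline not finished, 1 = finished and scored over the threshold,
    #   2 = finished but could not be evaluated
    if game_end == 1:
        if problem == "REGRESSION":
            return 10 + 100 / pipeline_score   # lower error gives a larger reward
        return 10 + pipeline_score ** 2 * 100  # e.g. accuracy 0.9 gives 10 + 81 = 91
    if game_end == 2:
        return 10                              # terminal but invalid pipeline
    return 1                                   # intermediate step: keep building

Illegal or out-of-order actions never reach this mapping: step() returns -1 for an invalid move and -100 for an action that belongs to the wrong grammar step, ending the episode.
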
self.terminals = grammar['TERMINALS'] @@ -61,7 +61,7 @@ def is_terminal_pipeline(self): return False return True - def findWin(self, player, eval_val=None): + def findWin(self, eval_val=None): """Find win of the given color in row, column, or diagonal (1 for x, -1 for o)""" if not any(self[0:]): @@ -135,7 +135,7 @@ def get_train_board(self): def get_board_size(self): return self.m+(len(self.terminals)+len(self.non_terminals)) - def execute_move(self, action, player): + def execute_move(self, action): """Perform the given move on the board; color gives the color of the piece to play (1=x,-1=o) """ diff --git a/alpha_automl/pipeline_search/pipeline/NNet.py b/alpha_automl/pipeline_search/pipeline/NNet.py deleted file mode 100644 index f364f4ab..00000000 --- a/alpha_automl/pipeline_search/pipeline/NNet.py +++ /dev/null @@ -1,140 +0,0 @@ -import os -import numpy as np -import logging -from alpha_automl.pipeline_search.NeuralNet import NeuralNet -import torch -import torch.optim as optim -from torch.autograd import Variable -from alpha_automl.pipeline_search.pipeline.PipelineNNet import PipelineNNet as onnet - - -logger = logging.getLogger(__name__) - -args = dict({ - 'lr': 0.001, - 'dropout': 0.3, - 'epochs': 2, - 'batch_size': 64, - 'num_channels': 512, -}) - -device = 'cuda' if torch.cuda.is_available() else 'cpu' - - -class NNetWrapper(NeuralNet): - def __init__(self, game): - self.nnet = onnet(game, args) - self.action_size = game.getActionSize() - self.board_size = game.getBoardSize() - if device == 'cuda': - self.nnet.cuda() - - def train(self, examples): - """ - examples: list of examples, each example is of form (board, pi, v) - """ - optimizer = optim.Adam(self.nnet.parameters()) - - for epoch in range(args.get('epochs')): - logger.debug('EPOCH ::: %s', str(epoch+1)) - self.nnet.train() - # data_time = AverageMeter() - # batch_time = AverageMeter() - # pi_losses = AverageMeter() - # v_losses = AverageMeter() - # end = time.time() - - batch_size = args.get('batch_size') - # bar = Bar('Training Net', max=int(len(examples)/batch_size)) - batch_idx = 0 - - while batch_idx < int(len(examples)/batch_size): - sample_ids = np.random.randint(len(examples), size=batch_size) - boards, pis, vs = list(zip(*[examples[i] for i in sample_ids])) - boards = torch.FloatTensor(np.array(boards).astype(np.float64)) - target_pis = torch.FloatTensor(np.array(pis)) - target_vs = torch.FloatTensor(np.array(vs).astype(np.float64)) - - # predict - if device == 'cuda': - boards, target_pis, target_vs = Variable(boards.contiguous().cuda(), requires_grad=True), \ - Variable(target_pis.contiguous().cuda(), requires_grad=True), \ - Variable(target_vs.contiguous().cuda(), requires_grad=True) - else: - boards, target_pis, target_vs = Variable(boards), Variable(target_pis), Variable(target_vs) - - # measure data loading time - # data_time.update(time.time() - end) - - # compute output - # print(boards) - out_pi, out_v = self.nnet(boards) - l_pi = self.loss_pi(target_pis, out_pi) - l_v = self.loss_v(target_vs, out_v) - total_loss = l_pi + l_v - - # record loss - # pi_losses.update(l_pi.data, boards.size(0)) - # v_losses.update(l_v.data, boards.size(0)) - - # compute gradient and do SGD step - optimizer.zero_grad() - total_loss.backward() - optimizer.step() - - # measure elapsed time - # batch_time.update(time.time() - end) - # end = time.time() - batch_idx += 1 - - def predict(self, board): - """ - board: np array with board - """ - # timing - # start = time.time() - # print('BOARD\n', board) - - # preparing input - board 
= torch.from_numpy(np.array(board[0:self.board_size], dtype='f')).cuda().float() if device == 'cuda' \ - else torch.from_numpy(np.array(board[0:self.board_size], dtype='f')) - # board = torch.FloatTensor(board[0:self.board_x]) - if device == 'cuda': - board = board.contiguous().cuda() - board = Variable(board, volatile=True) - board = board.view(1, self.board_size) - - self.nnet.eval() - pi, v = self.nnet(board) - - # print('PROBABILITY ', torch.exp(pi).data.cpu().numpy()[0]) - # print('VALUE ', v.data.cpu().numpy()[0]) - - # logger.debug('PREDICTION TIME TAKEN : {0:03f}'.format(time.time()-start)) - return torch.exp(pi).data.cpu().numpy()[0], v.data.cpu().numpy()[0][0] - - def loss_pi(self, targets, outputs): - return -torch.sum(targets*outputs)/targets.size()[0] - - def loss_v(self, targets, outputs): - return torch.sum((targets-outputs.view(-1))**2)/targets.size()[0] - - def save_checkpoint(self, folder='checkpoint', filename='checkpoint.pth.tar'): - filepath = os.path.join(folder, filename) - if not os.path.exists(folder): - logger.debug("Checkpoint Directory does not exist! Making directory {}".format(folder)) - os.mkdir(folder) - else: - logger.debug("Checkpoint Directory exists! ") - torch.save({ - 'state_dict': self.nnet.state_dict(), - }, filepath) - - def load_checkpoint(self, folder='checkpoint', filename='checkpoint.pth.tar'): - # https://github.com/pytorch/examples/blob/master/imagenet/main.py#L98 - filepath = os.path.join(folder, filename) - if not os.path.exists(filepath): - raise ("No model in path {}".format(folder)) - - checkpoint = torch.load(filepath) - self.nnet.load_state_dict(checkpoint['state_dict']) diff --git a/alpha_automl/pipeline_search/pipeline/PipelineNNet.py b/alpha_automl/pipeline_search/pipeline/PipelineNNet.py deleted file mode 100644 index 8c1975fc..00000000 --- a/alpha_automl/pipeline_search/pipeline/PipelineNNet.py +++ /dev/null @@ -1,35 +0,0 @@ -import logging - -import torch -import torch.nn as nn -import torch.nn.functional as F - -logger = logging.getLogger(__name__) - - -class PipelineNNet(nn.Module): - def __init__(self, game, args): - # game params - self.action_size = game.getActionSize() - self.board_size = game.getBoardSize() - self.problem = game.problem - self.args = args - - super(PipelineNNet, self).__init__() - hlayer = 512 - torch.manual_seed(1) - self.lstm = nn.LSTM(self.board_size, hlayer, 2) - self.probFC = nn.Linear(hlayer, self.action_size) - self.valueFC = nn.Linear(hlayer, 1) - - def forward(self, s): - s = s.view(-1, 1, self.board_size) - lstm_out, hidden = self.lstm(s) - s = lstm_out[:, -1] - pi = self.probFC(s) # batch_size x 512 - v = self.valueFC(s) # batch_size x 512 - - if self.problem == 'CLASSIFICATION': - return F.log_softmax(pi, 1), F.sigmoid(v) - else: - return F.log_softmax(pi, 1), v diff --git a/alpha_automl/pipeline_search/pipeline/PipelinePlayers.py b/alpha_automl/pipeline_search/pipeline/PipelinePlayers.py deleted file mode 100644 index e69de29b..00000000 diff --git a/alpha_automl/pipeline_search/pipeline/__init__.py b/alpha_automl/pipeline_search/pipeline/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/alpha_automl/pipeline_search/utils.py b/alpha_automl/pipeline_search/utils.py deleted file mode 100644 index ffda3a9f..00000000 --- a/alpha_automl/pipeline_search/utils.py +++ /dev/null @@ -1,3 +0,0 @@ -class dotdict(dict): - def __getattr__(self, name): - return self[name] diff --git a/alpha_automl/pipeline_synthesis/setup_search.py b/alpha_automl/pipeline_synthesis/setup_search.py 
index 636e7e29..1bcc4de8 100644 --- a/alpha_automl/pipeline_synthesis/setup_search.py +++ b/alpha_automl/pipeline_synthesis/setup_search.py @@ -1,69 +1,138 @@ -import os -import sys -import signal import logging -from os.path import join -from alpha_automl.grammar_loader import load_automatic_grammar, load_manual_grammar -from alpha_automl.pipeline_search.Coach import Coach -from alpha_automl.pipeline_search.pipeline.NNet import NNetWrapper -from alpha_automl.pipeline_search.pipeline.PipelineGame import PipelineGame +import sys +from datetime import datetime +from os.path import dirname, join + +from alpha_automl.grammar_loader import load_manual_grammar +from alpha_automl.pipeline_search.agent_lab import (contain_checkpoints, + dump_result_to_json, + pipeline_search_rllib, + read_result_to_pipeline) +from alpha_automl.pipeline_search.game import PipelineGame from alpha_automl.pipeline_synthesis.pipeline_builder import BaseBuilder from alpha_automl.scorer import score_pipeline from alpha_automl.utils import hide_logs logger = logging.getLogger(__name__) +DEFAULT_CHECKPOINT_PATH = join(dirname(__file__), "../resource/checkpoints/") config = { - 'PROBLEM_TYPES': { - 'CLASSIFICATION': 1, - 'REGRESSION': 2, - 'CLUSTERING': 3, - 'NA': 4, - 'TIME_SERIES_FORECAST': 5, - 'SEMISUPERVISED': 6, - }, - 'DATA_TYPES': {'TABULAR': 1, 'GRAPH': 2, 'IMAGE': 3}, - 'PIPELINE_SIZE': 8, - 'ARGS': { - 'numIters': 25, - 'numEps': 5, - 'tempThreshold': 15, - 'updateThreshold': 0.6, - 'maxlenOfQueue': 200000, - 'numMCTSSims': 5, - 'arenaCompare': 40, - 'cpuct': 1, - 'load_model': False, - 'metafeatures_path': '/d3m/data/metafeatures', - 'verbose': True, + "PROBLEM_TYPES": { + "CLASSIFICATION": 1, + "REGRESSION": 2, + "CLUSTERING": 3, + "TIME_SERIES_FORECAST": 4, + "SEMISUPERVISED": 5, + "NA": 6, }, + "DATA_TYPES": {"TABULAR": 1, "TEXT": 2, "IMAGE": 3, "VIDEO": 4, "MULTIMODAL": 5}, + "PIPELINE_SIZE": 10, } -def signal_handler(queue, signum): - logger.debug(f'Receiving signal {signum}, terminating process') - queue.put('DONE') - # TODO: Should it save the last status of the NN model? 
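For completeness, a quick illustration of the DATASET_METAFEATURES vector that update_config() and compute_metafeatures(), both defined later in this file, feed into the game board. The dataset below is hypothetical; the padding to eight entries mirrors update_config().

metadata = {
    "missing_values": True,
    "nonnumeric_columns": {
        "CATEGORICAL_ENCODER": ["color", "city"],
        "DATETIME_ENCODER": ["purchase_date"],
    },
}
# compute_metafeatures("accuracy_score", metadata) returns:
#   [1,            # scoring type (1 classification, 2 regression, 3 clustering)
#    1,            # imputation needed
#    1,            # non-numeric columns present
#    0, 2, 1, 0]   # counts of TEXT / CATEGORICAL / DATETIME / IMAGE columns
# update_config() then pads it to eight entries: [1, 1, 1, 0, 2, 1, 0, 0]
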
- sys.exit(0) +def search_pipelines(X, y, scoring, splitting_strategy, task_name, time_bound, automl_hyperparams, metadata, output_folder, checkpoints_folder): + builder = BaseBuilder(metadata, automl_hyperparams) + all_primitives = builder.all_primitives + ensemble_pipelines_hash = set() + + task_start = datetime.now() + + def evaluate_pipeline(primitives): + has_repeated_classifiers = check_repeated_classifiers( + primitives, all_primitives, ensemble_pipelines_hash + ) + + if has_repeated_classifiers: + logger.info("Repeated classifiers detected in ensembles, ignoring pipeline") + return None + + pipeline = builder.make_pipeline(primitives) + score = None + + if pipeline is not None: + alphaautoml_pipeline = score_pipeline(pipeline, X, y, scoring, splitting_strategy, task_name) + if alphaautoml_pipeline is not None: + score = alphaautoml_pipeline.get_score() + if score is not None: + dump_result_to_json(primitives, task_start, score, output_folder) + return score + + if task_name is None: + task_name = "NA" + + task_name_id = task_name + "_TASK" + include_primitives = automl_hyperparams["include_primitives"] + exclude_primitives = automl_hyperparams["exclude_primitives"] + new_primitives = automl_hyperparams["new_primitives"] + use_imputer = metadata["missing_values"] + nonnumeric_columns = metadata["nonnumeric_columns"] + + logger.debug("Creating a manual grammar") + grammar = load_manual_grammar( + task_name_id, + nonnumeric_columns, + use_imputer, + new_primitives, + include_primitives, + exclude_primitives, + ) + + metric = scoring._score_func.__name__ + config_updated = update_config(task_name, metric, grammar, metadata) + checkpoint_load_folder = ( + checkpoints_folder + if contain_checkpoints(checkpoints_folder) + else DEFAULT_CHECKPOINT_PATH + ) + checkpoint_save_folder = ( + checkpoints_folder + if checkpoints_folder is not None + else DEFAULT_CHECKPOINT_PATH + ) + game = PipelineGame(config_updated, evaluate_pipeline) + pipeline_search_rllib( + game, time_bound, checkpoint_load_folder, checkpoint_save_folder + ) + logger.debug("Search completed") + results = read_result_to_pipeline(builder, output_folder) + + # queue.put('DONE') + return results + +def update_config(task_name, metric, grammar, metadata): + config["PROBLEM"] = task_name + config["DATA_TYPE"] = "TABULAR" + config["METRIC"] = metric + config["DATASET"] = f"DATASET_{task_name}" + config["GRAMMAR"] = grammar + metafeatures = compute_metafeatures(metric, metadata) + config["DATASET_METAFEATURES"] = metafeatures + [0] * (8 - len(metafeatures)) -def check_repeated_classifiers(pipeline_primitives, all_primitives, ensemble_pipelines_hash): + return config + + +def check_repeated_classifiers( + pipeline_primitives, all_primitives, ensemble_pipelines_hash +): # Verify if the classifiers are repeated in the ensembles (regardless of the order) classifiers = [] - pipeline_hash = '' + pipeline_hash = "" has_ensemble_primitive = False has_repeated_classifiers = False for primitive_name in pipeline_primitives: - primitive_type = all_primitives[primitive_name]['type'] + primitive_type = all_primitives[primitive_name]["type"] - if primitive_type == 'CLASSIFIER': + if primitive_type == "CLASSIFIER": classifiers.append(primitive_name) - elif primitive_type == 'MULTI_ENSEMBLER': + elif primitive_type == "MULTI_ENSEMBLER": has_ensemble_primitive = True pipeline_hash += primitive_name - if len(classifiers) != len(set(classifiers)): # All classifiers should be different + if len(classifiers) != len( + set(classifiers) + ): # All 
classifiers should be different has_repeated_classifiers = True else: pipeline_hash += primitive_name @@ -74,7 +143,7 @@ def check_repeated_classifiers(pipeline_primitives, all_primitives, ensemble_pip if has_repeated_classifiers: return True - pipeline_hash += ''.join(sorted(classifiers)) + pipeline_hash += "".join(sorted(classifiers)) if pipeline_hash in ensemble_pipelines_hash: return True @@ -83,108 +152,67 @@ def check_repeated_classifiers(pipeline_primitives, all_primitives, ensemble_pip return False -def search_pipelines(X, y, scoring, splitting_strategy, task_name, automl_hyperparams, metadata, output_folder, verbose, - queue): - signal.signal(signal.SIGTERM, lambda signum, frame: signal_handler(queue, signum)) - hide_logs(verbose) # Hide logs here too, since multiprocessing has some issues with loggers - - builder = BaseBuilder(metadata, automl_hyperparams) - all_primitives = builder.all_primitives - ensemble_pipelines_hash = set() - - def evaluate_pipeline(primitives, origin): - has_repeated_classifiers = check_repeated_classifiers(primitives, all_primitives, ensemble_pipelines_hash) - - if has_repeated_classifiers: - logger.debug('Repeated classifiers detected in ensembles, ignoring pipeline') - return None - - pipeline = builder.make_pipeline(primitives) - score = None - - if pipeline is not None: - alphaautoml_pipeline = score_pipeline(pipeline, X, y, scoring, splitting_strategy, task_name, verbose) - if alphaautoml_pipeline is not None: - score = alphaautoml_pipeline.get_score() - queue.put(alphaautoml_pipeline) # Only send valid pipelines - - return score - - if task_name is None: - task_name = 'NA' - - task_name_id = task_name + '_TASK' - use_automatic_grammar = automl_hyperparams['use_automatic_grammar'] - include_primitives = automl_hyperparams['include_primitives'] - exclude_primitives = automl_hyperparams['exclude_primitives'] - new_primitives = automl_hyperparams['new_primitives'] - grammar = None - - if use_automatic_grammar: - logger.debug('Creating an automatic grammar') - prioritize_primitives = automl_hyperparams['prioritize_primitives'] - target_column = '' - dataset_path = '' - grammar = load_automatic_grammar( - task_name_id, - dataset_path, - target_column, - include_primitives, - exclude_primitives, - prioritize_primitives, +def compute_metafeatures(metric, metadata): + metafeatures = [] + # SCORING METRIC + scoring_type = 0 + if metric in [ + "accuracy_score", + "f1_score", + "precision_score", + "recall_score", + "jaccard_score", + ]: + scoring_type = 1 + elif metric in [ + "max_error", + "mean_absolute_error", + "mean_squared_error", + "mean_squared_log_error", + "median_absolute_error", + "r2_score", + ]: + scoring_type = 2 + elif metric in [ + "adjusted_mutual_info_score", + "rand_score", + "mutual_info_score", + "normalized_mutual_info_score", + ]: + scoring_type = 3 + metafeatures.append(scoring_type) + + # IMPUTE + metafeatures.append(1 if metadata["missing_values"] else 0) + # ENCODE + nonnumeric_columns = metadata["nonnumeric_columns"] + if nonnumeric_columns != {}: + metafeatures.append(1) + # TEXT + metafeatures.append( + len(nonnumeric_columns["TEXT_ENCODER"]) + if "TEXT_ENCODER" in nonnumeric_columns + else 0 ) - - if grammar is None: - logger.debug('Creating a manual grammar') - use_imputer = metadata['missing_values'] - nonnumeric_columns = metadata['nonnumeric_columns'] - grammar = load_manual_grammar( - task_name_id, - nonnumeric_columns, - use_imputer, - new_primitives, - include_primitives, - exclude_primitives, + # CATEGORICAL + 
metafeatures.append( + len(nonnumeric_columns["CATEGORICAL_ENCODER"]) + if "CATEGORICAL_ENCODER" in nonnumeric_columns + else 0 ) - - metric = scoring._score_func.__name__ - config_updated = update_config(task_name, metric, output_folder, grammar) - game = PipelineGame(config_updated, evaluate_pipeline) - nnet = NNetWrapper(game) - - if config['ARGS'].get('load_model'): - model_file = join( - config['ARGS'].get('load_folder_file')[0], - config['ARGS'].get('load_folder_file')[1], + # DATETIME + metafeatures.append( + len(nonnumeric_columns["DATETIME_ENCODER"]) + if "DATETIME_ENCODER" in nonnumeric_columns + else 0 ) - if os.path.isfile(model_file): - nnet.load_checkpoint( - config['ARGS'].get('load_folder_file')[0], - config['ARGS'].get('load_folder_file')[1], - ) - - c = Coach(game, nnet, config['ARGS']) - c.learn() - logger.debug('Search completed') - queue.put('DONE') - - -def update_config(task_name, metric, output_folder, grammar): - config['PROBLEM'] = task_name - config['DATA_TYPE'] = 'TABULAR' - config['METRIC'] = metric - config['DATASET'] = f'DATASET_{task_name}' - config['ARGS']['stepsfile'] = join( - output_folder, f'DATASET_{task_name}_pipeline_steps.txt' - ) - config['ARGS']['checkpoint'] = join(output_folder, 'nn_models') - config['ARGS']['load_folder_file'] = join( - output_folder, 'nn_models', 'best.pth.tar' - ) - config['GRAMMAR'] = grammar - # metafeatures_extractor = ComputeMetafeatures(dataset, targets, features, DBSession) - config['DATASET_METAFEATURES'] = [ - 0 - ] * 50 # metafeatures_extractor.compute_metafeatures('Compute_metafeatures') + # IMAGE + metafeatures.append( + len(nonnumeric_columns["IMAGE_ENCODER"]) + if "IMAGE_ENCODER" in nonnumeric_columns + else 0 + ) + else: + metafeatures.append(0) - return config + return metafeatures \ No newline at end of file diff --git a/alpha_automl/resource/checkpoints/.gitignore b/alpha_automl/resource/checkpoints/.gitignore new file mode 100644 index 00000000..4f074fd0 --- /dev/null +++ b/alpha_automl/resource/checkpoints/.gitignore @@ -0,0 +1,2 @@ +.DS_Store +**/.DS_Store diff --git a/alpha_automl/scorer.py b/alpha_automl/scorer.py index 760adff1..7945527a 100644 --- a/alpha_automl/scorer.py +++ b/alpha_automl/scorer.py @@ -121,8 +121,7 @@ def make_splitter(splitting_strategy, splitting_strategy_kwargs=None): f'instance of BaseCrossValidator, BaseShuffleSplit, RepeatedSplits.') -def score_pipeline(pipeline, X, y, scoring, splitting_strategy, task_name, verbose): - hide_logs(verbose) # Hide logs here too, since multiprocessing has some issues with loggers +def score_pipeline(pipeline, X, y, scoring, splitting_strategy, task_name): score = None start_time = None end_time = None diff --git a/alpha_automl/utils.py b/alpha_automl/utils.py index 73313108..7c147cb1 100644 --- a/alpha_automl/utils.py +++ b/alpha_automl/utils.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import torch +from datetime import datetime from enum import Enum from sklearn.compose import ColumnTransformer from sklearn.preprocessing import LabelEncoder @@ -66,10 +67,10 @@ def sample_dataset(X, y, sample_size, task): if original_size > sample_size: ratio = sample_size / original_size try: - _, X_test, _, y_test = train_test_split(X, y, random_state=RANDOM_SEED, test_size=ratio, stratify=y, shuffle=shuffle) + _, X_test, _, y_test = train_test_split(X, y, random_state=int(datetime.now().microsecond), test_size=ratio, stratify=y, shuffle=shuffle) except Exception: # Not using stratified sampling when the minority class has few instances, not enough 
for all the folds - _, X_test, _, y_test = train_test_split(X, y, random_state=RANDOM_SEED, test_size=ratio, shuffle=shuffle) + _, X_test, _, y_test = train_test_split(X, y, random_state=int(datetime.now().microsecond), test_size=ratio, shuffle=shuffle) logger.debug(f'Sampling down data from {original_size} to {len(X_test)}') if isinstance(X_test, pd.DataFrame): X_test = X_test.reset_index(drop=True) @@ -236,7 +237,7 @@ def hide_logs(level): def setup_output_folder(output_folder): if output_folder is None: output_folder = tempfile.mkdtemp(prefix="alpha_automl", suffix="_log") - logger.debug(f'Created temporary directory: {output_folder}') + logger.info(f'Created temporary directory: {output_folder}') else: os.makedirs(output_folder, exist_ok=True) diff --git a/requirements.txt b/requirements.txt index 96fc1e36..18c41783 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,3 +9,5 @@ feature_engine xgboost lightgbm numpy +typing-extensions==4.5.0 +ray[rllib] \ No newline at end of file diff --git a/scripts/amlb/automl_job.SBATCH b/scripts/amlb/automl_job.SBATCH index d90ee08a..e68d0271 100644 --- a/scripts/amlb/automl_job.SBATCH +++ b/scripts/amlb/automl_job.SBATCH @@ -1,8 +1,9 @@ #!/bin/bash #SBATCH -c 4 -#SBATCH --mem 32GB -#SBATCH --time 01:15:00 +#SBATCH --mem 128GB +#SBATCH --time 02:00:00 #SBATCH --output logs/automl_job_%J.out #SBATCH --mail-user=rl3725@nyu.edu +#SBATCH --no-kill -singularity exec --overlay overlay-50G-10M.ext3:ro /scratch/work/public/singularity/ubuntu-22.04.sif /bin/bash -c "source /ext3/env.sh; python automlbenchmark/runbenchmark.py ${1} ${2} 1h4c -f 0 -u user_config/ -i openml_datasets/ -o results/" +singularity exec --overlay overlay-50G-10M.ext3:ro /scratch/work/public/singularity/ubuntu-22.04.sif /bin/bash -c "source /ext3/env.sh; python automlbenchmark/runbenchmark.py ${1} ${2} 1h4c -f 0 -u user_config/ -i openml_datasets/ -o results/ -s skip" diff --git a/scripts/amlb/run_all_automlbenchmark.sh b/scripts/amlb/run_all_automlbenchmark.sh index 8744ac2a..61d2d133 100644 --- a/scripts/amlb/run_all_automlbenchmark.sh +++ b/scripts/amlb/run_all_automlbenchmark.sh @@ -3,6 +3,8 @@ datasets="openml/t/10101 openml/t/12 openml/t/146195 openml/t/146212 openml/t/146606 openml/t/146818 openml/t/146821 openml/t/146822 openml/t/146825 openml/t/14965 openml/t/167119 openml/t/167120 openml/t/168329 openml/t/168330 openml/t/168331 openml/t/168332 openml/t/168335 openml/t/168337 openml/t/168338 openml/t/168868 openml/t/168908 openml/t/168909 openml/t/168910 openml/t/168911 openml/t/168912 openml/t/189354 openml/t/189355 openml/t/189356 openml/t/3 openml/t/31 openml/t/34539 openml/t/3917 openml/t/3945 openml/t/53 openml/t/7592 openml/t/7593 openml/t/9952 openml/t/9977 openml/t/9981" systems="autosklearn AutoGluon TPOT H2OAutoML AutoWEKA Alpha-AutoML" + +rm -rf logs/* for system in $systems do for dataset in $datasets @@ -10,4 +12,4 @@ do echo "Running ${system} system in ${dataset} dataset" sbatch automl_job.SBATCH $system $dataset done -done \ No newline at end of file +done diff --git a/scripts/amlb/user_config/extensions/Alpha-AutoML/exec.py b/scripts/amlb/user_config/extensions/Alpha-AutoML/exec.py index 153c6f84..5bd9f148 100644 --- a/scripts/amlb/user_config/extensions/Alpha-AutoML/exec.py +++ b/scripts/amlb/user_config/extensions/Alpha-AutoML/exec.py @@ -46,11 +46,12 @@ def run(dataset, config): f'target_name: {target_name}\n' f'time_bound: {time_bound}\n' f'metric: {metric}\n' - f'cores: {cores}' + f'cores: {cores}\n' + f'output_folder: {output_path}\n' ) automl = 
-                              output_folder=output_path, num_cpus=cores, verbose=logging.DEBUG)
+                              output_folder=output_path, verbose=logging.INFO)
 
     train_dataset = pd.read_csv(train_dataset_path)
     test_dataset = pd.read_csv(test_dataset_path)
diff --git a/scripts/amlb/user_config/extensions/Alpha-AutoML/requirements.txt b/scripts/amlb/user_config/extensions/Alpha-AutoML/requirements.txt
index 2fb52117..b710072a 100644
--- a/scripts/amlb/user_config/extensions/Alpha-AutoML/requirements.txt
+++ b/scripts/amlb/user_config/extensions/Alpha-AutoML/requirements.txt
@@ -1 +1 @@
-alpha-automl
\ No newline at end of file
+alpha-automl @ git+https://github.com/VIDA-NYU/alpha-automl@dqn_implementation
diff --git a/scripts/offline_training/requirements.txt b/scripts/offline_training/requirements.txt
new file mode 100644
index 00000000..e4fd828f
--- /dev/null
+++ b/scripts/offline_training/requirements.txt
@@ -0,0 +1 @@
+openml
\ No newline at end of file
diff --git a/scripts/offline_training/results/.gitignore b/scripts/offline_training/results/.gitignore
new file mode 100644
index 00000000..464e52e2
--- /dev/null
+++ b/scripts/offline_training/results/.gitignore
@@ -0,0 +1,5 @@
+automl_job_*
+**/automl_job_*
+
+logs
+results.json
diff --git a/scripts/offline_training/train_all_datasets.sh b/scripts/offline_training/train_all_datasets.sh
new file mode 100644
index 00000000..eeae7f45
--- /dev/null
+++ b/scripts/offline_training/train_all_datasets.sh
@@ -0,0 +1,13 @@
+#!/bin/bash
+
+#datasets="10101 12 146195 146212 146606 146818 146821 146822 146825 14965 167119 167120 168329 168330 168331 168332 168335 168337 168338 168868 168908 168909 168910 168911 168912 189354 189355 189356 3 31 34539 3917 3945 53 7592 7593 9952 9977 9981"
+# datasets="146825 168329 168330 168331 168332"
+datasets="168910"
+
+
+rm -rf tmp/logs/*
+for dataset in $datasets
+do
+    echo "Training Alpha-AutoML for ${dataset} dataset"
+    sbatch --output results/logs/automl_job_${dataset}.out trainer_job.SBATCH $dataset
+done
diff --git a/scripts/offline_training/trainer.py b/scripts/offline_training/trainer.py
new file mode 100644
index 00000000..7e38396c
--- /dev/null
+++ b/scripts/offline_training/trainer.py
@@ -0,0 +1,98 @@
+import argparse
+import openml
+import pandas as pd
+from os.path import dirname, join
+from alpha_automl import AutoMLClassifier
+
+
+if __name__ == "__main__":
+    # On Windows or in a CUDA environment, Alpha-AutoML should be used inside an "if __name__ == '__main__':" block
+
+    # argparser
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument(
+        "-s",
+        "--save-dir",
+        default=None,
+        help="The directory for loading and saving the PPO model checkpoints.",
+    )
+    parser.add_argument(
+        "-o",
+        "--output-dir",
+        default=None,
+        help="The output directory for saving results.json and other caching files.",
+    )
+    parser.add_argument(
+        "-d",
+        "--dataset-dir",
+        default=None,
+        help="The local dataset directory; it should contain train_data.csv and test_data.csv.",
+    )
+    parser.add_argument(
+        "-c",
+        "--target-column",
+        default="class",
+        help="The target column name for local dataset CSV files ('class' by default).",
+    )
+
+    parser.add_argument(
+        "-t",
+        "--task",
+        metavar="task_id",
+        type=int,
+        default=None,
+        help="The OpenML task id to train on."
+        "\nIgnored when --dataset-dir is provided;"
+        "\neither --task or --dataset-dir must be given.",
+    )
+    parser.add_argument(
+        "-T",
+        "--time-bound",
+        type=int,
+        default=1,
+        help="The time bound for the Alpha-AutoML search, in minutes (1 minute by default).",
+    )
+
+    args = parser.parse_args()
+
+    # Read the datasets
+    if args.dataset_dir:
+        train_dataset = pd.read_csv(
+            join(dirname(__file__), args.dataset_dir, "train_data.csv")
+        )
+        test_dataset = pd.read_csv(
+            join(dirname(__file__), args.dataset_dir, "test_data.csv")
+        )
+        target_column = args.target_column
+        X_train = train_dataset.drop(columns=[target_column])
+        y_train = train_dataset[[target_column]]
+        X_test = test_dataset.drop(columns=[target_column])
+        y_test = test_dataset[[target_column]]
+    elif args.task:
+        task = openml.tasks.get_task(args.task, download_qualities=False)
+        X, y = task.get_X_and_y(dataset_format="dataframe")
+        train_indices, test_indices = task.get_train_test_split_indices(
+            repeat=0,
+            fold=0,
+            sample=0,
+        )
+        X_train = X.iloc[train_indices]
+        y_train = y.iloc[train_indices]
+        X_test = X.iloc[test_indices]
+        y_test = y.iloc[test_indices]
+
+    # Add settings
+    automl = AutoMLClassifier(
+        time_bound=args.time_bound,
+        output_folder=args.output_dir,
+        checkpoints_folder=args.save_dir,
+    )
+
+    # Perform the search
+    automl.fit(X_train, y_train)
+
+    # Plot leaderboard
+    automl.plot_leaderboard(use_print=True)
+
+    # Evaluate best model
+    automl.score(X_test, y_test)
diff --git a/scripts/offline_training/trainer_job.SBATCH b/scripts/offline_training/trainer_job.SBATCH
new file mode 100644
index 00000000..79c4524d
--- /dev/null
+++ b/scripts/offline_training/trainer_job.SBATCH
@@ -0,0 +1,7 @@
+#!/bin/bash
+#SBATCH -c 4
+#SBATCH --mem 128GB
+#SBATCH --time 12:00:00
+#SBATCH --mail-user=yfw215@nyu.edu
+
+singularity exec --overlay overlay-50G-10M.ext3:ro /scratch/work/public/singularity/ubuntu-20.04.4.sif /bin/bash -c "source /ext3/env.sh; python trainer.py -o results -t ${1} -T 5"
\ No newline at end of file