Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: Pipeline ranking #92

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions alpha_automl/automl_api.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
setup_output_folder, SemiSupervisedSplitter, SemiSupervisedLabelEncoder, write_pipeline_code_as_pyfile
from alpha_automl.visualization import plot_comparison_pipelines
from alpha_automl.pipeline_serializer import PipelineSerializer
from alpha_automl.primitive_loader import record_primitive_performance

logging.basicConfig(level=logging.DEBUG, stream=sys.stdout)
logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -66,7 +67,7 @@ def __init__(self, time_bound=15, metric=None, split_strategy='holdout', time_bo
self.label_encoder = None
self.task_type = task

def fit(self, X, y):
def fit(self, X, y, record_performance=False):
"""
Search for pipelines and fit the best pipeline.

Expand Down Expand Up @@ -98,6 +99,9 @@ def fit(self, X, y):
logger.info(f'Found {len(pipelines)} pipelines')
sign = get_sign_sorting(self.scorer._score_func, self.score_sorting)
sorted_pipelines = sorted(pipelines, key=lambda x: x.get_score() * sign, reverse=True)

if record_performance:
record_primitive_performance(sorted_pipelines)

leaderboard_data = []
for index, pipeline in enumerate(sorted_pipelines, start=1):
Expand Down Expand Up @@ -333,9 +337,9 @@ def __init__(self, time_bound=15, metric='accuracy_score', split_strategy='holdo

self.label_encoder = LabelEncoder()

def fit(self, X, y):
def fit(self, X, y, record_performance=False):
y = self.label_encoder.fit_transform(y)
super().fit(X, y)
super().fit(X, y, record_performance=record_performance)

def predict(self, X):
predictions = super().predict(X)
Expand Down Expand Up @@ -451,9 +455,9 @@ def _column_parser(self, X):
y = X[[self.target_column]]
return X, y

def fit(self, X, y=None):
def fit(self, X, y=None, record_performance=False):
X, y = self._column_parser(X)
super().fit(X, y)
super().fit(X, y, record_performance=record_performance)


class AutoMLSemiSupervisedClassifier(ClassifierBaseAutoML):
Expand Down
5 changes: 3 additions & 2 deletions alpha_automl/grammar_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
import itertools
from os.path import join, dirname
from nltk.grammar import Production, Nonterminal, CFG, is_terminal, is_nonterminal
from alpha_automl.primitive_loader import load_primitives_hierarchy
from alpha_automl.primitive_loader import load_primitives_hierarchy, load_ranked_primitives_hierarchy
# from alphad3m_sklearn.metalearning.grammar_builder import create_metalearningdb_grammar

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -128,7 +128,8 @@ def modify_manual_grammar(encoders, use_imputer):


def load_manual_grammar(task, encoders, use_imputer, new_primitives, include_primitives, exclude_primitives):
primitives = load_primitives_hierarchy()
# primitives = load_primitives_hierarchy()
primitives = load_ranked_primitives_hierarchy()

for primitive_name in new_primitives.keys():
primitive_type = new_primitives[primitive_name]['primitive_type']
Expand Down
54 changes: 54 additions & 0 deletions alpha_automl/primitive_loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,60 @@ def load_primitives_types():
return primitive_types


# Path of the JSON file with the per-primitive ranking statistics. It lives in the
# `resource` directory next to this module; this module both reads it and rewrites it.
PRIMITIVES_RANKING_PATH = join(dirname(__file__), 'resource/primitives_ranking.json')


def load_primitives_ranking():
    """
    Read the primitive ranking stored in `PRIMITIVES_RANKING_PATH`.

    :return: The parsed JSON content of the ranking file.
    """
    with open(PRIMITIVES_RANKING_PATH) as ranking_file:
        ranking = json.load(ranking_file)

    logger.debug('Ranking of all primitives loaded')
    return ranking


def record_primitive_performance(pipelines):
    """
    Fold the scores of the given pipelines into the stored primitive ranking.

    The ranking is loaded with `load_primitives_ranking()`, updated for every pipeline step whose
    name is a key of `PRIMITIVE_TYPES`, and finally written back to `PRIMITIVES_RANKING_PATH`.

    The contribution of a pipeline is its score multiplied by `1 / position`, where `position` is
    the 1-based index of the pipeline in `pipelines`, so earlier pipelines weigh more than later ones.

    NOTE(review): the score is used exactly as returned by `get_score()`; presumably `pipelines` is
    sorted best-first and a higher score is better -- confirm for metrics where lower is better.

    :param pipelines: Sequence of objects exposing `get_pipeline()` (whose result has a `steps`
        attribute made of `(name, step)` pairs) and `get_score()`.
    :return: The updated ranking, shaped as
        `{primitive_type: {primitive_name: {"avg_score": number, "runs": int}}}`.
    """
    ranking_by_type = load_primitives_ranking()

    for position, candidate in enumerate(pipelines, start=1):
        pipeline_steps = candidate.get_pipeline().steps
        pipeline_score = candidate.get_score()
        rank_weight = 1 / position

        for primitive_name, _ in pipeline_steps:
            # Steps that are not known primitives do not take part in the ranking
            if primitive_name in PRIMITIVE_TYPES:
                primitive_type = PRIMITIVE_TYPES[primitive_name]
                type_ranking = ranking_by_type.setdefault(primitive_type, {})
                previous = type_ranking.setdefault(primitive_name, {"avg_score": 0, "runs": 0})

                previous_runs = previous["runs"]
                total_runs = previous_runs + 1
                weighted_total = previous["avg_score"] * previous_runs + pipeline_score * rank_weight

                type_ranking[primitive_name] = {"avg_score": weighted_total / total_runs, "runs": total_runs}

    with open(PRIMITIVES_RANKING_PATH, "w") as ranking_file:
        json.dump(ranking_by_type, ranking_file)

    return ranking_by_type


def load_ranked_primitives_hierarchy():
    """
    Load the primitives hierarchy with the primitives of each type ordered by their ranking.

    For every primitive type found in the ranking, the primitives of that type in the hierarchy are
    sorted by descending `avg_score`. Primitives of the hierarchy that have no ranking entry are
    kept, placed after the ranked ones in their original order, so a missing ranking entry never
    removes a primitive from the hierarchy. Types that appear only in the ranking are ignored, and
    types that appear only in the hierarchy are returned untouched.

    :return: The hierarchy returned by `load_primitives_hierarchy()`, with the reordered lists.
    """
    ranking = load_primitives_ranking()
    hierarchy = load_primitives_hierarchy()

    for primitive_type, scores in ranking.items():
        if primitive_type not in hierarchy:
            # The ranking file can mention types the hierarchy does not define; nothing to reorder
            continue

        available = hierarchy[primitive_type]
        # `sorted` is stable, so primitives with the same score keep the order of the ranking file
        by_score = sorted(scores, key=lambda name: scores[name]["avg_score"], reverse=True)
        ranked = [name for name in by_score if name in available]
        # Primitives without a ranking entry stay available instead of being dropped
        unranked = [name for name in available if name not in scores]
        hierarchy[primitive_type] = ranked + unranked

    return hierarchy


# Lookup from primitive name to primitive type, built once when this module is imported.
PRIMITIVE_TYPES = load_primitives_types()

if __name__ == '__main__':
Expand Down
1 change: 1 addition & 0 deletions alpha_automl/resource/primitives_ranking.json
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
{"CATEGORICAL_ENCODER": {"sklearn.preprocessing.OneHotEncoder": {"avg_score": 0, "runs": 0}}, "CLASSIFIER": {"sklearn.discriminant_analysis.LinearDiscriminantAnalysis": {"avg_score": 0, "runs": 0}, "sklearn.discriminant_analysis.QuadraticDiscriminantAnalysis": {"avg_score": 0, "runs": 0}, "sklearn.ensemble.BaggingClassifier": {"avg_score": 0, "runs": 0}, "sklearn.ensemble.ExtraTreesClassifier": {"avg_score": 0, "runs": 0}, "sklearn.ensemble.GradientBoostingClassifier": {"avg_score": 0, "runs": 0}, "sklearn.ensemble.RandomForestClassifier": {"avg_score": 0, "runs": 0}, "sklearn.naive_bayes.BernoulliNB": {"avg_score": 0, "runs": 0}, "sklearn.naive_bayes.GaussianNB": {"avg_score": 0, "runs": 0}, "sklearn.naive_bayes.MultinomialNB": {"avg_score": 0, "runs": 0}, "sklearn.neighbors.KNeighborsClassifier": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.LogisticRegression": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.PassiveAggressiveClassifier": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.SGDClassifier": {"avg_score": 0, "runs": 0}, "sklearn.svm.LinearSVC": {"avg_score": 0, "runs": 0}, "sklearn.svm.SVC": {"avg_score": 0, "runs": 0}, "sklearn.tree.DecisionTreeClassifier": {"avg_score": 0, "runs": 0}, "xgboost.XGBClassifier": {"avg_score": 0, "runs": 0}, "lightgbm.LGBMClassifier": {"avg_score": 0, "runs": 0}}, "CLUSTERER": {"sklearn.cluster.KMeans": {"avg_score": 0, "runs": 0}, "sklearn.cluster.AgglomerativeClustering": {"avg_score": 0, "runs": 0}}, "DATETIME_ENCODER": {"sklearn.preprocessing.OrdinalEncoder": {"avg_score": 0, "runs": 0}, "alpha_automl.builtin_primitives.datetime_encoder.CyclicalFeature": {"avg_score": 0, "runs": 0}, "alpha_automl.builtin_primitives.datetime_encoder.Datetime64ExpandEncoder": {"avg_score": 0, "runs": 0}, "alpha_automl.builtin_primitives.datetime_encoder.DummyEncoder": {"avg_score": 0, "runs": 0}}, "FEATURE_SCALER": {"sklearn.preprocessing.MaxAbsScaler": {"avg_score": 0, "runs": 0}, "sklearn.preprocessing.RobustScaler": 
{"avg_score": 0, "runs": 0}, "sklearn.preprocessing.StandardScaler": {"avg_score": 0, "runs": 0}}, "FEATURE_SELECTOR": {"sklearn.feature_selection.GenericUnivariateSelect": {"avg_score": 0, "runs": 0}, "sklearn.feature_selection.SelectPercentile": {"avg_score": 0, "runs": 0}, "sklearn.feature_selection.SelectKBest": {"avg_score": 0, "runs": 0}}, "IMPUTER": {"sklearn.impute.SimpleImputer": {"avg_score": 0, "runs": 0}}, "REGRESSOR": {"sklearn.linear_model.ARDRegression": {"avg_score": 0, "runs": 0}, "sklearn.tree.DecisionTreeRegressor": {"avg_score": 0, "runs": 0}, "sklearn.ensemble.ExtraTreesRegressor": {"avg_score": 0, "runs": 0}, "sklearn.gaussian_process.GaussianProcessRegressor": {"avg_score": 0, "runs": 0}, "sklearn.ensemble.GradientBoostingRegressor": {"avg_score": 0, "runs": 0}, "sklearn.neighbors.KNeighborsRegressor": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.Lars": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.Lasso": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.LassoCV": {"avg_score": 0, "runs": 0}, "sklearn.svm.LinearSVR": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.PassiveAggressiveRegressor": {"avg_score": 0, "runs": 0}, "sklearn.ensemble.RandomForestRegressor": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.Ridge": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.SGDRegressor": {"avg_score": 0, "runs": 0}, "sklearn.svm.SVR": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.BayesianRidge": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.ElasticNet": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.HuberRegressor": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.LinearRegression": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.RANSACRegressor": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.RidgeCV": {"avg_score": 0, "runs": 0}, "sklearn.linear_model.TheilSenRegressor": {"avg_score": 0, "runs": 0}, "xgboost.XGBRegressor": {"avg_score": 0, "runs": 0}, "lightgbm.LGBMRegressor": {"avg_score": 0, "runs": 
0}}, "TEXT_ENCODER": {"sklearn.feature_extraction.text.CountVectorizer": {"avg_score": 0, "runs": 0}, "sklearn.feature_extraction.text.TfidfVectorizer": {"avg_score": 0, "runs": 0}}, "IMAGE_ENCODER": {"alpha_automl.builtin_primitives.image_encoder.RGB2GrayTransformer": {"avg_score": 0, "runs": 0}, "alpha_automl.builtin_primitives.image_encoder.HogTransformer": {"avg_score": 0, "runs": 0}}, "COLUMN_TRANSFORMER": {"sklearn.compose.ColumnTransformer": {"avg_score": 0, "runs": 0}}, "TIME_SERIES_FORECAST": {"alpha_automl.builtin_primitives.time_series_forecasting.ArimaEstimator": {"avg_score": 0, "runs": 0}, "alpha_automl.builtin_primitives.time_series_forecasting.DeeparEstimator": {"avg_score": 0, "runs": 0}, "alpha_automl.builtin_primitives.time_series_forecasting.NBEATSEstimator": {"avg_score": 0, "runs": 0}, "alpha_automl.builtin_primitives.time_series_forecasting.NHITSEstimator": {"avg_score": 0, "runs": 0}}, "SEMISUPERVISED_CLASSIFIER": {"alpha_automl.builtin_primitives.semisupervised_classifier.AutonBox": {"avg_score": 0, "runs": 0}, "sklearn.semi_supervised.SelfTrainingClassifier": {"avg_score": 0, "runs": 0}}, "LABELPROPAGATION_CLASSIFIER": {"alpha_automl.builtin_primitives.semisupervised_classifier.SkLabelSpreading": {"avg_score": 0, "runs": 0}, "alpha_automl.builtin_primitives.semisupervised_classifier.SkLabelPropagation": {"avg_score": 0, "runs": 0}}}
Loading