Commit: Update config and docs
ThomasMeissnerDS committed Aug 4, 2024
1 parent 4228f53 commit 06dcca3
Showing 7 changed files with 56 additions and 57 deletions.
3 changes: 3 additions & 0 deletions bluecast/config/training_config.py
@@ -28,6 +28,8 @@ class TrainingConfig(BaseModel):
         custom ML model is passed.
     :param hypertuning_cv_folds: Number of cross-validation folds to use for hyperparameter tuning. Not used when
         custom ML model is passed.
+    :param hypertuning_cv_repeats: Number of repetitions for each cross-validation fold during hyperparameter
+        tuning. Not used when custom ML model is passed.
     :param sample_data_during_tuning: Whether to sample the data during tuning. Not used when custom ML model is passed.
     :param sample_data_during_tuning_alpha: Alpha value for sampling the data during tuning. The higher alpha the
         fewer samples will be left. Not used when custom ML model is passed.
@@ -84,6 +86,7 @@ class TrainingConfig(BaseModel):
     hyperparameter_tuning_rounds: int = 200
     hyperparameter_tuning_max_runtime_secs: int = 3600
     hypertuning_cv_folds: int = 5
+    hypertuning_cv_repeats: int = 1
     sample_data_during_tuning: bool = False
     sample_data_during_tuning_alpha: float = 2.0
     precise_cv_tuning: bool = False
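For orientation, a minimal sketch of how the new field could be set, using plain attribute assignment on the Pydantic config object (the same style the pipeline script further down uses):

```python
from bluecast.config.training_config import TrainingConfig

train_config = TrainingConfig()
train_config.hypertuning_cv_folds = 5    # folds per cross-validation round
train_config.hypertuning_cv_repeats = 2  # repeat the 5-fold CV twice -> 10 splits per trial
```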
18 changes: 9 additions & 9 deletions bluecast/ml_modelling/xgboost.py
@@ -13,7 +13,7 @@
 import optuna
 import pandas as pd
 import xgboost as xgb
-from sklearn.model_selection import StratifiedKFold
+from sklearn.model_selection import RepeatedStratifiedKFold
 from sklearn.utils import class_weight

 from bluecast.config.training_config import (
@@ -345,10 +345,10 @@ def objective(trial):

                 return self._fine_tune_precise(params, x_train, y_train, x_test, y_test)
             else:
-                skf = StratifiedKFold(
-                    n_splits=5,
+                skf = RepeatedStratifiedKFold(
+                    n_splits=self.conf_training.hypertuning_cv_folds,
+                    n_repeats=self.conf_training.hypertuning_cv_repeats,
                     random_state=self.conf_training.global_random_state,
-                    shuffle=self.conf_training.shuffle_during_training,
                 )
                 folds = []
                 for train_index, test_index in skf.split(x_train, y_train.tolist()):
@@ -549,9 +549,9 @@ def _fine_tune_precise(
                 "Could not find Training config. Falling back to default values"
             )

-        stratifier = StratifiedKFold(
+        stratifier = RepeatedStratifiedKFold(
             n_splits=self.conf_training.hypertuning_cv_folds,
-            shuffle=self.conf_training.shuffle_during_training,
+            n_repeats=self.conf_training.hypertuning_cv_repeats,
             random_state=self.conf_training.global_random_state,
         )

@@ -742,10 +742,10 @@ def objective(trial):
                     y_test,
                 )
             else:
-                skf = StratifiedKFold(
-                    n_splits=5,
+                skf = RepeatedStratifiedKFold(
+                    n_splits=self.conf_training.hypertuning_cv_folds,
+                    n_repeats=self.conf_training.hypertuning_cv_repeats,
                     random_state=self.conf_training.global_random_state,
-                    shuffle=self.conf_training.shuffle_during_training,
                 )
                 folds = []
                 for train_index, test_index in skf.split(x_train, y_train.astype(int)):
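The fold-generation pattern these hunks switch to, as a standalone sketch (illustrative names, not BlueCast's internals). Note that `RepeatedStratifiedKFold` takes no `shuffle` argument, since each repetition is shuffled with a different randomization anyway, which is why the diff drops that keyword:

```python
import numpy as np
from sklearn.model_selection import RepeatedStratifiedKFold

X = np.random.rand(100, 4)
y = np.array([0, 1] * 50)

# n_splits=5 with n_repeats=2 yields 5 * 2 = 10 (train_index, test_index) pairs.
skf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
folds = list(skf.split(X, y))
print(len(folds))  # 10
```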
14 changes: 7 additions & 7 deletions bluecast/ml_modelling/xgboost_regression.py
@@ -20,7 +20,7 @@
 except ImportError:
     from sklearn.metrics import mean_squared_error

-from sklearn.model_selection import KFold, StratifiedKFold
+from sklearn.model_selection import KFold, RepeatedStratifiedKFold
 from sklearn.preprocessing import LabelEncoder

 from bluecast.config.training_config import (
@@ -347,10 +347,10 @@ def objective(trial):
                 # make the regression cv strategy stratified
                 le = LabelEncoder()
                 y_binned = le.fit_transform(pd.qcut(y_train, 10, duplicates="drop"))
-                skf = StratifiedKFold(
-                    n_splits=5,
+                skf = RepeatedStratifiedKFold(
+                    n_splits=self.conf_training.hypertuning_cv_folds,
+                    n_repeats=self.conf_training.hypertuning_cv_repeats,
                     random_state=self.conf_training.global_random_state,
-                    shuffle=self.conf_training.shuffle_during_training,
                 )
                 folds = []
                 for train_index, test_index in skf.split(x_train, y_binned):
@@ -724,10 +724,10 @@ def objective(trial):
                 # make the regression cv strategy stratified
                 le = LabelEncoder()
                 y_binned = le.fit_transform(pd.qcut(y_train, 10, duplicates="drop"))
-                skf = StratifiedKFold(
-                    n_splits=5,
+                skf = RepeatedStratifiedKFold(
+                    n_splits=self.conf_training.hypertuning_cv_folds,
+                    n_repeats=self.conf_training.hypertuning_cv_repeats,
                     random_state=self.conf_training.global_random_state,
-                    shuffle=self.conf_training.shuffle_during_training,
                 )
                 folds = []
                 for train_index, test_index in skf.split(x_train, y_binned):
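For the regression variant, the continuous target is binned so that a stratified splitter can balance the folds. A self-contained sketch of that binning idea (synthetic data; variable names chosen for illustration):

```python
import numpy as np
import pandas as pd
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import LabelEncoder

rng = np.random.default_rng(0)
x_train = pd.DataFrame(rng.normal(size=(200, 3)))
y_train = pd.Series(rng.normal(size=200))

# Bin the target into up to 10 quantiles (duplicate bin edges are dropped),
# then LabelEncoder turns the interval bins into integer class labels.
le = LabelEncoder()
y_binned = le.fit_transform(pd.qcut(y_train, 10, duplicates="drop"))

skf = RepeatedStratifiedKFold(n_splits=5, n_repeats=2, random_state=42)
for train_index, test_index in skf.split(x_train, y_binned):
    pass  # each fold now roughly preserves the target distribution
```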
Binary file modified dist/bluecast-1.5.1-py3-none-any.whl
Binary file modified dist/bluecast-1.5.1.tar.gz
7 changes: 7 additions & 0 deletions docs/source/Customize training settings.md
@@ -59,6 +59,13 @@ can test many more hyperparameters than usual cross validation.
 For regression problems, XGBoost's inbuilt cross-validation routine is also used;
 however, BlueCast uses stratification to ensure that the folds are balanced.

+## Repeated cross-validation
+
+BlueCast also supports repeated cross-validation. Enable it by setting
+`hypertuning_cv_repeats` to a value greater than 1. The cross-validation
+routine is then repeated that many times with different fold splits, and the
+average performance across all runs is returned.
+
 ## Enable even more overfitting-robust cross-validation

 There might be situations where a preprocessing step has a high risk of overfitting
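Taken together with the config change, enabling the new option might look like the following hedged sketch (`class_problem` value and the training dataframe are placeholders; the `BlueCast` constructor arguments mirror the pipeline script below):

```python
from bluecast.blueprints.cast import BlueCast
from bluecast.config.training_config import TrainingConfig

train_config = TrainingConfig()
train_config.hypertuning_cv_folds = 5
train_config.hypertuning_cv_repeats = 3  # 5 folds x 3 repeats = 15 splits per trial

automl = BlueCast(class_problem="binary", conf_training=train_config)
# automl.fit_eval(train_df, target_col="target")  # train_df is a placeholder dataframe
```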
@@ -1,34 +1,12 @@
-from bluecast.blueprints.cast import BlueCast
-from bluecast.blueprints.cast_cv import BlueCastCV
-from bluecast.blueprints.cast_regression import BlueCastRegression
-from bluecast.blueprints.cast_cv_regression import BlueCastCVRegression
-from bluecast.config.training_config import TrainingConfig, XgboostTuneParamsConfig, XgboostTuneParamsRegressionConfig
-from bluecast.experimentation.tracking import ExperimentTracker
-from bluecast.general_utils.general_utils import save_to_production, load_for_production
-from bluecast.preprocessing.feature_creation import AddRowLevelAggFeatures
-from bluecast.preprocessing.feature_types import FeatureTypeDetector
-from bluecast.evaluation.eval_metrics import ClassificationEvalWrapper
-from bluecast.monitoring.data_monitoring import DataDrift
-from bluecast.preprocessing.feature_types import FeatureTypeDetector
-
-import polars as pl
-
-import numpy as np, pandas as pd
-import os
-import json
-from sklearn.metrics import log_loss, mean_squared_error, roc_auc_score
-
-import gc
-import re
-from tqdm import tqdm
-
 import matplotlib
-matplotlib.use('Agg')

 import pandas as pd
 import plotly.io as pio
-import plotly.graph_objects as go
-
-pio.renderers.default = 'svg'
+from bluecast.blueprints.cast import BlueCast
+from bluecast.config.training_config import TrainingConfig
+
+matplotlib.use("Agg")
+pio.renderers.default = "svg"
+


 def competition_pipeline():
@@ -40,7 +18,7 @@ def competition_pipeline():

     folder = "/home/thomas/Schreibtisch/Data Science/Preprocessing lib test/automl_competition"
     train = pd.read_csv(f"{folder}/train.csv")
-    #train_original = pd.read_csv("original.csv")
+    # train_original = pd.read_csv("original.csv")
     test = pd.read_csv(f"{folder}/test.csv")
     submission = pd.read_csv(f"{folder}/sample_submission.csv")

@@ -54,40 +32,51 @@ def competition_pipeline():
         train_config.autotune_model = False
         train_config.calculate_shap_values = False
     else:
-        train_config.hypertuning_cv_folds = 10
-        train_config.hyperparameter_tuning_rounds = 25
+        train_config.hypertuning_cv_folds = 5
+        train_config.hypertuning_cv_repeats = 3
+        train_config.hyperparameter_tuning_rounds = 100
         train_config.hyperparameter_tuning_max_runtime_secs = 60 * 60 * 5
         train_config.enable_grid_search_fine_tuning = False
         train_config.calculate_shap_values = False
         train_config.show_detailed_tuning_logs = True
-        train_config.train_size = 0.9
-        # train_config.sample_data_during_tuning_alpha = True
-        train_config.bluecast_cv_train_n_model = (5, 2)
-        train_config.infrequent_categories_threshold = 10
-        # train_config.cat_encoding_via_ml_algorithm = True
+        train_config.train_size = 0.85
+        # train_config.infrequent_categories_threshold = 10
+        train_config.cat_encoding_via_ml_algorithm = True
         train_config.out_of_fold_dataset_store_path = "/home/thomas/Schreibtisch/Data Science/Preprocessing lib test/automl_competition/"

-
-    automl = BlueCastCV(
+    automl = BlueCast(
         class_problem=class_problem,
         conf_training=train_config,
         # single_fold_eval_metric_func=cew
     )

     automl.conf_xgboost.max_bin_max = 2500

-    automl.fit_eval(train, target_col=target)
+    train = train.sample(frac=1.0, random_state=500)
+    df_unseen = train.sample(frac=0.1, random_state=500)
+    df_unseed_target = df_unseen.pop(target)
+    train = train.drop(df_unseen.index)
+
+    automl.fit_eval(train, df_eval=df_unseen, target_eval=df_unseed_target, target_col=target)

     y_probs, y_classes = automl.predict(test)

     reverse_mapping = {
-        value: key for key, value in automl.bluecast_models[0].target_label_encoder.target_label_mapping.items()
+        value: key
+        for key, value in automl.bluecast_models[
+            0
+        ].target_label_encoder.target_label_mapping.items()
     }

     submission[target] = y_classes.astype(int)

-    submission = submission.replace(reverse_mapping).copy().to_csv('automl_grandprix_bluecast_xgboost_10fold_submission.csv', index=False)
+    submission = (
+        submission.replace(reverse_mapping)
+        .copy()
+        .to_csv("automl_grandprix_bluecast_xgboost_10fold_submission.csv", index=False)
+    )
     print(submission)


 if __name__ == "__main__":
     competition_pipeline()