Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

default split for train_pipeline and scoring #446

Merged
merged 8 commits into from
Jul 11, 2023
7 changes: 6 additions & 1 deletion openstef/model/objective.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,12 @@ def __call__(
"stratification_min_max": self.model_type != MLModelType.ProLoaf,
"back_test": True,
}
(self.train_data, self.validation_data, self.test_data,) = self.split_func(
(
self.train_data,
self.validation_data,
self.test_data,
self.operational_score_data,
) = self.split_func(
self.input_data,
test_fraction=self.test_fraction,
validation_fraction=self.validation_fraction,
Expand Down
23 changes: 14 additions & 9 deletions openstef/model_selection/model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def split_data_train_validation_test(
validation dataset. In an operational setting the following sequence is
returned (when using stratification):

Test >> Train >> Validation
Train >> Validation (the test set is the combination of the train and validation sets.)

For a back test (indicated with argument "back_test") the following sequence
is returned:
Expand Down Expand Up @@ -141,6 +141,7 @@ def split_data_train_validation_test(
- Test data.

"""
test_fraction = test_fraction if back_test else 0
JanMaartenvanDoorn marked this conversation as resolved.
Show resolved Hide resolved
train_fraction = 1 - (test_fraction + validation_fraction)
if train_fraction < 0:
raise ValueError(
Expand Down Expand Up @@ -172,10 +173,18 @@ def split_data_train_validation_test(
start_date_test = end_date - np.round(number_indices * test_fraction) * delta
test_data = data_[start_date_test:]
train_val_data = data_[:start_date_test]
operational_score_data = (
pd.DataFrame()
) # Empty because a backtest is not an operational setting.
else:
start_date_val = start_date + np.round(number_indices * test_fraction) * delta
test_data = data_[:start_date_val]
test_data = data_[
:start_date_val
] # Empty as all data is used for training in an operational setting.
train_val_data = data_[start_date_val:]
operational_score_data = data_.copy(deep=True).reset_index(
drop=True
) # Used to check whether a newly trained operational model is better than the old one.

if stratification_min_max and (
len(set(train_val_data.index.date)) >= min_days_for_stratification
Expand Down Expand Up @@ -248,11 +257,7 @@ def split_data_train_validation_test(
validation_data = validation_data.sort_index()
test_data = test_data.sort_index()

return (
train_data,
validation_data,
test_data,
)
return (train_data, validation_data, test_data, operational_score_data)


def backtest_split_default(
Expand Down Expand Up @@ -286,14 +291,14 @@ def backtest_split_default(
for ifold in range(n_folds):
test_data = data[data["random_fold"] == ifold].sort_index()

(train_data, validation_data, _,) = split_data_train_validation_test(
(train_data, validation_data, _, _) = split_data_train_validation_test(
data[data["random_fold"] != ifold].iloc[:, :-2],
test_fraction=0,
back_test=True,
stratification_min_max=stratification_min_max,
)

yield train_data, validation_data, test_data.iloc[:, :-2]
yield train_data, validation_data, test_data.iloc[:, :-2], pd.DataFrame()
else:
yield split_data_train_validation_test(
data,
Expand Down
8 changes: 6 additions & 2 deletions openstef/pipeline/train_create_forecast_backtest.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,14 @@ def train_model_and_forecast_back_test(
) = zip(
*(
train_model_and_forecast_test_core(
pj, modelspecs, train_data, validation_data, test_data
pj,
modelspecs,
train_data,
validation_data,
test_data,
)
+ (train_data, validation_data, test_data)
for train_data, validation_data, test_data in backtest_split_func(
for train_data, validation_data, test_data, _ in backtest_split_func(
data_with_features, n_folds, **backtest_split_args
)
)
Expand Down
26 changes: 19 additions & 7 deletions openstef/pipeline/train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,14 @@ def train_model_pipeline_core(
logger = structlog.get_logger(__name__)

# Call common pipeline
model, report, train_data, validation_data, test_data = train_pipeline_common(
(
model,
report,
train_data,
validation_data,
test_data,
operational_score_data,
) = train_pipeline_common(
pj,
model_specs,
input_data,
Expand All @@ -192,8 +199,8 @@ def train_model_pipeline_core(
combined = combined.iloc[:, :-1]

x_data, y_data = (
combined.iloc[:, 1:-1],
combined.iloc[:, 0],
operational_score_data.iloc[:, 1:-1],
operational_score_data.iloc[:, 0],
)

# Score method always returns R^2
Expand Down Expand Up @@ -260,7 +267,12 @@ def train_pipeline_common(
horizons=horizons,
)

train_data, validation_data, test_data = train_pipeline_step_split_data(
(
train_data,
validation_data,
test_data,
operational_score_data,
) = train_pipeline_step_split_data(
data_with_features=data_with_features,
pj=pj,
test_fraction=test_fraction,
Expand All @@ -284,7 +296,7 @@ def train_pipeline_common(
validation_data["forecast"] = model.predict(validation_data.iloc[:, 1:-1])
test_data["forecast"] = model.predict(test_data.iloc[:, 1:-1])

return model, report, train_data, validation_data, test_data
return model, report, train_data, validation_data, test_data, operational_score_data


def train_pipeline_step_load_model(
Expand Down Expand Up @@ -515,12 +527,12 @@ def train_pipeline_step_split_data(
required_arguments=["data", "test_fraction"]
)

train_data, validation_data, test_data = split_func(
train_data, validation_data, test_data, operational_score_data = split_func(
data_with_features, test_fraction, **split_args
)

# if test_data is predefined, use this over the returned test_data of split function
if not test_data_predefined.empty:
test_data = test_data_predefined

return train_data, validation_data, test_data
return train_data, validation_data, test_data, operational_score_data
1 change: 1 addition & 0 deletions test/component/test_component.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@ def test_component_training_prediction_happyflow(self):
train_data,
validation_data,
test_data,
operational_score_data,
) = train_pipeline_common(
self.pj, self.model_specs, self.input_data, [0.25, 47.0]
)
Expand Down
16 changes: 10 additions & 6 deletions test/unit/model/test_model_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,7 @@ def test_split_data_train_validation(self):
train_set,
valid_set,
test_set,
operational_score_data,
) = model_selection.split_data_train_validation_test(
data,
test_fraction=SPLIT_PARAMS["test_fraction"],
Expand All @@ -79,18 +80,16 @@ def test_split_data_train_validation(self):
)

# delta = 1, number of the peaks the two amounts may differ for the train and validation data
# delta = 4, when looking at the test data, can differ 1 hr (4x15min)

self.assertAlmostEqual(
len(valid_set),
len(data) * SPLIT_PARAMS["validation_fraction"],
delta=2 * 96,
) # two days is allowed

self.assertAlmostEqual(
len(test_set),
len(data.index) * SPLIT_PARAMS["test_fraction"],
delta=4,
self.assertEqual(
len(operational_score_data),
len(data),
)

def test_split_data_train_validation_test_stratification(self):
Expand Down Expand Up @@ -123,7 +122,12 @@ def test_split_data_train_validation_test_stratification(self):
df.loc[df.index.day == day, "load"] -= 5

# Act: Split using default arguments. Should result in stratified split
(train, val, test,) = model_selection.split_data_train_validation_test(
(
train,
val,
test,
operational_score_data,
) = model_selection.split_data_train_validation_test(
df, test_fraction=0, stratification_min_max=True
)

Expand Down
2 changes: 1 addition & 1 deletion test/unit/pipeline/test_optimize_hyperparameters.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@


def dummy_split(data, test_fraction, validation_fraction=0.0):
return data.iloc[:100], data.iloc[100:110], data.iloc[110:120]
return data.iloc[:100], data.iloc[100:110], data.iloc[110:120], data.iloc[110:120]


class TestOptimizeHyperParametersPipeline(BaseTestCase):
Expand Down
14 changes: 12 additions & 2 deletions test/unit/pipeline/test_pipeline_train_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ def set_feature_importance(self):


def split_dummy_arima(data, test_fraction):
return data.iloc[:-5], data.iloc[-10:-5], data.iloc[-5:]
return data.iloc[:-5], data.iloc[-10:-5], data.iloc[-5:], data.iloc[-5:]


class TestTrainModelPipeline(BaseTestCase):
Expand Down Expand Up @@ -190,6 +190,7 @@ def test_train_model_pipeline_core_happy_flow(self):
train_data,
validation_data,
test_data,
operational_score_data,
) = split_data_train_validation_test(data_with_features)

importance = model.set_feature_importance()
Expand Down Expand Up @@ -257,6 +258,7 @@ def test_train_model_pipeline_core_happy_flow_with_legacy_data_prep(self):
train_data,
validation_data,
test_data,
operational_score_data,
) = split_data_train_validation_test(data_with_features)

importance = model.set_feature_importance()
Expand Down Expand Up @@ -640,7 +642,14 @@ def test_train_pipeline_common_different_quantiles_with_quantile_regressor(self)
modified_model_specs["hyper_params"].update(dict(quantiles=old_quantiles))

# train model
model, report, train_data, validation_data, test_data = train_pipeline_common(
(
model,
report,
train_data,
validation_data,
test_data,
operational_score_data,
) = train_pipeline_common(
pj, modified_model_specs, self.train_input, horizons=[0.25, 47.0]
)

Expand All @@ -662,6 +671,7 @@ def test_train_pipeline_common_with_missing_custom_horizon(self):
train_data,
validation_data,
test_data,
operational_score_data,
) = train_pipeline_common(
self.pj, self.model_specs, self.train_input, horizons="custom_horizon"
)
Expand Down
Loading
Loading