Skip to content

Commit

Permalink
Optimize feature selection and onehot encoding
Browse files: browse the repository at this point in the history
  • Loading branch information
ThomasMeissnerDS committed Oct 1, 2024
1 parent 9b2c251 commit 864be5b
Show file tree
Hide file tree
Showing 4 changed files with 29 additions and 20 deletions.
26 changes: 21 additions & 5 deletions bluecast/preprocessing/feature_selection.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import numpy as np
import pandas as pd
import xgboost as xgb
from tqdm import tqdm

from bluecast.preprocessing.custom import CustomPreprocessing

Expand All @@ -29,7 +30,7 @@ def fit_transform(
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
if self.class_problem == "binary":
model = xgb.XGBClassifier(
tree_method="approx",
tree_method="hist",
max_bin=255,
n_estimators=100,
random_state=self.random_state,
Expand All @@ -41,7 +42,7 @@ def fit_transform(
br = BoostARoota(metric="mlogloss")
else:
model = xgb.XGBRegressor(
tree_method="approx",
tree_method="hist",
max_bin=255,
n_estimators=100,
random_state=self.random_state,
Expand Down Expand Up @@ -150,13 +151,27 @@ def _create_shadow(x_train):
:param x_train: the dataframe to create shadow features on
:return: dataframe 2x width and the names of the shadows for removing later
"""
random_generator = np.random.default_rng(200)
x_train = x_train.apply(
lambda col: col.astype("category") if col.dtypes == "object" else col
)
x_shadow = x_train.copy()
for c in x_shadow.columns:
np.random.shuffle(x_shadow[c].values)
# rename the shadow

# Convert object columns to categorical
x_shadow = x_shadow.apply(
lambda col: col.astype("category") if col.dtypes == "object" else col
)

for c in tqdm(x_shadow.columns.to_list()):
x_shadow[c] = x_shadow[c].sample(frac=1, random_state=random_generator).values

# Rename the shadow
shadow_names = ["ShadowVar" + str(i + 1) for i in range(x_train.shape[1])]
x_shadow.columns = shadow_names

# Combine to make one new dataframe
x_train.columns = x_train.columns.astype(str)
x_shadow.columns = x_shadow.columns.astype(str)
new_x = pd.concat([x_train, x_shadow], axis=1)
return new_x, shadow_names

Expand All @@ -183,6 +198,7 @@ def _reduce_vars_xgb(x, y, metric, this_round, cutoff, n_iterations, delta, sile
for i in range(1, n_iterations + 1):
# Create the shadow variables and run the model to obtain importances
new_x, shadow_names = _create_shadow(x)
# Convert the object columns to category
dtrain = xgb.DMatrix(new_x, label=y, enable_categorical=True)
bst = xgb.train(param, dtrain, verbose_eval=False)
if i == 1:
Expand Down
23 changes: 8 additions & 15 deletions bluecast/preprocessing/onehot_encoding.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,3 @@
"""
Onehot encoding is a method to encode categorical features. It is an unsupervised encoding technique.
It is not recommended for features with high cardinality.
The onehot encoding technique is implemented in the category_encoders library. The library offers a variety of
different encoding techniques. The onehot encoding technique is implemented in the OneHotEncoder class.
"""

import logging
from typing import Dict, List, Union

Expand Down Expand Up @@ -57,11 +49,12 @@ def append_encoded_columns(
self, x: pd.DataFrame, encoded_cats: pd.DataFrame
) -> pd.DataFrame:
"""Append encoded columns to the DataFrame."""
x[encoded_cats.columns.to_list()] = encoded_cats
x[encoded_cats.columns.to_list()] = x[encoded_cats.columns.to_list()].astype(
int
)
# Instead of assigning the columns one by one, we concatenate them all at once.
encoded_cats = encoded_cats.astype(int) # Ensure all columns are integers
x_new = pd.concat([x, encoded_cats], axis=1)

# Set all new columns to -1 where all values are 0
mask_all_zero = (x[encoded_cats.columns.to_list()] == 0).all(axis=1)
x.loc[mask_all_zero, encoded_cats.columns.to_list()] = -1
return x
mask_all_zero = (x_new[encoded_cats.columns.to_list()] == 0).all(axis=1)
x_new.loc[mask_all_zero, encoded_cats.columns.to_list()] = -1

return x_new
Binary file modified dist/bluecast-1.6.2-py3-none-any.whl
Binary file not shown.
Binary file modified dist/bluecast-1.6.2.tar.gz
Binary file not shown.

0 comments on commit 864be5b

Please sign in to comment.