Skip to content

Commit

Permalink
Add InFrequentCategoryEncoder
Browse files Browse the repository at this point in the history
  • Loading branch information
ThomasMeissnerDS committed Aug 1, 2024
1 parent f002de6 commit bb21af1
Show file tree
Hide file tree
Showing 10 changed files with 197 additions and 2 deletions.
21 changes: 21 additions & 0 deletions bluecast/blueprints/cast.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
from bluecast.preprocessing.encode_target_labels import TargetLabelEncoder
from bluecast.preprocessing.feature_selection import BoostaRootaWrapper
from bluecast.preprocessing.feature_types import FeatureTypeDetector
from bluecast.preprocessing.infrequent_categories import InFrequentCategoryEncoder
from bluecast.preprocessing.nulls_and_infs import fill_infinite_values
from bluecast.preprocessing.onehot_encoding import OneHotCategoryEncoder
from bluecast.preprocessing.schema_checks import SchemaDetector
Expand Down Expand Up @@ -115,6 +116,7 @@ def __init__(
self.conf_xgboost = conf_xgboost
self.conf_params_xgboost = conf_params_xgboost
self.feat_type_detector: Optional[FeatureTypeDetector] = None
self.infreq_cat_encoder: Optional[InFrequentCategoryEncoder] = None
self.cat_encoder: Optional[
Union[BinaryClassTargetEncoder, MultiClassTargetEncoder]
] = None
Expand Down Expand Up @@ -343,6 +345,18 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
self.cat_columns is not None
and not self.conf_training.cat_encoding_via_ml_algorithm
):
from bluecast.preprocessing.infrequent_categories import (
InFrequentCategoryEncoder,
)

# The threshold field on TrainingConfig is named
# `infrequent_categories_threshold`; reading `infrequent_threshold`
# would fail with AttributeError.
self.infreq_cat_encoder = InFrequentCategoryEncoder(
    self.cat_columns,
    self.target_column,
    self.conf_training.infrequent_categories_threshold,
)
x_train = self.infreq_cat_encoder.fit_transform(x_train, y_train)
x_test = self.infreq_cat_encoder.transform(x_test)

self.category_encoder_orchestrator = CategoryEncoderOrchestrator(
self.target_column
)
Expand Down Expand Up @@ -542,6 +556,13 @@ def transform_new_data(self, df: pd.DataFrame) -> pd.DataFrame:
if self.schema_detector:
df = self.schema_detector.transform(df)

if (
self.cat_columns is not None
and self.infreq_cat_encoder
and not self.conf_training.cat_encoding_via_ml_algorithm
):
df = self.infreq_cat_encoder.transform(df.copy())

if (
self.cat_columns is not None
and self.onehot_encoder
Expand Down
21 changes: 21 additions & 0 deletions bluecast/blueprints/cast_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
from bluecast.preprocessing.encode_target_labels import TargetLabelEncoder
from bluecast.preprocessing.feature_selection import BoostaRootaWrapper
from bluecast.preprocessing.feature_types import FeatureTypeDetector
from bluecast.preprocessing.infrequent_categories import InFrequentCategoryEncoder
from bluecast.preprocessing.nulls_and_infs import fill_infinite_values
from bluecast.preprocessing.onehot_encoding import OneHotCategoryEncoder
from bluecast.preprocessing.schema_checks import SchemaDetector
Expand Down Expand Up @@ -117,6 +118,7 @@ def __init__(
self.conf_xgboost = conf_xgboost
self.conf_params_xgboost = conf_params_xgboost
self.feat_type_detector: Optional[FeatureTypeDetector] = None
self.infreq_cat_encoder: Optional[InFrequentCategoryEncoder] = None
self.cat_encoder: Optional[
Union[BinaryClassTargetEncoder, MultiClassTargetEncoder]
] = None
Expand Down Expand Up @@ -330,6 +332,18 @@ def fit(self, df: pd.DataFrame, target_col: str) -> None:
self.cat_columns is not None
and not self.conf_training.cat_encoding_via_ml_algorithm
):
from bluecast.preprocessing.infrequent_categories import (
InFrequentCategoryEncoder,
)

# The threshold field on TrainingConfig is named
# `infrequent_categories_threshold`; reading `infrequent_threshold`
# would fail with AttributeError.
self.infreq_cat_encoder = InFrequentCategoryEncoder(
    self.cat_columns,
    self.target_column,
    self.conf_training.infrequent_categories_threshold,
)
x_train = self.infreq_cat_encoder.fit_transform(x_train, y_train)
x_test = self.infreq_cat_encoder.transform(x_test)

self.category_encoder_orchestrator = CategoryEncoderOrchestrator(
self.target_column
)
Expand Down Expand Up @@ -506,6 +520,13 @@ def transform_new_data(self, df: pd.DataFrame) -> pd.DataFrame:
if self.schema_detector:
df = self.schema_detector.transform(df)

if (
self.cat_columns is not None
and self.infreq_cat_encoder
and not self.conf_training.cat_encoding_via_ml_algorithm
):
df = self.infreq_cat_encoder.transform(df.copy())

if (
self.cat_columns is not None
and self.onehot_encoder
Expand Down
4 changes: 4 additions & 0 deletions bluecast/config/training_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ class TrainingConfig(BaseModel):
:param cardinality_threshold_for_onehot_encoding: Categorical features with a cardinality of less or equal
this threshold will be onehot encoded. The rest will be target encoded. Will be ignored if
cat_encoding_via_ml_algorithm is set to true.
:param infrequent_categories_threshold: Categories that occur fewer times than this threshold will be
grouped into a common group. This is done to reduce the risk of overfitting. Will be ignored if
cat_encoding_via_ml_algorithm is set to true.
:param cat_encoding_via_ml_algorithm: Whether to use an ML algorithm for categorical encoding. If True, the
categorical encoding is done via a ML algorithm. If False, the categorical encoding is done via a target
encoding in the preprocessing steps. See the ReadMe for more details.
Expand Down Expand Up @@ -99,6 +102,7 @@ class TrainingConfig(BaseModel):
train_split_stratify: bool = True
use_full_data_for_final_model: bool = False
cardinality_threshold_for_onehot_encoding: int = 5
infrequent_categories_threshold: int = 5
cat_encoding_via_ml_algorithm: bool = False
show_detailed_tuning_logs: bool = False
optuna_sampler_n_startup_trials: int = 10
Expand Down
52 changes: 52 additions & 0 deletions bluecast/preprocessing/infrequent_categories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""
Infrequent categories may cause overfitting.
This module groups infrequent categories into a common group to reduce the risk of overfitting
"""

import logging
from typing import Dict, List, Union

import pandas as pd


class InFrequentCategoryEncoder:
    """Group infrequent categories into a common group.

    For every categorical column the number of occurrences of each category
    is learned during ``fit_transform``. Categories whose learned count is
    strictly below ``infrequent_threshold`` are replaced by the label
    ``"rare categories"``. Values without a learned count (missing values or
    categories not seen during fitting) are left unchanged.
    """

    def __init__(
        self,
        cat_columns: List[Union[str, float, int]],
        target_col: Union[str, float, int],
        infrequent_threshold: int = 5,
    ):
        """Store the configuration of the encoder.

        :param cat_columns: Names of the categorical columns to process.
        :param target_col: Name of the target column. It is removed from
            ``cat_columns`` during ``fit_transform`` if present.
        :param infrequent_threshold: Categories occurring fewer times than
            this value are grouped together.
        """
        # Per-column category counts, filled by fit_transform.
        self.frequencies: Dict[Union[str, float, int], pd.Series] = {}
        self.prediction_mode: bool = False
        # NOTE: the list is stored by reference, so the removal of the target
        # column in fit_transform is visible to the caller's list as well.
        self.cat_columns = cat_columns
        self.target_col = target_col
        self.infrequent_threshold = infrequent_threshold

    def _group_rare(self, x: pd.DataFrame, col: Union[str, float, int]) -> pd.Series:
        """Return column ``col`` of ``x`` with rare categories replaced."""
        # Look up the learned count of every value; values without a learned
        # count become NaN, compare as False and therefore stay untouched.
        counts = x[col].map(self.frequencies[col], na_action="ignore")
        return x[col].mask(counts < self.infrequent_threshold, "rare categories")

    def fit_transform(self, x: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        """Find infrequent categories and transform the columns.

        The columns of ``x`` are overwritten in place; a copy of the modified
        frame is returned.

        :param x: DataFrame containing the categorical columns.
        :param y: Target values. Not used by this encoder.
        :return: Copy of ``x`` with infrequent categories grouped.
        """
        logging.info("Start fitting infrequent category encoder.")
        if self.target_col in self.cat_columns:
            self.cat_columns.remove(self.target_col)

        for col in self.cat_columns:
            self.frequencies[col] = x[col].value_counts()
            x[col] = self._group_rare(x, col)
        return x.copy()  # copy against high fragmentation

    def transform(self, x: pd.DataFrame) -> pd.DataFrame:
        """Transform categories based on the frequencies learned while fitting.

        The columns of ``x`` are overwritten in place and ``x`` is returned.

        :param x: DataFrame containing the categorical columns.
        :return: ``x`` with infrequent categories grouped.
        :raises KeyError: If a column has no learned frequencies, e.g. when
            called before ``fit_transform``.
        """
        logging.info("Start transforming categories with infrequent category encoder.")
        for col in self.cat_columns:
            x[col] = self._group_rare(x, col)
        return x
97 changes: 97 additions & 0 deletions bluecast/tests/test_infrequent_categories.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
import pandas as pd
import pytest

from bluecast.preprocessing.infrequent_categories import InFrequentCategoryEncoder


@pytest.fixture
def sample_data():
    """Provide a small frame with two categorical columns and a binary target."""
    columns = {
        "cat1": ["a", "b", "b", "c", "c", "c", "d", "e"],
        "cat2": ["w", "w", "x", "y", "y", "z", "z", "z"],
        "target": [0, 1, 0, 1, 0, 1, 0, 1],
    }
    return pd.DataFrame(columns)


def test_fit_transform(sample_data):
    """Categories occurring fewer than two times are grouped while fitting."""
    encoder = InFrequentCategoryEncoder(
        cat_columns=["cat1", "cat2"], target_col="target", infrequent_threshold=2
    )
    features = sample_data.drop(columns="target")
    labels = sample_data["target"]
    transformed_df = encoder.fit_transform(features, labels)

    expected_by_column = {
        "cat1": [
            "rare categories",
            "b",
            "b",
            "c",
            "c",
            "c",
            "rare categories",
            "rare categories",
        ],
        "cat2": ["w", "w", "rare categories", "y", "y", "z", "z", "z"],
    }
    for column, expected in expected_by_column.items():
        assert transformed_df[column].tolist() == expected


def test_transform(sample_data):
    """New data is grouped using the counts learned on the training data."""
    encoder = InFrequentCategoryEncoder(
        cat_columns=["cat1", "cat2"], target_col="target", infrequent_threshold=2
    )
    encoder.fit_transform(sample_data.drop(columns="target"), sample_data["target"])

    unseen_frame = pd.DataFrame(
        {
            "cat1": ["a", "b", "c", "d", "f"],
            "cat2": ["w", "x", "y", "z", "a"],
        }
    )
    result = encoder.transform(unseen_frame)

    # Categories that were never seen during fitting ("f", "a") stay as they are.
    expected_by_column = {
        "cat1": ["rare categories", "b", "c", "rare categories", "f"],
        "cat2": ["w", "rare categories", "y", "z", "a"],
    }
    for column, expected in expected_by_column.items():
        assert result[column].tolist() == expected


def test_no_infrequent_categories(sample_data):
    """With a threshold of one no category is rare, so the data is unchanged."""
    encoder = InFrequentCategoryEncoder(
        cat_columns=["cat1", "cat2"], target_col="target", infrequent_threshold=1
    )
    features = sample_data.drop(columns="target")
    result = encoder.fit_transform(features, sample_data["target"])

    for column in ("cat1", "cat2"):
        assert result[column].tolist() == sample_data[column].tolist()


def test_all_infrequent_categories(sample_data):
    """With a threshold above every count all values become the rare label."""
    encoder = InFrequentCategoryEncoder(
        cat_columns=["cat1", "cat2"], target_col="target", infrequent_threshold=10
    )
    features = sample_data.drop(columns="target")
    result = encoder.fit_transform(features, sample_data["target"])

    for column in ("cat1", "cat2"):
        values = result[column].tolist()
        assert all(entry == "rare categories" for entry in values)
Binary file removed dist/bluecast-1.5.0.tar.gz
Binary file not shown.
Binary file not shown.
Binary file added dist/bluecast-1.5.1.tar.gz
Binary file not shown.
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
project = "BlueCast"
copyright = "2024, Thomas Meißner"
author = "Thomas Meißner"
release = "1.5.0"
release = "1.5.1"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "bluecast"
version = "1.5.0"
version = "1.5.1"
description = "A lightweight and fast automl framework"
authors = ["Thomas Meißner <[email protected]>"]
license = "GPL-3.0-only"
Expand Down

0 comments on commit bb21af1

Please sign in to comment.