# model_lgbm.py
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score
from lightgbm import LGBMClassifier

# Compute the F1 score from predicted probabilities at a fixed threshold.
def lgbm_f1(y_true, y_proba):
    # The threshold is hard-coded because LightGBM eval callbacks receive
    # exactly two arguments: (y_true, y_proba).
    threshold = 0.5
    y_pred = np.where(y_proba > threshold, 1, 0)
    eval_score = f1_score(y_true, y_pred)
    # LightGBM custom eval format: (metric_name, value, is_higher_better).
    return ("f1_score", eval_score, True)

# Doubled soft-F1 loss: average the soft-F1 cost of both classes.
# Inspired by: https://towardsdatascience.com/the-unknown-benefits-of-using-a-soft-f1-loss-in-classification-systems-753902c0105d
def double_soft_f1_loss(y_true, y_pred):
    # y_pred holds probabilities, so the confusion counts below are "soft".
    tp = np.sum(y_pred * y_true)
    fp = np.sum(y_pred * (1 - y_true))
    fn = np.sum((1 - y_pred) * y_true)
    tn = np.sum((1 - y_pred) * (1 - y_true))
    soft_f1_cls1 = 2 * tp / (2 * tp + fn + fp + 1e-16)
    soft_f1_cls0 = 2 * tn / (2 * tn + fn + fp + 1e-16)
    cost_cls1 = 1 - soft_f1_cls1
    cost_cls0 = 1 - soft_f1_cls0
    macro_cost = 0.5 * (cost_cls1 + cost_cls0)
    # This is a cost, so lower is better: the last element must be False.
    return ("double_soft_f1_loss", macro_cost, False)

def train_lgbm(eval_sets, **hyperparams):
    # Train a single LGBM classifier on the first train/test split.
    params = dict(
        objective="binary",
        n_estimators=100,
        learning_rate=0.01,
        boosting_type="gbdt",
        subsample=0.5,
        subsample_freq=1,
        num_leaves=31,
        max_depth=-1,
        boost_from_average=False,
        n_jobs=8,
    )
    # Caller-supplied hyperparameters override the defaults above.
    params.update(hyperparams)
    clf = LGBMClassifier(**params)
    clf.fit(
        eval_sets[0]["train"][0], eval_sets[0]["train"][1],
        eval_set=[(eval_sets[0]["test"][0], eval_sets[0]["test"][1])],
        eval_metric=["logloss", double_soft_f1_loss, lgbm_f1],
    )
    return clf
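
# Sketch of the eval_sets structure train_lgbm() expects (hypothetical names;
# x and y would come from your own feature pipeline):
# from sklearn.model_selection import train_test_split
# x_tr, x_te, y_tr, y_te = train_test_split(x, y, test_size=0.2, stratify=y)
# eval_sets = [{"train": (x_tr, y_tr), "test": (x_te, y_te)}]
# clf = train_lgbm(eval_sets, learning_rate=0.05)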

def train_best_lgbm(train_df, hyperparams):
    # Train LGBM on the whole training set for final predictions.
    # TODO merge with train_lgbm() and handle the data split (or not) within the function
    x_train = train_df.loc[:, ~train_df.columns.isin(["LABELS"])].values
    y_train = train_df.loc[:, "LABELS"].values
    clf = LGBMClassifier(objective="binary", **hyperparams)
    clf.fit(
        x_train,
        y_train,
        # Custom metrics must be passed as callables, not as strings.
        eval_metric=["logloss", double_soft_f1_loss],
    )
    return clf
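
# Example call (hypothetical DataFrame with a binary "LABELS" column):
# best_clf = train_best_lgbm(train_df, {"n_estimators": 200, "num_leaves": 31})
# proba = best_clf.predict_proba(x_test)[:, 1]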

def lgbm_plot_evals(model):
    # Plot every eval metric recorded on the fitted LightGBM model.
    sets = list(model.evals_result_.keys())
    metrics = list(model.evals_result_[sets[0]].keys())
    n_metrics = len(metrics)
    # squeeze=False keeps ax two-dimensional even with a single metric.
    fig, ax = plt.subplots(1, n_metrics, squeeze=False)
    for seti in sets:
        for j, metric in enumerate(metrics):
            ax[0, j].set_title(f"{metric}")
            ax[0, j].plot(model.evals_result_[seti][metric], label=f"{seti}")
    for j in range(n_metrics):
        ax[0, j].legend()
    return fig
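
# Typical flow (assumes the model was fit with an eval_set so that
# evals_result_ is populated):
# clf = train_lgbm(eval_sets)
# fig = lgbm_plot_evals(clf)
# plt.show()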