-
Notifications
You must be signed in to change notification settings - Fork 0
/
xgboost_tune.py
131 lines (112 loc) · 4.03 KB
/
xgboost_tune.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn import metrics, preprocessing, linear_model
from sklearn.model_selection import GridSearchCV
#from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import data_helper
import model_helper
def modelfit(alg, X, y,
useTrainCV=True,
cv_folds=5,
eval_metric=['logloss', 'auc', 'error'],
early_stopping_rounds=5):
if useTrainCV:
print('cross validating xgboost model with training data')
xgb_param = alg.get_xgb_params()
xgtrain = xgb.DMatrix(X, label=y)
cvresult = xgb.cv(xgb_param, xgtrain,
num_boost_round=alg.get_params()['n_estimators'],
nfold=cv_folds,
#metrics='auc',
metrics=eval_metric,
early_stopping_rounds=early_stopping_rounds,
verbose_eval=True
)
print('cvresult:', cvresult)
alg.set_params(n_estimators=cvresult.shape[0])
#Fit the algorithm on the data
print('fitting model: {}'.format(alg))
alg.fit(X, y, eval_metric=eval_metric)
#Predict training set:
#dtrain_predictions = alg.predict(X)
dtrain_predprob = alg.predict_proba(X)[:,1]
#Print model report:
print("\nModel Report")
model_helper.eval_predict(y, dtrain_predprob)
#print "Accuracy : %.4g" % metrics.accuracy_score(y, dtrain_predictions)
#print "AUC Score (Train): %f" % metrics.roc_auc_score(y, dtrain_predprob)
#if performCV:
# print "CV Score : Mean - %.7g | Std - %.7g | Min - %.7g | Max - %.7g" \
# % (np.mean(cv_score),np.std(cv_score),np.min(cv_score),np.max(cv_score))
feat_imp = pd.Series(alg.get_booster().get_fscore()).sort_values(ascending=False)
feat_imp.plot(kind='bar', title='Feature Importances')
plt.ylabel('Feature Importance Score')
plt.show()
xgb.plot_importance(alg)
plt.show()
def tune1(xgb1, X, y):
param_test1 = {
'max_depth': range(3,10,2),
'min_child_weight': range(1,6,2)
}
gsearch1 = GridSearchCV(estimator =
xgb.XGBClassifier(learning_rate =0.1,
# TODO: clone xgb1?
#n_estimators=140,
n_estimators=xgb1.n_estimators,
max_depth=5,
min_child_weight=1,
gamma=0, subsample=0.8, colsample_bytree=0.8,
objective= 'binary:logistic',
nthread=4,
scale_pos_weight=1,
seed=27),
param_grid = param_test1,
#scoring='roc_auc',
scoring='neg_log_loss',
n_jobs=4,
iid=False,
verbose=5,
cv=5)
gsearch1.fit(X, y)
print(gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_)
def main():
# Set seed for reproducibility
np.random.seed(0)
print("Loading data...")
tour = 90
nrows = None
# Load the data from the CSV files
training_data = data_helper.load_training_data(tour, nrows=nrows)
#training_data = pd.read_csv(data_dir + '/numerai_training_data.csv', header=0)
#prediction_data = pd.read_csv(data_dir + '/numerai_tournament_data.csv', header=0)
#prediction_data = data_helper.load_testing_data(tour, nrows=nrows)
features = data_helper.get_feature_names(training_data)
# Transform the loaded CSV data into numpy arrays
frac = 0.1
frac = 0.01
frac = 0.0001
frac = None
if frac is not None:
training_data = training_data.sample(frac=frac)
X, y = data_helper.get_Xy(training_data)
xgb1 = xgb.XGBClassifier(
learning_rate =0.1,
n_estimators=1000,
max_depth=5,
min_child_weight=1,
gamma=0,
subsample=0.8,
colsample_bytree=0.8,
objective='binary:logistic',
nthread=4,
scale_pos_weight=1,
seed=27)
#modelfit(xgb1, X, y)
tune1(xgb1, X, y)
if __name__ == '__main__':
main()