# Author: Alexandre Bovet <[email protected]>
# License: BSD 3 clause
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
# note: sklearn.externals.joblib and SGDClassifier's n_iter parameter are only
# available in older scikit-learn releases (joblib is now a standalone package
# and the parameter was later renamed to max_iter)
from sklearn.externals import joblib
from sklearn.feature_extraction import DictVectorizer
import pickle
import time
import numpy as np
import ujson as json
from baseModule import baseModule
class trainClassifier(baseModule):
    """ Train a classifier on the training set using the best parameters.

        Must be initialized with a dictionary `job` containing the keys
        `features_pickle_file`, `labels_pickle_file`, `best_params_file`,
        `labels_mappers_file` and `classifier_filename`.

        Uses the features and labels from `features_pickle_file` and
        `labels_pickle_file` to train the classifier with the parameters
        from `best_params_file`. The trained classifier is then saved to
        `classifier_filename`.
    """
    def run(self):
        #======================================================================
        # PARAMETERS
        #======================================================================
        features_file = self.job['features_pickle_file']
        labels_file = self.job['labels_pickle_file']
        best_params_file = self.job['best_params_file']
        labels_mappers_file = self.job['labels_mappers_file']
        classifier_filename = self.job['classifier_filename']

        # load the best parameters found during cross-validation
        with open(best_params_file, 'r') as fopen:
            best_parameters = json.load(fopen)

        # set classifier parameters
        # loss function (log = logistic regression)
        loss = best_parameters.get('classifier__loss', 'log')
        # regularization (l2 = Ridge (L2 norm))
        penalty = best_parameters.get('classifier__penalty', 'l2')
        # regularization strength
        alpha = best_parameters.get('classifier__alpha', 0.1)
        # class weight (if the training set is not balanced)
        class_weight = best_parameters.get('classifier__class_weight', None)
        # number of iterations of the stochastic gradient descent
        # (SGD should see around 1e6 samples)
        n_iter = best_parameters.get('classifier__n_iter', 100)

        # pipeline: vectorize the feature dicts, then fit a linear model
        # with stochastic gradient descent
        pipeline_list = [('feat_vectorizer', DictVectorizer(dtype=np.int8,
                                                            sparse=True,
                                                            sort=False)),
                         ('classifier', SGDClassifier(loss=loss,
                                                      alpha=alpha,
                                                      n_iter=n_iter,
                                                      penalty=penalty,
                                                      class_weight=class_weight,
                                                      shuffle=True,
                                                      random_state=42))]

        print('Training classifier with the following parameters:')
        print(' loss = ' + str(loss))
        print(' alpha = ' + str(alpha))
        print(' n_iter = ' + str(n_iter))
        print(' penalty = ' + str(penalty))
        print(' class_weight = ' + str(class_weight) + '\n')

        pipeline = Pipeline(pipeline_list)

        # load the label name <-> label value mappers
        with open(labels_mappers_file, 'rb') as fopen:
            labels_mappers = pickle.load(fopen)

        label_mapper = labels_mappers['label_mapper']
        label_inv_mapper = labels_mappers['label_inv_mapper']

        # load the training features and labels
        with open(features_file, 'rb') as fopen:
            features = pickle.load(fopen)

        with open(labels_file, 'rb') as fopen:
            labels = pickle.load(fopen)

        def label_transformer(labels, mapper=label_mapper):
            """ maps label names to label values """
            return np.array([mapper[l] for l in labels])

        y = label_transformer(labels)

        print('fitting classifier')
        t0 = time.time()
        pipeline.fit(features, y)
        self.print_elapsed_time(t0)

        # save the fitted pipeline together with the label mappers
        self.to_dump = {'sklearn_pipeline': pipeline,
                        'label_mapper': label_mapper,
                        'label_inv_mapper': label_inv_mapper}

        joblib.dump(self.to_dump, classifier_filename)
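
# -----------------------------------------------------------------------------
# Minimal usage sketch. It assumes that baseModule takes the `job` dictionary
# as its constructor argument and exposes it as self.job, which is how run()
# reads its parameters above; the file names are illustrative placeholders.
# -----------------------------------------------------------------------------
if __name__ == '__main__':
    job = {'features_pickle_file': 'features.pickle',
           'labels_pickle_file': 'labels.pickle',
           'best_params_file': 'best_params.json',
           'labels_mappers_file': 'labels_mappers.pickle',
           'classifier_filename': 'classifier.pickle'}
    trainClassifier(job).run()

    # the dumped file can later be reloaded and used for prediction, e.g.:
    # model = joblib.load(job['classifier_filename'])
    # y_pred = model['sklearn_pipeline'].predict(list_of_feature_dicts)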