-
Notifications
You must be signed in to change notification settings - Fork 0
/
project2_language_model.py
176 lines (136 loc) · 5.94 KB
/
project2_language_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 26 10:13:54 2018
@author: fourn
"""
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.optimizers import RMSprop
from sklearn.cross_validation import train_test_split
import numpy as np
import io
import project2_utils as utils
import gc
import sys
def data_load(path):
file = io.open(path, encoding='utf-8')
text = file.read()
file.close()
return text[0:3806020]
# return text
def dataset_preparation(data):
# Get list of phrases : ['NN FF EE TT', 'FF SS LL'...]
phrases = data.split("\n")
print('Number phrases : ', len(phrases))
# Get list of list phrases : [['NN', 'FF', 'EE', 'TT'], ['FF', 'SS', 'LL']...]
input_sequences = []
for i in range(0, len(phrases)):
input_sequences.append(phrases[i].split(" "))
print('Total input sequences : ', len(input_sequences))
# Put the different chars of data in a list
chars = sorted(set(' '.join(phrases).split(" "))) # ['EE', 'FF', 'LL', 'NN', 'SS', 'TT']
print('Chars : ', chars)
total_words = len(chars)
print('Total words : ', total_words)
char_indices = dict((c, i) for i, c in enumerate(chars)) # ['EE' : 0, 'FF' : 1, 'LL' : 2, 'NN' : 3, 'SS' : 4, 'TT' : 5]
print('Char indice : ', char_indices)
return input_sequences, chars, char_indices, total_words
def predictors_label(input_sequences, maxlen):
# Get the predictors and the label for each phrases
predictors = [] # [['NN', 'FF'], ['FF', 'EE'], ['FF', 'SS']]
label = [] # ['EE', 'TT', 'LL']
for i in range(0, len(input_sequences)):
for j in range(0, len(input_sequences[i]) - maxlen):
predictors.append(input_sequences[i][j: j + maxlen])
label.append(input_sequences[i][j + maxlen])
return predictors, label
def vectorization(predictors, label, maxlen, chars, char_indices):
x = np.zeros((len(predictors), maxlen, len(chars)), dtype=np.bool) # [[[0, 0, 0, 1, 0, 0], [0, 1, 0, 0, 0, 0]], [[0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]], [0, 1, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0]]
y = np.zeros((len(predictors), len(chars)), dtype=np.bool) # [[1, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 1], [0, 0, 1, 0, 0, 0]]
for i, predictor in enumerate(predictors):
for t, char in enumerate(predictor):
x[i, t, char_indices[char]] = 1
y[i, char_indices[label[i]]] = 1
return x, y
def train_valid(x, y):
x_train, x_valid, y_train, y_valid = train_test_split(x, y, test_size=0.2)
return x_train, x_valid, y_train, y_valid
def create_model(x_train, y_train, x_valid, y_valid, maxlen, total_words):
# build the model: a single LSTM
model = Sequential()
model.add(LSTM(12, input_shape=(maxlen, total_words)))
model.add(Dense(total_words, activation='softmax'))
optimizer = RMSprop(lr=0.01)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()
model.fit(x_train, y_train, batch_size=128, epochs=10, validation_data=(x_valid, y_valid))
return model
def predict_char(model, predictors, labels, maxlen, chars, char_indices):
x, y = vectorization(predictors, labels, maxlen, chars, char_indices)
preds = model.predict(x, verbose=0)
char_preds = [] # Character predicted : ['AA', 'BB',...]
real_estimation = [] # Estimation of the real character : [0.9, 0.21,... ]
success = [] # 1 if good prediction, 0 else : [1, 0, 1, 1, ...]
for i in range(len(preds)):
pred = preds[i]
index_max = np.argmax(pred)
char_preds.append(chars[index_max])
real_estimation.append(pred[char_indices[labels[i]]])
if(chars[index_max] == labels[i]):
success.append(1)
else:
success.append(0)
return char_preds, real_estimation, success
def principal():
maxlen = utils.maxlen
print('Data Load...')
data = data_load('Corpus/CNN1.txt')
print('Size of data : ', sys.getsizeof(data))
print('Dataset preparation...')
input_sequences, chars, char_indices, total_words = dataset_preparation(data)
print('Size of input_sequences : ', sys.getsizeof(input_sequences))
print('Size of chars : ', sys.getsizeof(chars))
print('Size of char_indices : ', sys.getsizeof(char_indices))
del data
gc.collect()
utils.save_json(chars, 'data/chars_cnn_2')
utils.save_json(char_indices, 'data/char_indice_cnn_2')
print('Predictor and Label...')
predictors, label = predictors_label(input_sequences, maxlen)
print('Size of predictors : ', sys.getsizeof(predictors))
print('Size of label : ', sys.getsizeof(label))
del input_sequences
gc.collect()
print('Number predictor : ', len(predictors))
print('Number label : ', len(label))
print('Vectorization...')
x, y = vectorization(predictors, label, maxlen, chars, char_indices)
print('Size of x : ', sys.getsizeof(x))
print('Size of y : ', sys.getsizeof(y))
del predictors, label
gc.collect()
print('Len X : ', len(x))
print('Len Y : ', len(y))
print("Train Valid...")
x_train, x_valid, y_train, y_valid = train_valid(x,y)
print('Size of x_train : ', sys.getsizeof(x_train))
print('Size of x_valid : ', sys.getsizeof(x_valid))
print('Size of y_train : ', sys.getsizeof(y_train))
print('Size of y_valid : ', sys.getsizeof(y_valid))
del x, y
gc.collect()
print(len(x_train))
print(len(y_train))
print(len(x_valid))
print(len(y_valid))
print('Build model...')
model = create_model(x_train, y_train, x_valid, y_valid, maxlen, total_words)
print('Size of model : ', sys.getsizeof(model))
del x_train, y_train, x_valid, y_valid, total_words
gc.collect()
model.save('data/model_language_cnn.h5')
print('Model saved')
del model
gc.collect()
#principal()