load_data_multimodal.py
import codecs
import os
from collections import Counter
import cPickle
import h5py
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from gensim.models import word2vec
import sys
# Python 2 workaround so utf-8 tweet text can be handled throughout
reload(sys)
sys.setdefaultencoding('utf8')
data_dir = '../data'
tweet_data_dir = '../data/tweet_new/'
# h5_file = "img_vgg_feature_all.h5"
h5_file = 'img_vgg_feature_224.h5'
IMAGEID = 'IMGID'
img_feature_file = h5py.File(os.path.join(data_dir, h5_file), 'r')
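# Each key in the HDF5 file is assumed to be an image id (the number after
# "IMGID:" in the tweet files), mapping to that image's precomputed VGG feature.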
def load_sentence(IMAGEID, tweet_data_dir, train_name, dev_name, test_name):
"""
    Read words from the data files and build sentences. Every line contains
    a word and its tag separated by a tab; sentences are separated by an
    empty line, and every sentence begins with an "IMGID:num" line.
"""
img_id = []
sentences = []
sentence = []
    sent_maxlen = 0
    word_maxlen = 0
img_to_feature = []
datasplit = []
for fname in (train_name, dev_name, test_name):
datasplit.append(len(img_id))
with open(os.path.join(tweet_data_dir, fname), 'r') as file:
for line in file:
line = line.rstrip()
if line == '':
sent_maxlen = max(sent_maxlen, len(sentence))
sentences.append(sentence)
sentence = []
else:
                    if line.startswith(IMAGEID):
                        # strip the leading "IMGID:" prefix to keep the image id
                        num = line[6:]
                        img_id.append(num)
                    else:
                        sentence.append(line.split('\t'))
                        word_maxlen = max(word_maxlen, len(str(line.split()[0])))
sentences.append(sentence)
datasplit.append(len(img_id))
num_sentence = len(sentences)
print "datasplit",datasplit
print sentences[len(sentences)-2]
print sentences[0]
# train_num = datasplit[1]
# dev_num = datasplit[2]-datasplit[1]
# test_num = datasplit[3]-datasplit[2]
    # look up the precomputed VGG feature for each image id
for image in img_id:
feature = img_feature_file.get(image)
np_feature = np.array(feature)
img_to_feature.append(np_feature)
print 'sent_maxlen', sent_maxlen
print 'word_maxlen', word_maxlen
print 'number sentence', len(sentences)
print 'number image', len(img_id)
    return [datasplit, sentences, img_to_feature, sent_maxlen, word_maxlen, num_sentence]
def load_word_matrix(vocabulary, size=200):
"""
    Convert the vocabulary into a matrix of pretrained word vectors; words
    missing from the model are initialized at random.
"""
b = 0
word_matrix = np.zeros((len(vocabulary)+1, size))
model = word2vec.Word2Vec.load('../40Wtweet_200dim.model')
    for word, i in vocabulary.iteritems():
        try:
            word_matrix[i] = model[word.lower().encode('utf8')]
        except KeyError:
            # words that are not in the pretrained model get a random embedding
            word_matrix[i] = np.random.uniform(-0.25, 0.25, size)
            b += 1
    print('there are %d words not in the model' % b)
return word_matrix
def vocab_bulid(sentences):
"""
    input:
        sentences: a list of sentences, where each sentence is a list of
        (word, label) pairs.
    output:
        the word, character, and label vocabularies and their inverse mappings.
"""
words = []
chars = []
labels = []
for sentence in sentences:
for word_label in sentence:
words.append(word_label[0])
labels.append(word_label[1])
for char in word_label[0]:
chars.append(char)
word_counts = Counter(words)
vocb_inv = [x[0] for x in word_counts.most_common()]
vocb = {x: i + 1 for i, x in enumerate(vocb_inv)}
    vocb['PAD'] = 0
    id_to_vocb = {i: x for x, i in vocb.items()}
char_counts = Counter(chars)
vocb_inv_char = [x[0] for x in char_counts.most_common()]
vocb_char = {x: i + 1 for i, x in enumerate(vocb_inv_char)}
labels_counts = Counter(labels)
    print 'labels_counts', len(labels_counts)
print labels_counts
labelVoc_inv, labelVoc = label_index(labels_counts)
    print 'labelVoc', labelVoc
return [id_to_vocb, vocb, vocb_inv, vocb_char, vocb_inv_char, labelVoc, labelVoc_inv]
def label_index(labels_counts):
"""
    The input is the output of a Counter over the labels. This function
    defines the (label, index) mapping and appends any dataset label that
    is missing from the predefined mapping.
"""
num_labels = len(labels_counts)
labelVoc_inv = [x[0] for x in labels_counts.most_common()]
labelVoc = {'0':0,
'B-PER':1, 'I-PER':2,
'B-LOC':3, 'I-LOC':4,
'B-ORG':5, 'I-ORG':6,
'B-OTHER':7, 'I-OTHER':8,
'O':9}
if len(labelVoc) < num_labels:
        for key, value in labels_counts.items():
            if key not in labelVoc:
                labelVoc.setdefault(key, len(labelVoc))
return labelVoc_inv, labelVoc
def pad_sequence(sentences, img_to_feature, vocabulary, vocabulary_char, labelVoc, word_maxlen=30, sent_maxlen=35):
"""
    Pad every word to the same length at the character level (word_maxlen,
    default 30), and pad every sentence to the same length (sent_maxlen,
    default 35).
"""
print sentences[0]
x = []
y = []
for sentence in sentences:
w_id =[]
y_id = []
for word_label in sentence:
w_id.append(vocabulary[word_label[0]])
y_id.append(labelVoc[word_label[1]])
x.append(w_id)
y.append(y_id)
y = pad_sequences(y, maxlen=sent_maxlen).astype(np.int32)
x = pad_sequences(x, maxlen=sent_maxlen).astype(np.int32)
img_x = np.asarray(img_to_feature)
    x_c = []
    for sentence in sentences:
        s_pad = np.zeros([sent_maxlen, word_maxlen], dtype=np.int32)
        s_c_pad = []
        for word_label in sentence:
            w_c = []
            char_pad = np.zeros([word_maxlen], dtype=np.int32)
            for char in word_label[0]:
                w_c.append(vocabulary_char[char])
            if len(w_c) <= word_maxlen:
                char_pad[:len(w_c)] = w_c
            else:
                # truncate over-long words to word_maxlen characters
                char_pad = w_c[:word_maxlen]
            s_c_pad.append(char_pad)
        # right-align the words so the char ids line up with the pre-padded
        # word ids produced by pad_sequences above
        for i in range(len(s_c_pad)):
            s_pad[sent_maxlen - len(s_c_pad) + i, :len(s_c_pad[i])] = s_c_pad[i]
        x_c.append(s_pad)
x_c = np.asarray(x_c)
x = np.asarray(x)
y = np.asarray(y)
return [x, x_c, img_x, y]
def load_data():
# vocabFileName = 'vocabulary_full_tweet_coatt_img.pkl'
# if os.path.isfile(vocabFileName):
# print "loading vocabulary..."
# id_to_vocb,word_matrix,sentences,datasplit,x, x_c, img_x, y, sent_maxlen, word_maxlen, num_sentence, vocb, vocb_char, labelVoc = cPickle.load(open(vocabFileName))
# else:
    print 'calculating vocabulary...'
    datasplit, sentences, img_to_feature, sent_maxlen, word_maxlen, num_sentence = load_sentence(
        IMAGEID, tweet_data_dir, 'train', 'dev', 'test')
    id_to_vocb, vocb, vocb_inv, vocb_char, vocb_inv_char, labelVoc, labelVoc_inv = vocab_bulid(sentences)
    word_matrix = load_word_matrix(vocb, size=200)
    x, x_c, img_x, y = pad_sequence(sentences, img_to_feature, vocb, vocb_char, labelVoc,
                                    word_maxlen=30, sent_maxlen=35)
# cPickle.dump([id_to_vocb,word_matrix,sentences,datasplit,x, x_c, img_x, y, sent_maxlen, word_maxlen, num_sentence, vocb, vocb_char, labelVoc], open(vocabFileName, "w"))
    return [id_to_vocb, word_matrix, sentences, datasplit, x, x_c, img_x, y,
            sent_maxlen, word_maxlen, num_sentence, vocb, vocb_char, labelVoc]
if __name__ == '__main__':
    id_to_vocb, word_matrix, sentences, datasplit, x, x_c, img_x, y, \
        sent_maxlen, word_maxlen, num_sentence, vocb, vocb_char, labelVoc = load_data()
print x_c[0][0][1]
print x_c[0][0]
print x_c[1]
print img_x[0]
print np.asarray(img_x).shape
print y
print 'datasplit', datasplit