-
Notifications
You must be signed in to change notification settings - Fork 1
/
splitcross.py
205 lines (171 loc) · 9.97 KB
/
splitcross.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
from collections import defaultdict
import torch
import torch.nn as nn
import numpy as np
class SplitCrossEntropyLoss(nn.Module):
r'''SplitCrossEntropyLoss calculates an approximate softmax'''
def __init__(self, hidden_size, splits, verbose=False):
# We assume splits is [0, split1, split2, N] where N >= |V|
# For example, a vocab of 1000 words may have splits [0] + [100, 500] + [inf]
super(SplitCrossEntropyLoss, self).__init__()
self.hidden_size = hidden_size
self.splits = [0] + splits + [100 * 1000000]
self.nsplits = len(self.splits) - 1
self.stats = defaultdict(list)
self.verbose = verbose
# Each of the splits that aren't in the head require a pretend token, we'll call them tombstones
# The probability given to this tombstone is the probability of selecting an item from the represented split
if self.nsplits > 1:
self.tail_vectors = nn.Parameter(torch.zeros(self.nsplits - 1, hidden_size))
self.tail_bias = nn.Parameter(torch.zeros(self.nsplits - 1))
def logprob(self, weight, bias, hiddens, splits=None, softmaxed_head_res=None, verbose=False):
# First we perform the first softmax on the head vocabulary and the tombstones
if softmaxed_head_res is None:
start, end = self.splits[0], self.splits[1]
head_weight = None if end - start == 0 else weight[start:end]
head_bias = None if end - start == 0 else bias[start:end]
# We only add the tombstones if we have more than one split
if self.nsplits > 1:
head_weight = self.tail_vectors if head_weight is None else torch.cat([head_weight, self.tail_vectors])
head_bias = self.tail_bias if head_bias is None else torch.cat([head_bias, self.tail_bias])
# Perform the softmax calculation for the word vectors in the head for all splits
# We need to guard against empty splits as torch.cat does not like random lists
head_res = torch.nn.functional.linear(hiddens, head_weight, bias=head_bias)
softmaxed_head_res = torch.nn.functional.log_softmax(head_res, dim=-1)
if splits is None:
splits = list(range(self.nsplits))
results = []
running_offset = 0
for idx in splits:
# For those targets in the head (idx == 0) we only need to return their loss
if idx == 0:
results.append(softmaxed_head_res[:, :-(self.nsplits - 1)])
# If the target is in one of the splits, the probability is the p(tombstone) * p(word within tombstone)
else:
start, end = self.splits[idx], self.splits[idx + 1]
tail_weight = weight[start:end]
tail_bias = bias[start:end]
# Calculate the softmax for the words in the tombstone
tail_res = torch.nn.functional.linear(hiddens, tail_weight, bias=tail_bias)
# Then we calculate p(tombstone) * p(word in tombstone)
# Adding is equivalent to multiplication in log space
head_entropy = (softmaxed_head_res[:, -idx]).contiguous()
tail_entropy = torch.nn.functional.log_softmax(tail_res, dim=-1)
results.append(head_entropy.view(-1, 1) + tail_entropy)
if len(results) > 1:
return torch.cat(results, dim=1)
return results[0]
def split_on_targets(self, hiddens, targets):
# Split the targets into those in the head and in the tail
split_targets = []
split_hiddens = []
# Determine to which split each element belongs (for each start split value, add 1 if equal or greater)
# This method appears slower at least for WT-103 values for approx softmax
#masks = [(targets >= self.splits[idx]).view(1, -1) for idx in range(1, self.nsplits)]
#mask = torch.sum(torch.cat(masks, dim=0), dim=0)
###
# This is equally fast for smaller splits as method below but scales linearly
mask = None
for idx in range(1, self.nsplits):
partial_mask = targets >= self.splits[idx]
mask = mask + partial_mask if mask is not None else partial_mask
###
#masks = torch.stack([targets] * (self.nsplits - 1))
#mask = torch.sum(masks >= self.split_starts, dim=0)
for idx in range(self.nsplits):
# If there are no splits, avoid costly masked select
if self.nsplits == 1:
split_targets, split_hiddens = [targets], [hiddens]
continue
# If all the words are covered by earlier targets, we have empties so later stages don't freak out
if sum(len(t) for t in split_targets) == len(targets):
split_targets.append([])
split_hiddens.append([])
continue
# Are you in our split?
tmp_mask = mask == idx
split_targets.append(torch.masked_select(targets, tmp_mask))
split_hiddens.append(hiddens.masked_select(tmp_mask.unsqueeze(1).expand_as(hiddens)).view(-1, hiddens.size(1)))
return split_targets, split_hiddens
def forward(self, weight, bias, hiddens, targets, verbose=False):
if self.verbose or verbose:
for idx in sorted(self.stats):
print('{}: {}'.format(idx, int(np.mean(self.stats[idx]))), end=', ')
print()
total_loss = None
if len(hiddens.size()) > 2: hiddens = hiddens.view(-1, hiddens.size(2))
split_targets, split_hiddens = self.split_on_targets(hiddens, targets)
# First we perform the first softmax on the head vocabulary and the tombstones
start, end = self.splits[0], self.splits[1]
head_weight = None if end - start == 0 else weight[start:end]
head_bias = None if end - start == 0 else bias[start:end]
# We only add the tombstones if we have more than one split
if self.nsplits > 1:
head_weight = self.tail_vectors if head_weight is None else torch.cat([head_weight, self.tail_vectors])
head_bias = self.tail_bias if head_bias is None else torch.cat([head_bias, self.tail_bias])
# Perform the softmax calculation for the word vectors in the head for all splits
# We need to guard against empty splits as torch.cat does not like random lists
combo = torch.cat([split_hiddens[i] for i in range(self.nsplits) if len(split_hiddens[i])])
###
# Fix for AMP with 02 optimization
if head_weight.dtype == torch.float16: combo = combo.half()
#head_weight, head_bias = head_weight.to(torch.float), head_bias.to(torch.float)
###
all_head_res = torch.nn.functional.linear(combo, head_weight, bias=head_bias)
softmaxed_all_head_res = torch.nn.functional.log_softmax(all_head_res, dim=-1)
if self.verbose or verbose:
self.stats[0].append(combo.size()[0] * head_weight.size()[0])
running_offset = 0
for idx in range(self.nsplits):
# If there are no targets for this split, continue
if len(split_targets[idx]) == 0: continue
# For those targets in the head (idx == 0) we only need to return their loss
if idx == 0:
softmaxed_head_res = softmaxed_all_head_res[running_offset:running_offset + len(split_hiddens[idx])]
entropy = -torch.gather(softmaxed_head_res, dim=1, index=split_targets[idx].view(-1, 1))
# If the target is in one of the splits, the probability is the p(tombstone) * p(word within tombstone)
else:
softmaxed_head_res = softmaxed_all_head_res[running_offset:running_offset + len(split_hiddens[idx])]
if self.verbose or verbose:
start, end = self.splits[idx], self.splits[idx + 1]
tail_weight = weight[start:end]
self.stats[idx].append(split_hiddens[idx].size()[0] * tail_weight.size()[0])
# Calculate the softmax for the words in the tombstone
tail_res = self.logprob(weight, bias, split_hiddens[idx], splits=[idx], softmaxed_head_res=softmaxed_head_res)
# Then we calculate p(tombstone) * p(word in tombstone)
# Adding is equivalent to multiplication in log space
head_entropy = softmaxed_head_res[:, -idx]
# All indices are shifted - if the first split handles [0,...,499] then the 500th in the second split will be 0 indexed
indices = (split_targets[idx] - self.splits[idx]).view(-1, 1)
# Warning: if you don't squeeze, you get an N x 1 return, which acts oddly with broadcasting
tail_entropy = torch.gather(torch.nn.functional.log_softmax(tail_res, dim=-1), dim=1, index=indices).squeeze()
entropy = -(head_entropy + tail_entropy)
###
running_offset += len(split_hiddens[idx])
total_loss = entropy.float().sum() if total_loss is None else total_loss + entropy.float().sum()
return (total_loss / len(targets)).type_as(weight)
if __name__ == '__main__':
np.random.seed(42)
torch.manual_seed(42)
if torch.cuda.is_available():
torch.cuda.manual_seed(42)
V = 8
H = 10
N = 100
E = 10
embed = torch.nn.Embedding(V, H)
crit = SplitCrossEntropyLoss(hidden_size=H, splits=[V // 2])
bias = torch.nn.Parameter(torch.ones(V))
optimizer = torch.optim.SGD(list(embed.parameters()) + list(crit.parameters()), lr=1)
for _ in range(E):
prev = torch.autograd.Variable((torch.rand(N, 1) * 0.999 * V).int().long())
x = torch.autograd.Variable((torch.rand(N, 1) * 0.999 * V).int().long())
y = embed(prev).squeeze()
c = crit(embed.weight, bias, y, x.view(N))
print('Crit', c.exp().data[0])
logprobs = crit.logprob(embed.weight, bias, y[:2]).exp()
print(logprobs)
print(logprobs.sum(dim=1))
optimizer.zero_grad()
c.backward()
optimizer.step()