// daxpy_amd64.s
// func Daxpy(N int, alpha float64, X []float64, incX int, Y []float64, incY int)
//
// Daxpy computes Y[i*incY] += alpha * X[i*incX] for i = 0 .. N-1.
//
// Behaviour:
//   - N <= 0 returns immediately without touching X or Y.
//   - ·panicIndex is called when the offset of the last element,
//     inc * (N - 1), is negative or not below the slice length.
//   - alpha == 0 returns without touching Y.
//   - alpha == 1 and alpha == -1 use add-only / subtract-only loops
//     when there are at least 4 pairs; a NaN alpha always takes the
//     general multiply-add path.
//
// Register roles:
//   R8          pairs left to process (BP is deliberately not used, so
//               the frame pointer stays intact in this frameless routine)
//   SI, DI      pointers to the current X and Y elements
//   AX, BX      incX, incY; later the strides in bytes
//   CX, DX      bounds-check scratch; later the double strides in bytes
//   X0          alpha (copied to both lanes for the packed loops)
//   X1, X6, X7  constants 1.0, 0.0, -1.0
//   X2 .. X5    temporaries
TEXT ·Daxpy(SB), 7, $0
	MOVQ	N+0(FP), R8
	MOVSD	alpha+8(FP), X0
	MOVQ	X_data+16(FP), SI
	MOVQ	incX+40(FP), AX
	MOVQ	Y_data+48(FP), DI
	MOVQ	incY+72(FP), BX

	// An empty or negative element count means there is nothing to do.
	// This must precede the bounds check, which assumes N - 1 >= 0.
	CMPQ	R8, $0
	JLE	end

	// Build the constants 0.0, 1.0 and -1.0 without touching memory.
	PCMPEQL	X1, X1		// all bits set
	PCMPEQL	X7, X7		// all bits set
	XORPD	X6, X6		// X6 = 0.0
	PSLLQ	$54, X1
	PSRLQ	$2, X1		// X1 = 0x3FF0000000000000 = 1.0
	PSLLQ	$63, X7		// sign bit only
	ORPD	X1, X7		// X7 = 0xBFF0000000000000 = -1.0

	// Check data boundaries. The comparison is unsigned so that a
	// negative offset (negative increment) is rejected as well.
	MOVQ	R8, CX
	DECQ	CX
	MOVQ	CX, DX
	IMULQ	AX, CX		// CX = incX * (N - 1)
	IMULQ	BX, DX		// DX = incY * (N - 1)
	CMPQ	CX, X_len+24(FP)
	JAE	panic
	CMPQ	DX, Y_len+56(FP)
	JAE	panic

	// Check whether there is any work to do. UCOMISD also sets ZF when
	// the operands are unordered, so a NaN alpha has to be filtered out
	// with the parity flag before testing for equality.
	UCOMISD	X0, X6
	JPS	setup_strides	// alpha is NaN: not zero
	JE	end		// alpha == 0

setup_strides:
	SALQ	$3, AX		// AX = sizeof(float64) * incX
	SALQ	$3, BX		// BX = sizeof(float64) * incY

	// Check that there are 4 or more pairs for SIMD calculations.
	SUBQ	$4, R8
	JL	rest		// fewer than 4 pairs to process

	// Put alpha in both lanes of X0.
	MOVLHPS	X0, X0

	// Strided data cannot use the contiguous packed loads.
	CMPQ	AX, $8
	JNE	with_stride
	CMPQ	BX, $8
	JNE	with_stride

	// Fully optimized loops (for incX == incY == 1).
	UCOMISD	X0, X1
	JPS	full_simd_loop		// alpha is NaN: general path
	JE	full_simd_loop_sum	// alpha == 1
	UCOMISD	X0, X7
	JE	full_simd_loop_diff	// alpha == -1

full_simd_loop:
	// Load first two pairs and scale.
	MOVUPD	(SI), X2
	MOVUPD	(DI), X3
	MULPD	X0, X2
	// Load second two pairs and scale.
	MOVUPD	16(SI), X4
	MOVUPD	16(DI), X5
	MULPD	X0, X4
	// Save sum of first two pairs.
	ADDPD	X2, X3
	MOVUPD	X3, (DI)
	// Save sum of second two pairs.
	ADDPD	X4, X5
	MOVUPD	X5, 16(DI)
	// Update data pointers.
	ADDQ	$32, SI
	ADDQ	$32, DI
	SUBQ	$4, R8
	JGE	full_simd_loop	// 4 or more pairs still to process
	JMP	rest

full_simd_loop_sum:
	// Load first two pairs.
	MOVUPD	(SI), X2
	MOVUPD	(DI), X3
	// Load second two pairs.
	MOVUPD	16(SI), X4
	MOVUPD	16(DI), X5
	// Save sum of first two pairs.
	ADDPD	X2, X3
	MOVUPD	X3, (DI)
	// Save sum of second two pairs.
	ADDPD	X4, X5
	MOVUPD	X5, 16(DI)
	// Update data pointers.
	ADDQ	$32, SI
	ADDQ	$32, DI
	SUBQ	$4, R8
	JGE	full_simd_loop_sum	// 4 or more pairs still to process
	JMP	rest_sum

full_simd_loop_diff:
	// Load first two pairs.
	MOVUPD	(SI), X2
	MOVUPD	(DI), X3
	// Load second two pairs.
	MOVUPD	16(SI), X4
	MOVUPD	16(DI), X5
	// Save difference of first two pairs.
	SUBPD	X2, X3
	MOVUPD	X3, (DI)
	// Save difference of second two pairs.
	SUBPD	X4, X5
	MOVUPD	X5, 16(DI)
	// Update data pointers.
	ADDQ	$32, SI
	ADDQ	$32, DI
	SUBQ	$4, R8
	JGE	full_simd_loop_diff	// 4 or more pairs still to process
	JMP	rest_diff

with_stride:
	// Setup long strides (the distance covered by two elements).
	MOVQ	AX, CX
	MOVQ	BX, DX
	SALQ	$1, CX		// CX = 16 * incX
	SALQ	$1, DX		// DX = 16 * incY
	UCOMISD	X0, X1
	JPS	half_simd_loop		// alpha is NaN: general path
	JE	half_simd_loop_sum	// alpha == 1
	UCOMISD	X0, X7
	JE	half_simd_loop_diff	// alpha == -1

half_simd_loop:
	// Gather first two pairs and scale.
	MOVLPD	(SI), X2
	MOVHPD	(SI)(AX*1), X2
	MOVLPD	(DI), X3
	MOVHPD	(DI)(BX*1), X3
	MULPD	X0, X2
	// Save sum of first two pairs.
	ADDPD	X2, X3
	MOVLPD	X3, (DI)
	MOVHPD	X3, (DI)(BX*1)
	// Update data pointers using long strides.
	ADDQ	CX, SI
	ADDQ	DX, DI
	// Gather second two pairs and scale.
	MOVLPD	(SI), X4
	MOVHPD	(SI)(AX*1), X4
	MOVLPD	(DI), X5
	MOVHPD	(DI)(BX*1), X5
	MULPD	X0, X4
	// Save sum of second two pairs.
	ADDPD	X4, X5
	MOVLPD	X5, (DI)
	MOVHPD	X5, (DI)(BX*1)
	// Update data pointers using long strides.
	ADDQ	CX, SI
	ADDQ	DX, DI
	SUBQ	$4, R8
	JGE	half_simd_loop	// 4 or more pairs still to process
	JMP	rest

half_simd_loop_sum:
	// Gather first two pairs.
	MOVLPD	(SI), X2
	MOVHPD	(SI)(AX*1), X2
	MOVLPD	(DI), X3
	MOVHPD	(DI)(BX*1), X3
	// Save sum of first two pairs.
	ADDPD	X2, X3
	MOVLPD	X3, (DI)
	MOVHPD	X3, (DI)(BX*1)
	// Update data pointers using long strides.
	ADDQ	CX, SI
	ADDQ	DX, DI
	// Gather second two pairs.
	MOVLPD	(SI), X4
	MOVHPD	(SI)(AX*1), X4
	MOVLPD	(DI), X5
	MOVHPD	(DI)(BX*1), X5
	// Save sum of second two pairs.
	ADDPD	X4, X5
	MOVLPD	X5, (DI)
	MOVHPD	X5, (DI)(BX*1)
	// Update data pointers using long strides.
	ADDQ	CX, SI
	ADDQ	DX, DI
	SUBQ	$4, R8
	// Stay in the add-only loop; its tail is rest_sum.
	JGE	half_simd_loop_sum	// 4 or more pairs still to process
	JMP	rest_sum

half_simd_loop_diff:
	// Gather first two pairs.
	MOVLPD	(SI), X2
	MOVHPD	(SI)(AX*1), X2
	MOVLPD	(DI), X3
	MOVHPD	(DI)(BX*1), X3
	// Save difference of first two pairs.
	SUBPD	X2, X3
	MOVLPD	X3, (DI)
	MOVHPD	X3, (DI)(BX*1)
	// Update data pointers using long strides.
	ADDQ	CX, SI
	ADDQ	DX, DI
	// Gather second two pairs.
	MOVLPD	(SI), X4
	MOVHPD	(SI)(AX*1), X4
	MOVLPD	(DI), X5
	MOVHPD	(DI)(BX*1), X5
	// Save difference of second two pairs.
	SUBPD	X4, X5
	MOVLPD	X5, (DI)
	MOVHPD	X5, (DI)(BX*1)
	// Update data pointers using long strides.
	ADDQ	CX, SI
	ADDQ	DX, DI
	SUBQ	$4, R8
	// Stay in the subtract-only loop; its tail is rest_diff.
	JGE	half_simd_loop_diff	// 4 or more pairs still to process
	JMP	rest_diff

rest:
	// Undo the last SUBQ so that R8 holds the 0..3 leftover pairs.
	ADDQ	$4, R8
	// Check whether there are any values left to process.
	JE	end

loop:
	// Load from X and scale.
	MOVSD	(SI), X2
	MULSD	X0, X2
	// Save the sum in Y.
	ADDSD	(DI), X2
	MOVSD	X2, (DI)
	// Update data pointers.
	ADDQ	AX, SI
	ADDQ	BX, DI
	DECQ	R8
	JNE	loop
	RET

rest_sum:
	// Undo the last SUBQ so that R8 holds the 0..3 leftover pairs.
	ADDQ	$4, R8
	// Check whether there are any values left to process.
	JE	end

loop_sum:
	// Load from X.
	MOVSD	(SI), X2
	// Save the sum in Y.
	ADDSD	(DI), X2
	MOVSD	X2, (DI)
	// Update data pointers.
	ADDQ	AX, SI
	ADDQ	BX, DI
	DECQ	R8
	JNE	loop_sum
	RET

rest_diff:
	// Undo the last SUBQ so that R8 holds the 0..3 leftover pairs.
	ADDQ	$4, R8
	// Check whether there are any values left to process.
	JE	end

loop_diff:
	// Load from Y.
	MOVSD	(DI), X2
	// Save the difference Y - X in Y.
	SUBSD	(SI), X2
	MOVSD	X2, (DI)
	// Update data pointers.
	ADDQ	AX, SI
	ADDQ	BX, DI
	DECQ	R8
	JNE	loop_diff
	RET

panic:
	CALL	·panicIndex(SB)

end:
	RET