-
Notifications
You must be signed in to change notification settings - Fork 19
/
saxpy_amd64.s
290 lines (241 loc) · 5.07 KB
/
saxpy_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
//func Saxpy(N int, alpha float32, X []float32, incX int, Y []float32, incY int)
TEXT ·Saxpy(SB), 7, $0
MOVQ N+0(FP), BP
MOVSS alpha+8(FP), X0
MOVQ X_data+16(FP), SI
MOVQ incX+40(FP), AX
MOVQ Y_data+48(FP), DI
MOVQ incY+72(FP), BX
// Setup 0, 1, -1
PCMPEQW X1, X1
PCMPEQW X8, X8
XORPS X7, X7 // 0
PSLLL $25, X1
PSRLL $2, X1 // 1
PSLLL $31, X8
ORPS X1, X8 // -1
// Check data bounaries
MOVQ BP, CX
DECQ CX
MOVQ CX, DX
IMULQ AX, CX // CX = incX * (N - 1)
IMULQ BX, DX // DX = incY * (N - 1)
CMPQ CX, X_len+24(FP)
JGE panic
CMPQ DX, Y_len+56(FP)
JGE panic
// Check that is there any work to do
UCOMISS X0, X7
JE end // alpha == 0
// Setup strides
SALQ $2, AX // AX = sizeof(float32) * incX
SALQ $2, BX // BX = sizeof(float32) * incY
// Check that there are 4 or more pairs for SIMD calculations
SUBQ $4, BP
JL rest // There are less than 4 pairs to process
// Setup four alphas in X0
SHUFPS $0, X0, X0
// Check if incX != 1 or incY != 1
CMPQ AX, $4
JNE with_stride
CMPQ BX, $4
JNE with_stride
// Fully optimized loop (for incX == incY == 1)
UCOMISS X0, X1
JE full_simd_loop_sum // alpha == 1
UCOMISS X0, X8
JE full_simd_loop_diff // alpha == -1
full_simd_loop:
// Load four pairs and scale
MOVUPS (SI), X2
MOVUPS (DI), X3
MULPS X0, X2
// Save sum
ADDPS X2, X3
MOVUPS X3, (DI)
// Update data pointers
ADDQ $16, SI
ADDQ $16, DI
SUBQ $4, BP
JGE full_simd_loop // There are 4 or more pairs to process
JMP rest
full_simd_loop_sum:
// Load four pairs
MOVUPS (SI), X2
MOVUPS (DI), X3
// Save a sum
ADDPS X2, X3
MOVUPS X3, (DI)
// Update data pointers
ADDQ $16, SI
ADDQ $16, DI
SUBQ $4, BP
JGE full_simd_loop_sum // There are 4 or more pairs to process
JMP rest_sum
full_simd_loop_diff:
// Load four pairs
MOVUPS (SI), X2
MOVUPS (DI), X3
// Save a difference
SUBPS X2, X3
MOVUPS X3, (DI)
// Update data pointers
ADDQ $16, SI
ADDQ $16, DI
SUBQ $4, BP
JGE full_simd_loop_diff // There are 4 or more pairs to process
JMP rest_diff
with_stride:
// Setup long strides
MOVQ AX, CX
MOVQ BX, DX
SALQ $1, CX // CX = 8 * incX
SALQ $1, DX // DX = 8 * incY
UCOMISS X0, X1
JE half_simd_loop_sum // alpha == 1
UCOMISS X0, X8
JE half_simd_loop_diff // alpha == -1
half_simd_loop:
// Load first two pairs
MOVSS (SI), X2
MOVSS (SI)(AX*1), X4
MOVSS (DI), X3
MOVSS (DI)(BX*1), X5
// Create two half-vectors
UNPCKLPS X4, X2
UNPCKLPS X5, X3
// Save data pointer for destination
MOVQ DI, R8
// Update data pointers using long strides
ADDQ CX, SI
ADDQ DX, DI
// Load second two pairs
MOVSS (SI), X4
MOVSS (SI)(AX*1), X6
MOVSS (DI), X5
MOVSS (DI)(BX*1), X7
// Create two half-vectors
UNPCKLPS X6, X4
UNPCKLPS X7, X5
// Create two full-vectors
MOVLHPS X4, X2
MOVLHPS X5, X3
// Scale and sum
MULPS X0, X2
ADDPS X2, X3
// Unvectorize and save the result
MOVHLPS X3, X5
MOVSS X3, X4
MOVSS X5, X6
SHUFPS $0xe1, X3, X3
SHUFPS $0xe1, X5, X5
MOVSS X4, (R8)
MOVSS X3, (R8)(BX*1)
MOVSS X6, (DI)
MOVSS X5, (DI)(BX*1)
// Update data pointers using long strides
ADDQ CX, SI
ADDQ DX, DI
SUBQ $4, BP
JGE half_simd_loop // There are 4 or more pairs to process
JMP rest
half_simd_loop_sum:
MOVSS (DI), X2
MOVSS (DI)(BX*1), X3
ADDSS (SI), X2
ADDSS (SI)(AX*1), X3
MOVSS X2, (DI)
MOVSS X3, (DI)(BX*1)
// Update data pointers using long strides
ADDQ CX, SI
ADDQ DX, DI
MOVSS (DI), X4
MOVSS (DI)(BX*1), X5
ADDSS (SI), X4
ADDSS (SI)(AX*1), X5
MOVSS X4, (DI)
MOVSS X5, (DI)(BX*1)
// Update data pointers using long strides
ADDQ CX, SI
ADDQ DX, DI
SUBQ $4, BP
JGE half_simd_loop_sum // There are 4 or more pairs to process
JMP rest_sum
half_simd_loop_diff:
MOVSS (DI), X2
MOVSS (DI)(BX*1), X3
SUBSS (SI), X2
SUBSS (SI)(AX*1), X3
MOVSS X2, (DI)
MOVSS X3, (DI)(BX*1)
// Update data pointers using long strides
ADDQ CX, SI
ADDQ DX, DI
MOVSS (DI), X4
MOVSS (DI)(BX*1), X5
SUBSS (SI), X4
SUBSS (SI)(AX*1), X5
MOVSS X4, (DI)
MOVSS X5, (DI)(BX*1)
// Update data pointers using long strides
ADDQ CX, SI
ADDQ DX, DI
SUBQ $4, BP
JGE half_simd_loop_diff // There are 4 or more pairs to process
JMP rest_diff
rest:
// Undo last SUBQ
ADDQ $4, BP
// Check that are there any value to process
JE end
loop:
// Load from X and scale
MOVSS (SI), X2
MULSS X0, X2
// Save sum in Y
ADDSS (DI), X2
MOVSS X2, (DI)
// Update data pointers
ADDQ AX, SI
ADDQ BX, DI
DECQ BP
JNE loop
RET
rest_sum:
// Undo last SUBQ
ADDQ $4, BP
// Check that are there any value to process
JE end
loop_sum:
// Load from X
MOVSS (SI), X2
// Save sum in Y
ADDSS (DI), X2
MOVSS X2, (DI)
// Update data pointers
ADDQ AX, SI
ADDQ BX, DI
DECQ BP
JNE loop_sum
RET
rest_diff:
// Undo last SUBQ
ADDQ $4, BP
// Check that are there any value to process
JE end
loop_diff:
// Load from Y
MOVSS (DI), X2
// Save sum in Y
SUBSS (SI), X2
MOVSS X2, (DI)
// Update data pointers
ADDQ AX, SI
ADDQ BX, DI
DECQ BP
JNE loop_diff
RET
panic:
CALL ·panicIndex(SB)
end:
RET