-
Notifications
You must be signed in to change notification settings - Fork 19
/
sdsdot_amd64.s
168 lines (135 loc) · 3.02 KB
/
sdsdot_amd64.s
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
// func Sdsdot(N int, alpha float32, X []float32, incX int, Y []float32, incY int) float32
TEXT ·Sdsdot(SB), 7, $0
MOVQ N+0(FP), BP
MOVQ X_data+16(FP), SI
MOVQ incX+40(FP), AX
MOVQ Y_data+48(FP), DI
MOVQ incY+72(FP), BX
// Check data bounaries
MOVQ BP, CX
DECQ CX
MOVQ CX, DX
IMULQ AX, CX // CX = incX * (N - 1)
IMULQ BX, DX // DX = incY * (N - 1)
CMPQ CX, X_len+16(FP)
JGE panic
CMPQ DX, Y_len+56(FP)
JGE panic
// Clear accumulators
XORPD X0, X0
XORPD X1, X1
// Setup strides
SALQ $2, AX // AX = sizeof(float32) * incX
SALQ $2, BX // BX = sizeof(float32) * incY
// Check that there are 4 or more pairs for SIMD calculations
SUBQ $4, BP
JL rest // There are less than 4 pairs to process
// Check if incX != 1 or incY != 1
CMPQ AX, $4
JNE with_stride
CMPQ BX, $4
JNE with_stride
// Fully optimized loop (for incX == incY == 1)
full_simd_loop:
// Load four pairs
MOVUPS (SI), X2
MOVUPS (DI), X3
// Move two high pairs to low part of another registers
MOVHLPS X2, X4
MOVHLPS X3, X5
// Convert to float64
CVTPS2PD X2, X2
CVTPS2PD X3, X3
CVTPS2PD X4, X4
CVTPS2PD X5, X5
// Multiply converted values
MULPD X2, X3
MULPD X4, X5
// Update data pointers
ADDQ $16, SI
ADDQ $16, DI
// Accumulate the results of multiplications
ADDPD X3, X0
ADDPD X5, X1
SUBQ $4, BP
JGE full_simd_loop // There are 4 or more pairs to process
JMP hsum
with_stride:
// Setup long strides
MOVQ AX, CX
MOVQ BX, DX
SALQ $1, CX // CX = 16 * incX
SALQ $1, DX // DX = 16 * incY
// Partially optimized loop
half_simd_loop:
// Load first two pairs
MOVSS (SI), X2
MOVSS (SI)(AX*1), X6
MOVSS (DI), X3
MOVSS (DI)(BX*1), X7
// Convert them to float64 and multiply
UNPCKLPS X6, X2
UNPCKLPS X7, X3
CVTPS2PD X2, X2
CVTPS2PD X3, X3
MULPD X2, X3
// Update data pointers using long strides
ADDQ CX, SI
ADDQ DX, DI
// Load second two pairs
MOVSS (SI), X4
MOVSS (SI)(AX*1), X6
MOVSS (DI), X5
MOVSS (DI)(BX*1), X7
// Convert them to float64 and multiply
UNPCKLPS X6, X4
UNPCKLPS X7, X5
CVTPS2PD X4, X4
CVTPS2PD X5, X5
MULPD X4, X5
// Update data pointers using long strides
ADDQ CX, SI
ADDQ DX, DI
// Accumulate the results of multiplications
ADDPD X3, X0
ADDPD X5, X1
SUBQ $4, BP
JGE half_simd_loop // There are 4 or more pairs to process
hsum:
// Summ intermediate results from SIMD operations
ADDPD X0, X1
// Horizontal sum
MOVHLPS X1, X0
ADDSD X1, X0
rest:
// Undo last SUBQ
ADDQ $4, BP
// Check that are there any value to process
JE end
loop:
// Load one pair
MOVSS (SI), X2
MOVSS (DI), X3
// Convert them to float64 and multiply
CVTSS2SD X2, X2
CVTSS2SD X3, X3
MULSD X3, X2
// Update data pointers
ADDQ AX, SI
ADDQ BX, DI
// Accumulate the results of multiplication
ADDSD X2, X0
DECQ BP
JNE loop
end:
// Add alpha
MOVSS alpha+8(FP), X1
CVTSS2SD X1, X1
ADDSD X1, X0
// Convert result to float32 and return
CVTSD2SS X0, X0
MOVSS X0, r+80(FP)
RET
panic:
CALL ·panicIndex(SB)
RET