This repository has been archived by the owner on Oct 21, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 25
/
avxdefs.h
132 lines (113 loc) · 4.16 KB
/
avxdefs.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
// Some tools to help using AVX and AVX2
#include <inttypes.h>
//#include <immintrin.h>
// Use these overlays to access the same data in memory as different types
//
// Can they be used to access data in ymm/xmm regs?
// Can they be used in expressions?
// uint64 a;
// area256 v;
// _mm256_load_si256( v.v256, p );
// a = v.v64[0];
// a = v.64[0] + v.v64[1];
typedef union
{
#if defined __AVX__
__m256i v256;
#endif
__m128i v128[ 2];
uint64_t v64 [ 4];
uint32_t v32 [ 8];
uint16_t v16 [16];
uint8_t v8 [32];
} area256;
typedef union
{
__m128i v128;
uint64_t v64[ 2];
uint32_t v32[ 4];
uint16_t v16[ 8];
uint8_t v8 [16];
} area128;
// useful instrinsics to move data between regs
// move 128 from ymm to xmm
// dst = a[127:0] ; imm8 == 0
// dst = a[255:0] ; imm8 == 1
//__m128i _mm256_extracti128_si256(__m256i a, int imm8)
//move 128 from xmm to ymm
// dst = a( a[255:128], b ) ; mask == 0
// dst = a( b, a[127:0] ) ; mask == 1
//__m256i _mm256_inserti128_si256(__m256i a, __m128i b, const int mask);
#if defined __AVX2__
// Rotate bits in 4 uint64
// __m256i mm256_rotr_64( __256i, int )
#define mm256_rotr_64(w,c) _mm256_or_si256(_mm256_srli_epi64(w, c), \
_mm256_slli_epi64(w, 64 - c))
//static inline __m256i mm256_rotr_64 ( __m256i w, int c)
//{
// return _mm256_or_si256( _mm256_srli_epi64( w, c ),
// _mm256_slli_epi64( w, 64 - c ) );
//}
// Rotate uint64 by one uint64
//__m256i mm256_rotl256_1x64( _mm256i, int )
#define mm256_rotl256_1x64(s) _mm256_permute4x64_epi64( s, 0x39 )
#define mm256_rotr256_1x64(s) _mm256_permute4x64_epi64( s, 0x93 )
//static inline __m256i mm256_rotl256_1x64( __m256i s )
//{
// return _mm256_permute4x64_epi64( s, 0x39 );
//}
//static inline __m256i mm256_rotr256_1x64( __m256i s )
//{
// return _mm256_permute4x64_epi64( s, 0x93 );
//}
// swap hi and lo 128 bits in 256 bit vector
// __m256i mm256_swap128( __m256i )
#define mm256_swap128(s) _mm256_permute2f128_si256( s, s, 1 )
//static inline __m256i mm256_swap128( __m256i s )
//{
// return _mm256_permute2f128_si256( s, s, 1 );
//}
#endif
// rotate bits in 2 uint64
// _m128i mm_rotr_64( __m128i, int )
#define mm_rotr_64(w,c) _mm_or_si128(_mm_srli_epi64(w, c), \
_mm_slli_epi64(w, 64 - c))
//static inline __m128i mm_rotr_64( __m128i w, int c )
//{
// _mm_or_si128( _mm_srli_epi64( w, c ),
// _mm_slli_epi64( w, 64 - c ) );
//}
// swap 128 bit source vectors
// void mm128_swap128( __m128i, __m128i )
// macro is better to update two args
#define mm128_swap128(s0, s1) s0 = _mm_xor_si128(s0, s1); \
s1 = _mm_xor_si128(s0, s1); \
s0 = _mm_xor_si128(s0, s1);
// swap upper and lower 64 bits of 128 bit source vector
// __m128i mm128_swap64( __m128 )
#define mm128_swap64(s) _mm_or_si128( _mm_slli_si128( s, 8 ), \
_mm_srli_si128( s, 8 ) )
// rotate 2 128 bit vectors as one 256 vector by 1 uint64, use as equivalent of
// mm256_rotl256_1x64 when avx2 is not available or data is alreeady in __m128i
// format. uses one local
//void mm128_rotl256_1x64( __m128i, __m128i )
#define mm128_rotl256_1x64(s0, s1) do { \
__m128i t; \
s0 = mm128_swap64( s0); \
s1 = mm128_swap64( s1); \
t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
s0 = t; \
} while(0)
#define mm128_rotr256_1x64(s0, s1) do { \
__m128i t; \
s0 = mm128_swap64( s0); \
s1 = mm128_swap64( s1); \
t = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0xffffffffffffffffull,0ull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0ull,0xffffffffffffffffull) ) ); \
s1 = _mm_or_si128( _mm_and_si128( s0, _mm_set_epi64x(0ull,0xffffffffffffffffull) ), \
_mm_and_si128( s1, _mm_set_epi64x(0xffffffffffffffffull,0ull) ) ); \
s0 = t; \
} while(0)