#ifndef TDOKU_SIMD_VECTORS_H
#define TDOKU_SIMD_VECTORS_H
#include "bitutil.h" // assumed project header declaring NumBitsSet64 (used by Popcount below)
#include <cstdint>
#include <cstring>
#include <immintrin.h>
#include <memory>
#include <tuple>
#include <utility>
/*
* We'll support vectors targeting the sse2, sse4_1, avx2, and avx512bitalg instruction sets.
* While avx2 or avx512 is ideal, sse4_1 should deliver solid performance. OTOH, sse2
* performance is seriously handicapped by our heavy reliance on fast ssse3 shuffles,
* for which there is no great sse2 alternative.
*
* sse2 - pentium4 2000
* has most of the instructions we'll use, with exceptions noted below
*
* ssse3 - core2 2006
* _mm_shuffle_epi8 // sse2 alt: kind of a mess. see below.
*
* sse4_1 - penryn 2007
* _mm_testz_si128 // sse2 alt: movemask(cmpeq(setzero())) in sse2
* _mm_blend_epi16 // sse2 alt: &| with masks
* _mm_minpos_epu8
*
* sse4_2 - nehalem 2008
* _mm_cmpgt_epi64
*
* avx2 - haswell 2013
* _mm256 versions of most everything
*
* avx512vl - skylake 2017
* _mm(256)_ternarylogic_epi32
*
* avx512vpopcntdq, avx512bitalg - ice lake 2019
* _mm_popcnt_epi64
* _mm256_popcnt_epi16
*
* September 2022 Steam monthly hardware survey:
* SSE2 100.00%
* SSSE3 99.50%
* SSE4.1 99.24%
* SSE4.2 98.95%
* AVX 95.27%
* AVX2 89.05%
*/
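// These paths are selected purely by compile-time predefines; a build chooses a
// tier with gcc/clang flags such as -mssse3, -msse4.1, -mavx2, or
// -mavx512vl -mavx512bw -mavx512vpopcntdq -mavx512bitalg (one compile per tier,
// with any runtime dispatch between tiers left to the embedding build).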
// For functions like Extract below, where we use switches to determine which immediate to use,
// we'll assume only valid values are passed and omit the default case, thereby allowing the
// compiler's assumption of defined behavior to optimize away a branch.
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wreturn-type"
struct TwoBy64 {
uint64_t x0;
uint64_t x1;
};
struct FourBy64 {
uint64_t x0;
uint64_t x1;
uint64_t x2;
uint64_t x3;
};
namespace {
struct Consts {
__m128i popcount_mask4 = _mm_set1_epi16(0x0f);
__m128i popcount_lookup = _mm_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m128i rotate_rows1 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
};
const Consts consts{};
// Postfix-named specifications for three-operand boolean functions to use with ternarylogic intrinsics.
//
// X 11110000
// Y 11001100
// Z 10101010
// --------
constexpr int OP_X_Y_and_Z_or = 0b11101010;
constexpr int OP_X_Y_andnot_Z_or = 0b10111010;
constexpr int OP_X_Y_or_Z_or = 0b11111110;
constexpr int OP_X_Y_xor_Z_or = 0b10111110;
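// Compile-time sanity check: each constant is the truth table of its boolean
// function evaluated bitwise on the canonical inputs X=0xf0, Y=0xcc, Z=0xaa above.
static_assert(((0xf0 & 0xcc) | 0xaa) == OP_X_Y_and_Z_or, "and_or");
static_assert(((0xf0 & ~0xcc) | 0xaa) == OP_X_Y_andnot_Z_or, "andnot_or");
static_assert((0xf0 | 0xcc | 0xaa) == OP_X_Y_or_Z_or, "or_or");
static_assert(((0xf0 ^ 0xcc) | 0xaa) == OP_X_Y_xor_Z_or, "xor_or");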
} // namespace
struct Bitvec08x16 {
__m128i vec;
Bitvec08x16() : vec{} {}
// non-explicit conversions intended
Bitvec08x16(const __m128i &m128i) noexcept : vec{m128i} {}
Bitvec08x16(const Bitvec08x16 &other) noexcept : vec(other.vec) {}
Bitvec08x16(uint16_t x00, uint16_t x01, uint16_t x02, uint16_t x03,
uint16_t x04, uint16_t x05, uint16_t x06, uint16_t x07) :
vec{_mm_setr_epi16(x00, x01, x02, x03, x04, x05, x06, x07)} {}
static inline Bitvec08x16 All(uint16_t value) {
return _mm_set1_epi16(value);
}
static inline Bitvec08x16
X_Y_and_Z_or(const Bitvec08x16 &x, const Bitvec08x16 &y, const Bitvec08x16 &z) {
#if(defined __AVX512VL__ && defined __AVX512F__)
return _mm_ternarylogic_epi32(x.vec, y.vec, z.vec, OP_X_Y_and_Z_or);
#else
return (x & y) | z;
#endif
}
static inline Bitvec08x16
X_Y_or_Z_or(const Bitvec08x16 &x, const Bitvec08x16 &y, const Bitvec08x16 &z) {
#if(defined __AVX512VL__ && defined __AVX512F__)
return _mm_ternarylogic_epi32(x.vec, y.vec, z.vec, OP_X_Y_or_Z_or);
#else
return x | y | z;
#endif
}
inline Bitvec08x16 &operator=(const __m128i &m128i) {
vec = m128i;
return *this;
}
inline Bitvec08x16 &operator=(const Bitvec08x16 &other) = default;
inline bool operator==(const Bitvec08x16 &other) const {
#if(defined __AVX512VL__ && defined __AVX512BW__)
return _mm_cmp_epi16_mask(vec, other.vec, _MM_CMPINT_EQ) == 0xff;
#else
return (*this ^ other).AllZero();
#endif
}
inline bool operator!=(const Bitvec08x16 &other) const {
return !(*this == other);
}
inline TwoBy64 As_2x64() const {
TwoBy64 out{};
// unaligned store: TwoBy64 is only 8-byte aligned, so an aligned store would be UB
_mm_storeu_si128((__m128i *) &out, vec);
return out;
}
inline Bitvec08x16 WhichEqual(const Bitvec08x16 &other) const {
return _mm_cmpeq_epi16(vec, other.vec);
}
inline Bitvec08x16 WhichNonZero() const {
// a signed compare suffices because the high bit of each 16-bit lane is never
// set for tdoku's 9-bit cell masks
return _mm_cmpgt_epi16(vec, _mm_setzero_si128());
}
inline bool AllZero() const {
#ifdef __SSE4_1__
return _mm_test_all_zeros(vec, vec) != 0;
#else
return _mm_movemask_epi8(WhichEqual(_mm_setzero_si128()).vec) == 0xffff;
#endif
}
inline bool AnyZero() const {
#if(defined __AVX512VL__ && defined __AVX512BW__)
return _mm_cmp_epi16_mask(vec, _mm_setzero_si128(), _MM_CMPINT_EQ) != 0;
#else
Bitvec08x16 which_equal_zero = WhichEqual(_mm_setzero_si128());
return _mm_movemask_epi8(which_equal_zero.vec) != 0;
#endif
}
inline bool AnyLessThan(const Bitvec08x16 &other) const {
#if(defined __AVX512VL__ && defined __AVX512BW__)
return _mm_cmp_epi16_mask(vec, other.vec, _MM_CMPINT_LT) != 0;
#else
Bitvec08x16 which_less_than = _mm_cmpgt_epi16(other.vec, vec);
return _mm_movemask_epi8(which_less_than.vec) != 0;
#endif
}
inline bool Intersects(const Bitvec08x16 &other) const {
#ifdef __SSE4_1__
return !_mm_testz_si128(vec, other.vec);
#else
return !(*this & other).AllZero();
#endif
}
inline bool SubsetOf(const Bitvec08x16 &other) const {
#ifdef __SSE4_1__
return _mm_testc_si128(other.vec, vec);
#else
return Bitvec08x16{_mm_andnot_si128(other.vec, vec)}.AllZero();
#endif
}
inline Bitvec08x16 GetLowBit() const {
#ifdef __SSSE3__
return _mm_and_si128(vec, _mm_sign_epi16(vec, _mm_set1_epi16(-1)));
#else
return _mm_and_si128(vec, _mm_add_epi16(_mm_xor_si128(vec, _mm_set1_epi16(-1)), _mm_set1_epi16(1)));
#endif
}
inline Bitvec08x16 ClearLowBit() const {
// clears the lowest set bit of the full 128-bit value: 'one' gets a 1 in the
// lowest nonzero 64-bit half only, so v & (v - one) applies the usual x & (x - 1)
// trick exactly where it should. the compares against zero are signed, which is
// fine as long as no 16-bit lane has its top bit set (true for 9-bit cell masks).
#ifdef __SSE4_2__
__m128i cmp = _mm_cmpgt_epi64(vec, _mm_setzero_si128());
#else
// sse2 has no 64-bit compare: compare 32-bit halves, then OR each half with its
// neighbor so a 64-bit lane is all-ones if either of its halves is nonzero
__m128i cmp = _mm_cmpgt_epi32(vec, _mm_setzero_si128());
cmp = _mm_or_si128(cmp, _mm_shuffle_epi32(cmp, 0b10110001));
#endif
__m128i one = _mm_andnot_si128(_mm_slli_si128(cmp, 1), _mm_srli_epi64(cmp, 63));
return _mm_and_si128(vec, _mm_sub_epi64(vec, one));
}
// counts the number of bits set among the 9 lowest order bits of each packed 16-bit integer
// subject to the assumption that the 7 high bits are zero. results are undefined if any of
// the 7 high bits are nonzero.
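// e.g. a lane holding 0x01ef = 0b1'1110'1111 counts 4 (low nibble 0xf) + 3 (next
// nibble 0xe) + 1 (the ninth bit, added via vec >> 8) = 8.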
inline Bitvec08x16 Popcounts9() const {
#ifdef __SSSE3__
Bitvec08x16 sum_0_3 = Bitvec08x16{consts.popcount_lookup}.Shuffle(
*this & consts.popcount_mask4);
Bitvec08x16 sum_4_7 = Bitvec08x16{consts.popcount_lookup}.Shuffle(
_mm_srli_epi16(vec, 4));
Bitvec08x16 sum_0_7 = _mm_add_epi16(sum_0_3.vec, sum_4_7.vec);
Bitvec08x16 result = _mm_add_epi16(sum_0_7.vec, _mm_srli_epi16(vec, 8));
return result;
#else
// SSE2 version following https://www.hackersdelight.org/hdcodetxt/pop.c.txt
__m128i mask1 = _mm_set1_epi8(0x77);
__m128i mask2 = _mm_set1_epi8(0x0f);
__m128i mask3 = _mm_set1_epi16(0xff);
__m128i x = vec;
__m128i n = _mm_and_si128(mask1, _mm_srli_epi64(x, 1));
x = _mm_sub_epi8(x, n);
n = _mm_and_si128(mask1, _mm_srli_epi64(n, 1));
x = _mm_sub_epi8(x, n);
n = _mm_and_si128(mask1, _mm_srli_epi64(n, 1));
x = _mm_sub_epi8(x, n);
x = _mm_add_epi8(x, _mm_srli_epi16(x, 4));
x = _mm_and_si128(mask2, x);
x = _mm_add_epi16(_mm_and_si128(x, mask3),
_mm_and_si128(_mm_bsrli_si128(x, 1), mask3));
return x;
#endif
}
inline int Popcount() const {
#if(defined __AVX512VPOPCNTDQ__ && defined __AVX512VL__)
__m128i counts = _mm_popcnt_epi64(vec);
return _mm_cvtsi128_si64(counts) + _mm_cvtsi128_si64(_mm_unpackhi_epi64(counts, counts));
#else
// unpackhi_epi64+cvtsi128_si64 compiles to the same instructions as extract_epi64,
// but works on windows where extract_epi64 is missing.
return NumBitsSet64((uint64_t) _mm_cvtsi128_si64(vec)) +
NumBitsSet64((uint64_t) _mm_cvtsi128_si64(_mm_unpackhi_epi64(vec, vec)));
#endif
}
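// returns, over all lanes whose value is >= min_val, the minimum lane value minus
// min_val in bits 0..15 and the lane index in the bits above, mirroring the
// result layout of _mm_minpos_epu16 (lanes below min_val wrap to large values
// and so never win).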
inline uint32_t MinPosGreaterThanOrEqual(uint16_t min_val) const {
#ifdef __SSE4_1__
return _mm_cvtsi128_si32(_mm_minpos_epu16(_mm_sub_epi16(vec, _mm_set1_epi16(min_val))));
#else
uint32_t min = 0xffff;
uint32_t pos = 0;
uint64_t lo = _mm_cvtsi128_si64(vec);
for (int i = 0; i < 4; i++) {
uint32_t val = ((int32_t)(lo & 0xffffu) - min_val);
if (val < min) {
min = val;
pos = i;
}
lo >>= 16u;
}
uint64_t hi = _mm_cvtsi128_si64(_mm_unpackhi_epi64(vec, vec));
for (int i = 4; i < 8; i++) {
uint32_t val = ((int32_t)(hi & 0xffffu) - min_val);
if (val < min) {
min = val;
pos = i;
}
hi >>= 16u;
}
return (pos << 16u) | min;
#endif
}
inline Bitvec08x16 Shuffle(const Bitvec08x16 &control) const {
#ifdef __SSSE3__
return _mm_shuffle_epi8(vec, control.vec);
#else
// we'll rely on the assumption that all requested shuffles are for epi16s so each
// pair of requested bytes are always adjacent like 0x0302.
__m128i ctrl = control.vec & _mm_set1_epi16(0xf);
// replicate low 16 bits of each epi32 to the high 16
Bitvec08x16 lo16s = vec & _mm_set1_epi32(0x0000ffff);
lo16s |= _mm_bslli_si128(lo16s.vec, 2);
// and vice versa
Bitvec08x16 hi16s = vec & _mm_set1_epi32(0xffff0000);
hi16s |= _mm_bsrli_si128(hi16s.vec, 2);
Bitvec08x16 z{};
z |= _mm_shuffle_epi32(lo16s.vec, 0b00000000) & _mm_cmpeq_epi16(ctrl, _mm_set1_epi16(0x0));
z |= _mm_shuffle_epi32(hi16s.vec, 0b00000000) & _mm_cmpeq_epi16(ctrl, _mm_set1_epi16(0x2));
z |= _mm_shuffle_epi32(lo16s.vec, 0b01010101) & _mm_cmpeq_epi16(ctrl, _mm_set1_epi16(0x4));
z |= _mm_shuffle_epi32(hi16s.vec, 0b01010101) & _mm_cmpeq_epi16(ctrl, _mm_set1_epi16(0x6));
z |= _mm_shuffle_epi32(lo16s.vec, 0b10101010) & _mm_cmpeq_epi16(ctrl, _mm_set1_epi16(0x8));
z |= _mm_shuffle_epi32(hi16s.vec, 0b10101010) & _mm_cmpeq_epi16(ctrl, _mm_set1_epi16(0xa));
z |= _mm_shuffle_epi32(lo16s.vec, 0b11111111) & _mm_cmpeq_epi16(ctrl, _mm_set1_epi16(0xc));
z |= _mm_shuffle_epi32(hi16s.vec, 0b11111111) & _mm_cmpeq_epi16(ctrl, _mm_set1_epi16(0xe));
return z;
#endif
}
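// the Rotate* helpers view the vector as rows of four 16-bit cells: RotateRows
// rotates each row by one cell (lane i takes lane (i+1) mod 4), RotateRows2 by
// two cells, and RotateCols swaps the two rows.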
inline Bitvec08x16 RotateRows() const {
#ifdef __SSSE3__
return Shuffle(consts.rotate_rows1);
#else
__m128i mask1 = _mm_setr_epi16(0xffff, 0xffff, 0xffff, 0x0, 0xffff, 0xffff, 0xffff, 0x0);
__m128i mask2 = _mm_setr_epi16(0x0, 0x0, 0x0, 0xffff, 0x0, 0x0, 0x0, 0xffff);
return _mm_or_si128(
_mm_and_si128(_mm_bsrli_si128(vec, 2), mask1),
_mm_and_si128(_mm_bslli_si128(vec, 6), mask2));
#endif
}
inline Bitvec08x16 RotateRows2() const {
#ifdef __SSSE3__
return _mm_shuffle_epi32(vec, 0b10110001);
#else
__m128i mask1 = _mm_setr_epi16(0xffff, 0xffff, 0x0, 0x0, 0xffff, 0xffff, 0x0, 0x0);
__m128i mask2 = _mm_setr_epi16(0x0, 0x0, 0xffff, 0xffff, 0x0, 0x0, 0xffff, 0xffff);
return _mm_or_si128(
_mm_and_si128(_mm_bsrli_si128(vec, 4), mask1),
_mm_and_si128(_mm_bslli_si128(vec, 4), mask2));
#endif
}
inline Bitvec08x16 RotateCols() const {
return _mm_shuffle_epi32(vec, 0b01001110);
}
inline uint16_t Extract(int index) const {
#define CASE(x) case x: return _mm_extract_epi16(vec, x);
switch (index) {
CASE(0)
CASE(1)
CASE(2)
CASE(3)
CASE(4)
CASE(5)
CASE(6)
CASE(7)
}
#undef CASE
}
inline void Insert(int index, uint16_t value) {
#define CASE(x) case x: vec = _mm_insert_epi16(vec, value, x); break;
switch (index) {
CASE(0)
CASE(1)
CASE(2)
CASE(3)
CASE(4)
CASE(5)
CASE(6)
CASE(7)
}
#undef CASE
}
inline Bitvec08x16 operator|(const Bitvec08x16 &other) const {
return _mm_or_si128(vec, other.vec);
}
inline void operator|=(const Bitvec08x16 &other) {
vec = (*this | other).vec;
}
inline Bitvec08x16 operator^(const Bitvec08x16 &other) const {
return _mm_xor_si128(vec, other.vec);
}
inline void operator^=(const Bitvec08x16 &other) {
vec = (*this ^ other).vec;
}
inline Bitvec08x16 operator&(const Bitvec08x16 &other) const {
return _mm_and_si128(vec, other.vec);
}
inline void operator&=(const Bitvec08x16 &other) {
vec = (*this & other).vec;
}
inline Bitvec08x16 and_not(const Bitvec08x16 &other) const {
return _mm_andnot_si128(other.vec, vec);
}
};
#ifndef __AVX2__
struct Bitvec16x16 {
Bitvec08x16 lo_;
Bitvec08x16 hi_;
Bitvec16x16() noexcept : lo_{}, hi_{} {}
// non-explicit conversions intended
Bitvec16x16(const Bitvec16x16 &other) noexcept = default;
Bitvec16x16(const Bitvec08x16 &lo, const Bitvec08x16 &hi) noexcept : lo_(lo.vec), hi_(hi.vec) {}
Bitvec16x16(uint16_t x00, uint16_t x01, uint16_t x02, uint16_t x03,
uint16_t x04, uint16_t x05, uint16_t x06, uint16_t x07,
uint16_t x08, uint16_t x09, uint16_t x10, uint16_t x11,
uint16_t x12, uint16_t x13, uint16_t x14, uint16_t x15) noexcept :
lo_{_mm_setr_epi16(x00, x01, x02, x03, x04, x05, x06, x07)},
hi_{_mm_setr_epi16(x08, x09, x10, x11, x12, x13, x14, x15)} {}
static inline Bitvec16x16 All(uint16_t value) {
return Bitvec16x16{Bitvec08x16::All(value), Bitvec08x16::All(value)};
}
static inline Bitvec16x16
X_Y_and_Z_or(const Bitvec16x16 &x, const Bitvec16x16 &y, const Bitvec16x16 &z) {
return Bitvec16x16{(x.lo_ & y.lo_) | z.lo_, (x.hi_ & y.hi_) | z.hi_};
}
static inline Bitvec16x16
X_Y_andnot_Z_or(const Bitvec16x16 &x, const Bitvec16x16 &y, const Bitvec16x16 &z) {
return Bitvec16x16{x.lo_.and_not(y.lo_) | z.lo_, x.hi_.and_not(y.hi_) | z.hi_};
}
static inline Bitvec16x16
X_Y_or_Z_or(const Bitvec16x16 &x, const Bitvec16x16 &y, const Bitvec16x16 &z) {
return Bitvec16x16{x.lo_ | y.lo_ | z.lo_, x.hi_ | y.hi_ | z.hi_};
}
static inline Bitvec16x16
X_Y_xor_Z_or(const Bitvec16x16 &x, const Bitvec16x16 &y, const Bitvec16x16 &z) {
return Bitvec16x16{(x.lo_ ^ y.lo_) | z.lo_, (x.hi_ ^ y.hi_) | z.hi_};
}
inline Bitvec16x16 &operator=(const Bitvec16x16 &other) = default;
inline bool operator==(const Bitvec16x16 &other) const {
return lo_ == other.lo_ && hi_ == other.hi_;
}
inline bool operator!=(const Bitvec16x16 &other) const {
return lo_ != other.lo_ || hi_ != other.hi_;
}
inline const Bitvec08x16 GetLo() const {
return lo_;
}
inline const Bitvec08x16 GetHi() const {
return hi_;
}
inline FourBy64 As_4x64() const {
FourBy64 out{};
TwoBy64 lo_2x64 = lo_.As_2x64();
TwoBy64 hi_2x64 = hi_.As_2x64();
memcpy(&out.x0, &lo_2x64, sizeof(lo_2x64));
memcpy(&out.x2, &hi_2x64, sizeof(hi_2x64));
return out;
}
inline Bitvec16x16 WhichEqual(const Bitvec16x16 &other) const {
return Bitvec16x16{lo_.WhichEqual(other.lo_), hi_.WhichEqual(other.hi_)};
}
inline Bitvec16x16 WhichNonZero() const {
return Bitvec16x16{lo_.WhichNonZero(), hi_.WhichNonZero()};
}
inline bool AllZero() const {
return lo_.AllZero() && hi_.AllZero();
}
inline bool AnyZero() const {
return lo_.AnyZero() || hi_.AnyZero();
}
inline bool AnyLessThan(const Bitvec16x16 &other) const {
return lo_.AnyLessThan(other.lo_) || hi_.AnyLessThan(other.hi_);
}
inline bool Intersects(const Bitvec16x16 &other) const {
return lo_.Intersects(other.lo_) || hi_.Intersects(other.hi_);
}
inline bool SubsetOf(const Bitvec16x16 &other) const {
return lo_.SubsetOf(other.lo_) && hi_.SubsetOf(other.hi_);
}
// counts the number of bits set among the 9 lowest order bits of each packed 16-bit integer
// subject to the assumption that the 7 high bits are zero. results are undefined if any of
// the 7 high bits are nonzero.
inline Bitvec16x16 Popcounts9() const {
return Bitvec16x16{lo_.Popcounts9(), hi_.Popcounts9()};
}
inline Bitvec16x16 Shuffle(const Bitvec16x16 &control) const {
return Bitvec16x16{lo_.Shuffle(control.lo_), hi_.Shuffle(control.hi_)};
}
inline Bitvec16x16 RotateRows() const {
return Bitvec16x16{lo_.RotateRows(), hi_.RotateRows()};
}
inline Bitvec16x16 RotateRows2() const {
return Bitvec16x16{lo_.RotateRows2(), hi_.RotateRows2()};
}
inline Bitvec16x16 RotateCols() const {
#ifdef __SSSE3__
return Bitvec16x16{_mm_alignr_epi8(hi_.vec, lo_.vec, 8),
_mm_alignr_epi8(lo_.vec, hi_.vec, 8)};
#else
return Bitvec16x16{_mm_or_si128(_mm_srli_si128(lo_.vec, 8), _mm_slli_si128(hi_.vec, 8)),
_mm_or_si128(_mm_srli_si128(hi_.vec, 8), _mm_slli_si128(lo_.vec, 8))};
#endif
}
inline Bitvec16x16 RotateCols2() const {
return Bitvec16x16{hi_, lo_};
}
inline uint16_t Extract(int index) const {
if (index < 8) {
return lo_.Extract(index);
} else {
return hi_.Extract(index - 8);
}
}
inline void Insert(int index, uint16_t value) {
if (index < 8) {
lo_.Insert(index, value);
} else {
hi_.Insert(index - 8, value);
}
}
inline Bitvec16x16 operator|(const Bitvec16x16 &other) const {
return Bitvec16x16{lo_ | other.lo_, hi_ | other.hi_};
}
inline void operator|=(const Bitvec16x16 &other) {
lo_ |= other.lo_;
hi_ |= other.hi_;
}
inline Bitvec16x16 operator^(const Bitvec16x16 &other) const {
return Bitvec16x16{lo_ ^ other.lo_, hi_ ^ other.hi_};
}
inline void operator^=(const Bitvec16x16 &other) {
lo_ ^= other.lo_;
hi_ ^= other.hi_;
}
inline Bitvec16x16 operator&(const Bitvec16x16 &other) const {
return Bitvec16x16{lo_ & other.lo_, hi_ & other.hi_};
}
inline void operator&=(const Bitvec16x16 &other) {
lo_ &= other.lo_;
hi_ &= other.hi_;
}
inline Bitvec16x16 and_not(const Bitvec16x16 &other) const {
return Bitvec16x16{lo_.and_not(other.lo_), hi_.and_not(other.hi_)};
}
};
#endif //!__AVX2__
#ifdef __AVX2__
#ifdef __GNUC__
#if __GNUC__ < 8
#define _mm256_set_m128i(x, y) _mm256_permute2f128_si256(_mm256_castsi128_si256(x), \
_mm256_castsi128_si256(y), 2)
#endif
#endif
struct Bitvec16x16 {
__m256i vec;
Bitvec16x16() noexcept : vec{} {}
// non-explicit conversions intended
Bitvec16x16(const __m256i &m256i) noexcept : vec{m256i} {}
Bitvec16x16(const Bitvec16x16 &other) noexcept = default;
Bitvec16x16(const Bitvec08x16 &lo, const Bitvec08x16 &hi) noexcept :
vec{_mm256_set_m128i(hi.vec, lo.vec)} {}
Bitvec16x16(uint16_t x00, uint16_t x01, uint16_t x02, uint16_t x03,
uint16_t x04, uint16_t x05, uint16_t x06, uint16_t x07,
uint16_t x08, uint16_t x09, uint16_t x10, uint16_t x11,
uint16_t x12, uint16_t x13, uint16_t x14, uint16_t x15) noexcept :
vec{_mm256_setr_epi16(x00, x01, x02, x03, x04, x05, x06, x07,
x08, x09, x10, x11, x12, x13, x14, x15)} {}
static inline Bitvec16x16 All(uint16_t value) {
return _mm256_set1_epi16(value);
}
static inline Bitvec16x16
X_Y_and_Z_or(const Bitvec16x16 &x, const Bitvec16x16 &y, const Bitvec16x16 &z) {
#if(defined __AVX512VL__ && defined __AVX512F__)
return _mm256_ternarylogic_epi32(x.vec, y.vec, z.vec, OP_X_Y_and_Z_or);
#else
return (x & y) | z;
#endif
}
static inline Bitvec16x16
X_Y_andnot_Z_or(const Bitvec16x16 &x, const Bitvec16x16 &y, const Bitvec16x16 &z) {
#if(defined __AVX512VL__ && defined __AVX512F__)
return _mm256_ternarylogic_epi32(x.vec, y.vec, z.vec, OP_X_Y_andnot_Z_or);
#else
return x.and_not(y) | z;
#endif
}
static inline Bitvec16x16
X_Y_or_Z_or(const Bitvec16x16 &x, const Bitvec16x16 &y, const Bitvec16x16 &z) {
#if(defined __AVX512VL__ && defined __AVX512F__)
return _mm256_ternarylogic_epi32(x.vec, y.vec, z.vec, OP_X_Y_or_Z_or);
#else
return x | y | z;
#endif
}
static inline Bitvec16x16
X_Y_xor_Z_or(const Bitvec16x16 &x, const Bitvec16x16 &y, const Bitvec16x16 &z) {
#if(defined __AVX512VL__ && defined __AVX512F__)
return _mm256_ternarylogic_epi32(x.vec, y.vec, z.vec, OP_X_Y_xor_Z_or);
#else
return (x ^ y) | z;
#endif
}
inline Bitvec16x16 &operator=(const Bitvec16x16 &other) = default;
inline bool operator==(const Bitvec16x16 &other) const {
#if(defined __AVX512VL__ && defined __AVX512BW__)
return _mm256_cmp_epi16_mask(vec, other.vec, _MM_CMPINT_EQ) == 0xffff;
#else
return (*this ^ other).AllZero();
#endif
}
inline bool operator!=(const Bitvec16x16 &other) const {
return !(*this == other);
}
inline Bitvec08x16 GetLo() const {
return _mm256_extracti128_si256(vec, 0);
}
inline Bitvec08x16 GetHi() const {
return _mm256_extracti128_si256(vec, 1);
}
inline FourBy64 As_4x64() const {
FourBy64 out{};
out.x0 = _mm256_extract_epi64(vec, 0);
out.x1 = _mm256_extract_epi64(vec, 1);
out.x2 = _mm256_extract_epi64(vec, 2);
out.x3 = _mm256_extract_epi64(vec, 3);
return out;
}
inline Bitvec16x16 WhichEqual(const Bitvec16x16 &other) const {
return _mm256_cmpeq_epi16(vec, other.vec);
}
inline Bitvec16x16 WhichNonZero() const {
return _mm256_cmpgt_epi16(vec, _mm256_setzero_si256());
}
inline bool AllZero() const {
return _mm256_testz_si256(vec, vec);
}
inline bool AnyZero() const {
#if(defined __AVX512VL__ && defined __AVX512BW__)
return _mm256_cmp_epi16_mask(vec, _mm256_setzero_si256(), _MM_CMPINT_EQ) != 0;
#else
Bitvec16x16 which_equal_zero = WhichEqual(_mm256_setzero_si256());
return _mm256_movemask_epi8(which_equal_zero.vec) != 0;
#endif
}
inline bool AnyLessThan(const Bitvec16x16 &other) const {
#if(defined __AVX512VL__ && defined __AVX512BW__)
return _mm256_cmp_epi16_mask(vec, other.vec, _MM_CMPINT_LT) != 0;
#else
Bitvec16x16 which_less_than = _mm256_cmpgt_epi16(other.vec, vec);
return _mm256_movemask_epi8(which_less_than.vec) != 0;
#endif
}
inline bool Intersects(const Bitvec16x16 &other) const {
return !_mm256_testz_si256(vec, other.vec);
}
inline bool SubsetOf(const Bitvec16x16 &other) const {
return _mm256_testc_si256(other.vec, vec);
}
// counts the number of bits set among the 9 lowest order bits of each packed 16-bit integer
// subject to the assumption that the 7 high bits are zero. results are undefined if any of
// the 7 high bits are nonzero.
inline Bitvec16x16 Popcounts9() const {
#if(defined __AVX512BITALG__ && defined __AVX512VL__)
return _mm256_popcnt_epi16(vec);
#else
__m256i lookup = _mm256_setr_epi8(0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4);
__m256i mask4 = _mm256_set1_epi16(0x0f);
__m256i sum_0_3 = _mm256_shuffle_epi8(lookup, _mm256_and_si256(vec, mask4));
__m256i sum_4_7 = _mm256_shuffle_epi8(lookup, _mm256_srli_epi16(vec, 4));
__m256i sum_0_7 = _mm256_add_epi16(sum_0_3, sum_4_7);
return _mm256_add_epi16(sum_0_7, _mm256_srli_epi16(vec, 8));
#endif
}
inline Bitvec16x16 Shuffle(const Bitvec16x16 &control) const {
return _mm256_shuffle_epi8(vec, control.vec);
}
inline Bitvec16x16 RotateRows() const {
#if(defined __AVX512VBMI2__ && defined __AVX512VL__)
// rotate each 64-bit lane left by 48 bits so lane i takes lane (i+1) mod 4,
// matching the byte shuffle below
return _mm256_shldi_epi64(vec, vec, 48);
#else
__m256i shuffle_control =
_mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9,
2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
return _mm256_shuffle_epi8(vec, shuffle_control);
#endif
}
inline Bitvec16x16 RotateRows2() const {
return _mm256_shuffle_epi32(vec, 0b10110001);
}
inline Bitvec16x16 RotateCols() const {
return _mm256_permute4x64_epi64(vec, 0b00111001);
}
inline Bitvec16x16 RotateCols2() const {
return _mm256_permute4x64_epi64(vec, 0b01001110);
}
inline uint16_t Extract(int index) const {
#define CASE(x) case x: return _mm256_extract_epi16(vec, x);
switch (index) {
CASE(0)
CASE(1)
CASE(2)
CASE(3)
CASE(4)
CASE(5)
CASE(6)
CASE(7)
CASE(8)
CASE(9)
CASE(10)
CASE(11)
CASE(12)
CASE(13)
CASE(14)
CASE(15)
}
#undef CASE
}
inline void Insert(int index, uint16_t value) {
#define CASE(x) case x: vec = _mm256_insert_epi16(vec, value, x); break;
switch (index) {
CASE(0)
CASE(1)
CASE(2)
CASE(3)
CASE(4)
CASE(5)
CASE(6)
CASE(7)
CASE(8)
CASE(9)
CASE(10)
CASE(11)
CASE(12)
CASE(13)
CASE(14)
CASE(15)
}
#undef CASE
}
inline Bitvec16x16 operator|(const Bitvec16x16 &other) const {
return _mm256_or_si256(vec, other.vec);
}
inline void operator|=(const Bitvec16x16 &other) {
vec = (*this | other).vec;
}
inline Bitvec16x16 operator^(const Bitvec16x16 &other) const {
return _mm256_xor_si256(vec, other.vec);
}
inline void operator^=(const Bitvec16x16 &other) {
vec = (*this ^ other).vec;
}
inline Bitvec16x16 operator&(const Bitvec16x16 &other) const {
return _mm256_and_si256(vec, other.vec);
}
inline void operator&=(const Bitvec16x16 &other) {
vec = (*this & other).vec;
}
inline Bitvec16x16 and_not(const Bitvec16x16 &other) const {
return _mm256_andnot_si256(other.vec, vec);
}
};
#endif // __AVX2__
inline uint32_t WhichDots16(const char *x) {
const __m128i dots = _mm_set1_epi8('.');
const __m128i src = _mm_loadu_si128((const __m128i *) x);
#if(defined __AVX512VL__ && defined __AVX512BW__)
return ((uint32_t) _mm_cmpeq_epi8_mask(src, dots));
#else
return ((uint32_t) _mm_movemask_epi8(_mm_cmpeq_epi8(src, dots)));
#endif
}
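// e.g. WhichDots16(".1.1.1.1.1.1.1.1") == 0x5555, one bit per '.' position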
inline uint32_t WhichDots32(const char *x) {
#ifndef __AVX2__
return WhichDots16(x) | (WhichDots16(x + 16) << 16u);
#else
const __m256i dots = _mm256_set1_epi8('.');
const __m256i src = _mm256_loadu_si256((const __m256i *) x);
#if(defined __AVX512VL__ && defined __AVX512BW__)
return _mm256_cmpeq_epi8_mask(src, dots);
#else
return ((uint32_t) _mm256_movemask_epi8(_mm256_cmpeq_epi8(src, dots)));
#endif
#endif
}
inline uint64_t WhichDots64(const char *x) {
return (uint64_t) WhichDots32(x) | ((uint64_t) WhichDots32(x + 32) << 32u);
}
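// Usage sketch (illustrative only; not part of the solver's API): cells hold
// 9-bit candidate masks, so peer elimination and cell selection reduce to a few
// vector ops, e.g.
//
//   Bitvec16x16 candidates = Bitvec16x16::All(0x1ff);            // all digits open
//   candidates = candidates.and_not(Bitvec16x16::All(1u << 4));  // eliminate digit 5
//   Bitvec08x16 counts = candidates.GetLo().Popcounts9();        // candidates per cell
//   uint32_t packed = counts.MinPosGreaterThanOrEqual(2);        // fewest-candidate cell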
#pragma GCC diagnostic pop
#endif //TDOKU_SIMD_VECTORS_H