Skip to content

Commit

Permalink
Improvements for intel -m32 builds.
Browse files Browse the repository at this point in the history
On this platform _mm256_extract_epi64 isn't defined, but the rest of
AVX2 is, so the AVX2 auto-detection needs to fail there.

Also, we get unaligned accesses in the SSE4 code with tbuf, because the
data alignment differs when pointers are 32-bit rather than 64-bit.
This exposes an underlying problem of using aligned SIMD writes on
tbuf without explicitly asking for alignment. (The new code is also
sometimes a little faster.)

See also samtools/htslib#1500
  • Loading branch information
jkbonfield authored and daviesrob committed Aug 25, 2022
1 parent 70e12b5 commit 843d4f6
Show file tree
Hide file tree
Showing 5 changed files with 10 additions and 5 deletions.
3 changes: 2 additions & 1 deletion configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,8 @@ AX_CHECK_COMPILE_FLAG([-mavx2], [
]],[[
__m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8);
__m256i b = _mm256_add_epi32(a, a);
return *((char *) &b);
long long c = _mm256_extract_epi64(b, 0);
return (int) c;
]])
])
AM_CONDITIONAL([RANS_32x16_AVX2],[test "x$MAVX2" != "x"])
Expand Down
2 changes: 1 addition & 1 deletion htscodecs/rANS_static32x16pr_avx2.c
Original file line number Diff line number Diff line change
Expand Up @@ -1123,7 +1123,7 @@ unsigned char *rans_uncompress_O1_32x16_avx2(unsigned char *in,
union {
unsigned char tbuf[32][32];
uint64_t tbuf64[32][4];
} u;
} u __attribute__((aligned(32)));
unsigned int tidx = 0;

if (0) {
Expand Down
2 changes: 1 addition & 1 deletion htscodecs/rANS_static32x16pr_avx512.c
Original file line number Diff line number Diff line change
Expand Up @@ -735,7 +735,7 @@ unsigned char *rans_uncompress_O1_32x16_avx512(unsigned char *in,
union {
unsigned char tbuf[32][32];
uint64_t tbuf64[32][4];
} u;
} u __attribute__((aligned(32)));
#else
uint32_t tbuf[32][32];
#endif
Expand Down
2 changes: 1 addition & 1 deletion htscodecs/rANS_static32x16pr_sse4.c
Original file line number Diff line number Diff line change
Expand Up @@ -1423,7 +1423,7 @@ unsigned char *rans_uncompress_O1_32x16_sse4(unsigned char *in,
uint16_t *sp = (uint16_t *)ptr;
const uint32_t mask = ((1u << TF_SHIFT_O1_FAST)-1);
__m128i maskv = _mm_set1_epi32(mask); // set mask in all lanes
uint8_t tbuf[32][32];
uint8_t tbuf[32][32] __attribute__((aligned(32)));
int tidx = 0;
LOAD128(Rv, R);
LOAD128(Lv, l);
Expand Down
6 changes: 5 additions & 1 deletion m4/ax_check_compile_flag.m4
Original file line number Diff line number Diff line change
Expand Up @@ -36,13 +36,17 @@

#serial 6

# LOCAL modification: change AC_COMPILE_IFELSE to AC_LINK_IFELSE so the
# _mm256_extract_epi64 test on an x86_64 running under -m32 still
# fails. (Otherwise it compiles, but fails to link.)

AC_DEFUN([AX_CHECK_COMPILE_FLAG],
[AC_PREREQ(2.64)dnl for _AC_LANG_PREFIX and AS_VAR_IF
AS_VAR_PUSHDEF([CACHEVAR],[ax_cv_check_[]_AC_LANG_ABBREV[]flags_$4_$1])dnl
AC_CACHE_CHECK([whether _AC_LANG compiler accepts $1], CACHEVAR, [
ax_check_save_flags=$[]_AC_LANG_PREFIX[]FLAGS
_AC_LANG_PREFIX[]FLAGS="$[]_AC_LANG_PREFIX[]FLAGS $4 $1"
AC_COMPILE_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
AC_LINK_IFELSE([m4_default([$5],[AC_LANG_PROGRAM()])],
[AS_VAR_SET(CACHEVAR,[yes])],
[AS_VAR_SET(CACHEVAR,[no])])
_AC_LANG_PREFIX[]FLAGS=$ax_check_save_flags])
Expand Down

0 comments on commit 843d4f6

Please sign in to comment.