diff --git a/share/cmake/utils/CheckSupportAVX512.cmake b/share/cmake/utils/CheckSupportAVX512.cmake index 3d4f5bedd4..98004b1d02 100644 --- a/share/cmake/utils/CheckSupportAVX512.cmake +++ b/share/cmake/utils/CheckSupportAVX512.cmake @@ -5,7 +5,7 @@ include(CheckCXXSourceCompiles) set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") -if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" +if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")) set(__universal_build 1) set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") @@ -28,6 +28,8 @@ set(AVX512_CODE " int main() { __m512i vec = _mm512_set1_epi32(42); + // gcc <= 6 appear to be missing this intrinsic + __mmask16 k = _mm512_int2mask(42); return 0; } ") diff --git a/share/cmake/utils/CheckSupportX86SIMD.cmake b/share/cmake/utils/CheckSupportX86SIMD.cmake index 3d4b4f19ba..f709efbb5e 100644 --- a/share/cmake/utils/CheckSupportX86SIMD.cmake +++ b/share/cmake/utils/CheckSupportX86SIMD.cmake @@ -31,6 +31,10 @@ if(MSVC) if (COMPILER_SUPPORTS_AVX2) set(OCIO_AVX2_ARGS "/arch:AVX2") endif() + + if (COMPILER_SUPPORTS_AVX512) + set(OCIO_AVX512_ARGS "/arch:AVX512") + endif() else() if (COMPILER_SUPPORTS_SSE2) set(OCIO_SSE2_ARGS "-msse2") @@ -42,6 +46,10 @@ else() if (COMPILER_SUPPORTS_AVX2) set(OCIO_AVX2_ARGS "-mavx2" "-mfma") + endif() + + if (COMPILER_SUPPORTS_AVX512) + set(OCIO_AVX512_ARGS "-mavx512f") endif() endif() diff --git a/src/OpenColorIO/AVX512.h b/src/OpenColorIO/AVX512.h new file mode 100644 index 0000000000..1ffef09f7e --- /dev/null +++ b/src/OpenColorIO/AVX512.h @@ -0,0 +1,466 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright Contributors to the OpenColorIO Project. + + +#ifndef INCLUDED_OCIO_AVX512_H +#define INCLUDED_OCIO_AVX512_H + +#include "CPUInfo.h" +#if OCIO_USE_AVX512 + +#include + +#include +#include "BitDepthUtils.h" + +// Macros for alignment declarations +#define AVX512_SIMD_BYTES 64 +#define AVX512_ALIGN(decl) alignas(AVX512_SIMD_BYTES) decl + +namespace OCIO_NAMESPACE +{ + +inline __m512 av512_clamp(__m512 value, const __m512& maxValue) +{ + value = _mm512_max_ps(value, _mm512_setzero_ps()); + return _mm512_min_ps(value, maxValue); +} + +inline __m512 avx512_movelh_ps(__m512 a, __m512 b) +{ + return _mm512_castpd_ps(_mm512_unpacklo_pd(_mm512_castps_pd(a), _mm512_castps_pd(b))); +} + +inline __m512 avx512_movehl_ps(__m512 a, __m512 b) +{ + // NOTE: this is a and b are reversed to match sse2 movhlps which is different than unpckhpd + return _mm512_castpd_ps(_mm512_unpackhi_pd(_mm512_castps_pd(b), _mm512_castps_pd(a))); +} + + +inline void avx512RGBATranspose_4x4_4x4_4x4_4x4(__m512 row0, __m512 row1, __m512 row2, __m512 row3, + __m512 &out_r, __m512 &out_g, __m512 &out_b, __m512 &out_a ) +{ + // the rgba transpose result will look this + // + // 0 1 2 3 | 4 5 6 7 8 9 10 11 12 13 14 15 + // r0, g0, b0, a0 | r1, g1, b1, a1 | r2, g2, b2, a2 | r3, g3, b3, a3 + // r4, g4, b4, a4 | r5, g5, b5, a5 | r6, g6, b6, a6 | r7, g7, b7, a7 + // r8 g8, b8, a8 | r9, g9, b9, a9 | r10, g10, b10, a10 | r11, g11, b11, a11 + // r12, g12, b12, a12 | r13, g13, b13, a13 | r14, g14, b14, a14 | r15, g15, b15, a15 + // | | | + // | | | | | | | + // V | V | V | V + // | | | + // r0, r4, r8, r12 | r1, r5, r9, r13 | r2, r6, r10, r14 | r3, r7, r11, r15 + // g0, g4, g8, g12 | g1, g5, g9, g13 | g2, g6, g10, g14 | g3, g7, g11, g15 + // b0, b4, b9, b12 | b1, b5, b9, b13 | b2, b6, b10, b14 | b3, b7, b11, b15 + // a0, a4, a8, a12 | a1, a5, a9, a13 | a2, a6, a10, a14 | a3, a7, a11, a15 + + + // each 128 lane is transposed independently, + // the channel values end up with a even/odd shuffled order because of this. + // if exact order is important more cross lane shuffling is needed + + __m512 tmp0 = _mm512_unpacklo_ps(row0, row1); + __m512 tmp2 = _mm512_unpacklo_ps(row2, row3); + __m512 tmp1 = _mm512_unpackhi_ps(row0, row1); + __m512 tmp3 = _mm512_unpackhi_ps(row2, row3); + + out_r = avx512_movelh_ps(tmp0, tmp2); + out_g = avx512_movehl_ps(tmp2, tmp0); + out_b = avx512_movelh_ps(tmp1, tmp3); + out_a = avx512_movehl_ps(tmp3, tmp1); + +} + + +// Note Packing functions perform no 0.0 - 1.0 normalization +// but perform 0 - max value clamping for integer formats +template struct AVX512RGBAPack {}; + +template <> +struct AVX512RGBAPack +{ + static inline void Load(const uint8_t *in, __m512& r, __m512& g, __m512& b, __m512& a) + { + __m512i rgba = _mm512_loadu_si512((const __m512i*)in); + + __m512 rgba0 = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm512_castsi512_si128(rgba))); + __m512 rgba1 = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(rgba, 1))); + __m512 rgba2 = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(rgba, 2))); + __m512 rgba3 = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(rgba, 3))); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a); + } + + static inline void LoadMasked(const uint8_t *in, __m512& r, __m512& g, __m512& b, __m512& a, uint32_t pixel_count) + { + __mmask16 k; + uint16_t mask = 0; + for (uint32_t i = 0; i < pixel_count; i++) { + mask = (mask << 1) | 1; + } + + k = _mm512_int2mask(mask); + __m512i rgba = _mm512_maskz_loadu_epi32(k, (const __m512i*)in); + + __m512 rgba0 = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm512_castsi512_si128(rgba))); + __m512 rgba1 = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(rgba, 1))); + __m512 rgba2 = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(rgba, 2))); + __m512 rgba3 = _mm512_cvtepi32_ps(_mm512_cvtepu8_epi32(_mm512_extracti32x4_epi32(rgba, 3))); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a); + } + + static inline void Store(uint8_t *out, __m512 r, __m512 g, __m512 b, __m512 a) + { + const __m512 maxValue = _mm512_set1_ps(255.0f); + __m512 rgba0, rgba1,rgba2, rgba3; + + r = av512_clamp(r, maxValue); + g = av512_clamp(g, maxValue); + b = av512_clamp(b, maxValue); + a = av512_clamp(a, maxValue); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); + + __mmask16 k = _mm512_int2mask(0xFFFF); + _mm512_mask_cvtepi32_storeu_epi8(out+0, k, _mm512_cvtps_epi32(rgba0)); + _mm512_mask_cvtepi32_storeu_epi8(out+16, k, _mm512_cvtps_epi32(rgba1)); + _mm512_mask_cvtepi32_storeu_epi8(out+32, k, _mm512_cvtps_epi32(rgba2)); + _mm512_mask_cvtepi32_storeu_epi8(out+48, k, _mm512_cvtps_epi32(rgba3)); + } + + static inline void StoreMasked(uint8_t *out, __m512 r, __m512 g, __m512 b, __m512 a, uint32_t pixel_count) + { + const __m512 maxValue = _mm512_set1_ps(255.0f); + __m512 rgba0, rgba1,rgba2, rgba3; + + __mmask16 k; + uint64_t mask = 0; + for (uint32_t i = 0; i < pixel_count; i++) { + mask = (mask << 4) | 0b1111; + } + + r = av512_clamp(r, maxValue); + g = av512_clamp(g, maxValue); + b = av512_clamp(b, maxValue); + a = av512_clamp(a, maxValue); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); + + k = _mm512_int2mask((mask >> 0) & 0xFFFF); + _mm512_mask_cvtepi32_storeu_epi8(out+0, k, _mm512_cvtps_epi32(rgba0)); + k = _mm512_int2mask((mask >> 16) & 0xFFFF); + _mm512_mask_cvtepi32_storeu_epi8(out+16, k, _mm512_cvtps_epi32(rgba1)); + k = _mm512_int2mask((mask >> 32) & 0xFFFF); + _mm512_mask_cvtepi32_storeu_epi8(out+32, k, _mm512_cvtps_epi32(rgba2)); + k = _mm512_int2mask((mask >> 48) & 0xFFFF); + _mm512_mask_cvtepi32_storeu_epi8(out+48, k, _mm512_cvtps_epi32(rgba3)); + } +}; + +template +struct AVX512RGBAPack16 +{ + typedef typename BitDepthInfo::Type Type; + + static inline void Load(const Type *in, __m512& r, __m512& g, __m512& b, __m512& a) + { + __m512i rgba_00_07 = _mm512_loadu_si512((const __m512i*)(in + 0)); + __m512i rgba_08_15 = _mm512_loadu_si512((const __m512i*)(in + 32)); + + __m512 rgba0 = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(rgba_00_07))); + __m512 rgba1 = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64 (rgba_00_07, 1))); + __m512 rgba2 = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(rgba_08_15))); + __m512 rgba3 = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64 (rgba_08_15, 1))); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a); + + } + + static inline void LoadMasked(const Type *in, __m512& r, __m512& g, __m512& b, __m512& a, uint32_t pixel_count) + { + __mmask16 k; + uint32_t mask = 0; + for (uint32_t i = 0; i < pixel_count; i++) { + mask = (mask << 2) | 0b11; + } + + k = _mm512_int2mask((mask >> 0) & 0xFFFF); + __m512i rgba_00_07 = _mm512_maskz_loadu_epi32(k, (const __m512i*)(in + 0)); + k = _mm512_int2mask((mask >> 16) & 0xFFFF); + __m512i rgba_08_15 = _mm512_maskz_loadu_epi32(k, (const __m512i*)(in + 32)); + + __m512 rgba0 = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(rgba_00_07))); + __m512 rgba1 = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64 (rgba_00_07, 1))); + __m512 rgba2 = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm512_castsi512_si256(rgba_08_15))); + __m512 rgba3 = _mm512_cvtepi32_ps(_mm512_cvtepu16_epi32(_mm512_extracti64x4_epi64 (rgba_08_15, 1))); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a); + + } + + static inline void Store(Type *out, __m512 r, __m512 g, __m512 b, __m512 a) + { + const __m512 maxValue = _mm512_set1_ps((float)BitDepthInfo::maxValue); + __m512 rgba0, rgba1,rgba2, rgba3; + + r = av512_clamp(r, maxValue); + g = av512_clamp(g, maxValue); + b = av512_clamp(b, maxValue); + a = av512_clamp(a, maxValue); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); + + __mmask16 k = _mm512_int2mask(0xFFFF); + _mm512_mask_cvtepi32_storeu_epi16(out+0, k, _mm512_cvtps_epi32(rgba0)); + _mm512_mask_cvtepi32_storeu_epi16(out+16, k, _mm512_cvtps_epi32(rgba1)); + _mm512_mask_cvtepi32_storeu_epi16(out+32, k, _mm512_cvtps_epi32(rgba2)); + _mm512_mask_cvtepi32_storeu_epi16(out+48, k, _mm512_cvtps_epi32(rgba3)); + + } + + static inline void StoreMasked(Type *out, __m512 r, __m512 g, __m512 b, __m512 a, uint32_t pixel_count) + { + const __m512 maxValue = _mm512_set1_ps((float)BitDepthInfo::maxValue); + __m512 rgba0, rgba1,rgba2, rgba3; + + __mmask16 k; + uint64_t mask = 0; + for (uint32_t i = 0; i < pixel_count; i++) { + mask = (mask << 4) | 0b1111; + } + + r = av512_clamp(r, maxValue); + g = av512_clamp(g, maxValue); + b = av512_clamp(b, maxValue); + a = av512_clamp(a, maxValue); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); + + k = _mm512_int2mask((mask >> 0) & 0xFFFF); + _mm512_mask_cvtepi32_storeu_epi16(out+0, k, _mm512_cvtps_epi32(rgba0)); + k = _mm512_int2mask((mask >> 16) & 0xFFFF); + _mm512_mask_cvtepi32_storeu_epi16(out+16, k, _mm512_cvtps_epi32(rgba1)); + k = _mm512_int2mask((mask >> 32) & 0xFFFF); + _mm512_mask_cvtepi32_storeu_epi16(out+32, k, _mm512_cvtps_epi32(rgba2)); + k = _mm512_int2mask((mask >> 48) & 0xFFFF); + _mm512_mask_cvtepi32_storeu_epi16(out+48, k, _mm512_cvtps_epi32(rgba3)); + + } +}; + +template <> +struct AVX512RGBAPack +{ + static inline void Load(const uint16_t *in, __m512& r, __m512& g, __m512& b, __m512& a) + { + AVX512RGBAPack16::Load(in, r, g, b, a); + } + static inline void LoadMasked(const uint16_t *in, __m512& r, __m512& g, __m512& b, __m512& a, uint32_t pixel_count) + { + AVX512RGBAPack16::LoadMasked(in, r, g, b, a, pixel_count); + } + static inline void Store(uint16_t *out, __m512 r, __m512 g, __m512 b, __m512 a) + { + AVX512RGBAPack16::Store(out, r, g, b, a); + } + static inline void StoreMasked(uint16_t *out, __m512 r, __m512 g, __m512 b, __m512 a, uint32_t pixel_count) + { + AVX512RGBAPack16::StoreMasked(out, r, g, b, a, pixel_count); + } + +}; + +template <> +struct AVX512RGBAPack +{ + static inline void Load(const uint16_t *in, __m512& r, __m512& g, __m512& b, __m512& a) + { + AVX512RGBAPack16::Load(in, r, g, b, a); + } + static inline void LoadMasked(const uint16_t *in, __m512& r, __m512& g, __m512& b, __m512& a, uint32_t pixel_count) + { + AVX512RGBAPack16::LoadMasked(in, r, g, b, a, pixel_count); + } + static inline void Store(uint16_t *out, __m512 r, __m512 g, __m512 b, __m512 a) + { + AVX512RGBAPack16::Store(out, r, g, b, a); + } + static inline void StoreMasked(uint16_t *out, __m512 r, __m512 g, __m512 b, __m512 a, uint32_t pixel_count) + { + AVX512RGBAPack16::StoreMasked(out, r, g, b, a, pixel_count); + } +}; + +template <> +struct AVX512RGBAPack +{ + static inline void Load(const uint16_t *in, __m512& r, __m512& g, __m512& b, __m512& a) + { + AVX512RGBAPack16::Load(in, r, g, b, a); + } + static inline void LoadMasked(const uint16_t *in, __m512& r, __m512& g, __m512& b, __m512& a, uint32_t pixel_count) + { + AVX512RGBAPack16::LoadMasked(in, r, g, b, a, pixel_count); + } + static inline void Store(uint16_t *out, __m512 r, __m512 g, __m512 b, __m512 a) + { + AVX512RGBAPack16::Store(out, r, g, b, a); + } + static inline void StoreMasked(uint16_t *out, __m512 r, __m512 g, __m512 b, __m512 a, uint32_t pixel_count) + { + AVX512RGBAPack16::StoreMasked(out, r, g, b, a, pixel_count); + } +}; + +template <> +struct AVX512RGBAPack +{ + static inline void Load(const half *in, __m512& r, __m512& g, __m512& b, __m512& a) + { + __m512i rgba_00_07 = _mm512_loadu_si512((const __m512i*)(in + 0)); + __m512i rgba_08_15 = _mm512_loadu_si512((const __m512i*)(in + 32)); + + __m512 rgba0 = _mm512_cvtph_ps(_mm512_castsi512_si256(rgba_00_07)); + __m512 rgba1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(rgba_00_07, 1)); + + __m512 rgba2 = _mm512_cvtph_ps(_mm512_castsi512_si256(rgba_08_15)); + __m512 rgba3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(rgba_08_15, 1)); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a); + + } + + static inline void LoadMasked(const half *in, __m512& r, __m512& g, __m512& b, __m512& a, uint32_t pixel_count) + { + __mmask16 k; + uint32_t mask = 0; + for (uint32_t i = 0; i < pixel_count; i++) { + mask = (mask << 2) | 0b11; + } + + k = _mm512_int2mask((mask >> 0) & 0xFFFF); + __m512i rgba_00_07 = _mm512_maskz_loadu_epi32(k, (const __m512i*)(in + 0)); + k = _mm512_int2mask((mask >> 16) & 0xFFFF); + __m512i rgba_08_15 = _mm512_maskz_loadu_epi32(k, (const __m512i*)(in + 32)); + + __m512 rgba0 = _mm512_cvtph_ps(_mm512_castsi512_si256(rgba_00_07)); + __m512 rgba1 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(rgba_00_07, 1)); + + __m512 rgba2 = _mm512_cvtph_ps(_mm512_castsi512_si256(rgba_08_15)); + __m512 rgba3 = _mm512_cvtph_ps(_mm512_extracti64x4_epi64(rgba_08_15, 1)); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a); + + } + + static inline void Store(half *out, __m512 r, __m512 g, __m512 b, __m512 a) + { + __m512 rgba0, rgba1,rgba2, rgba3; + + avx512RGBATranspose_4x4_4x4_4x4_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); + + __m512i rgba0i = _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtps_ph(rgba0, 0)), _mm512_cvtps_ph(rgba1, 0), 1); + __m512i rgba1i = _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtps_ph(rgba2, 0)), _mm512_cvtps_ph(rgba3, 0), 1); + + _mm512_storeu_si512((__m512i*)(out + 0), rgba0i); + _mm512_storeu_si512((__m512i*)(out + 32), rgba1i); + } + + static inline void StoreMasked(half *out, __m512 r, __m512 g, __m512 b, __m512 a, uint32_t pixel_count) + { + __m512 rgba0, rgba1,rgba2, rgba3; + + __mmask16 k; + uint64_t mask = 0; + for (uint32_t i = 0; i < pixel_count; i++) { + mask = (mask << 2) | 0b11; + } + + avx512RGBATranspose_4x4_4x4_4x4_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); + + __m512i rgba0i = _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtps_ph(rgba0, 0)), _mm512_cvtps_ph(rgba1, 0), 1); + __m512i rgba1i = _mm512_inserti64x4(_mm512_castsi256_si512(_mm512_cvtps_ph(rgba2, 0)), _mm512_cvtps_ph(rgba3, 0), 1); + + k = _mm512_int2mask((mask >> 0) & 0xFFFF); + _mm512_mask_storeu_epi32((__m512i*)(out + 0), k, rgba0i); + k = _mm512_int2mask((mask >> 16) & 0xFFFF); + _mm512_mask_storeu_epi32((__m512i*)(out + 32), k, rgba1i); + } +}; + +template <> +struct AVX512RGBAPack +{ + static inline void Load(const float *in, __m512& r, __m512& g, __m512& b, __m512& a) + { + __m512 rgba0 = _mm512_loadu_ps(in + 0); + __m512 rgba1 = _mm512_loadu_ps(in + 16); + __m512 rgba2 = _mm512_loadu_ps(in + 32); + __m512 rgba3 = _mm512_loadu_ps(in + 48); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a); + } + + static inline void LoadMasked(const float *in, __m512& r, __m512& g, __m512& b, __m512& a, uint32_t pixel_count) + { + __mmask16 k; + uint64_t mask = 0; + for (uint32_t i = 0; i < pixel_count; i++) { + mask = (mask << 4) | 0b1111; + } + + k = _mm512_int2mask((mask >> 0) & 0xFFFF); + __m512 rgba0 = _mm512_maskz_loadu_ps(k, in + 0); + k = _mm512_int2mask((mask >> 16) & 0xFFFF); + __m512 rgba1 = _mm512_maskz_loadu_ps(k, in + 16); + k = _mm512_int2mask((mask >> 32) & 0xFFFF); + __m512 rgba2 = _mm512_maskz_loadu_ps(k, in + 32); + k = _mm512_int2mask((mask >> 48) & 0xFFFF); + __m512 rgba3 = _mm512_maskz_loadu_ps(k, in + 48); + + avx512RGBATranspose_4x4_4x4_4x4_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a); + } + + static inline void Store(float *out, __m512 r, __m512 g, __m512 b, __m512 a) + { + __m512 rgba0, rgba1,rgba2, rgba3; + + avx512RGBATranspose_4x4_4x4_4x4_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); + + _mm512_storeu_ps((__m512*)(out+0), rgba0); + _mm512_storeu_ps((__m512*)(out+16), rgba1); + _mm512_storeu_ps((__m512*)(out+32), rgba2); + _mm512_storeu_ps((__m512*)(out+48), rgba3); + } + + static inline void StoreMasked(float *out, __m512 r, __m512 g, __m512 b, __m512 a, uint32_t pixel_count) + { + __m512 rgba0, rgba1,rgba2, rgba3; + + __mmask16 k; + uint64_t mask = 0; + for (uint32_t i = 0; i < pixel_count; i++) { + mask = (mask << 4) | 0b1111; + } + + avx512RGBATranspose_4x4_4x4_4x4_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); + + k = _mm512_int2mask((mask >> 0) & 0xFFFF); + _mm512_mask_storeu_ps((__m512*)(out+0), k, rgba0); + k = _mm512_int2mask((mask >> 16) & 0xFFFF); + _mm512_mask_storeu_ps((__m512*)(out+16), k, rgba1); + k = _mm512_int2mask((mask >> 32) & 0xFFFF); + _mm512_mask_storeu_ps((__m512*)(out+32), k, rgba2); + k = _mm512_int2mask((mask >> 48) & 0xFFFF); + _mm512_mask_storeu_ps((__m512*)(out+48), k, rgba3); + } +}; + +} // namespace OCIO_NAMESPACE + +#endif // OCIO_USE_AVX512 +#endif // INCLUDED_OCIO_AVX512_H \ No newline at end of file diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt index 4c6db157eb..0b4f9ee292 100755 --- a/src/OpenColorIO/CMakeLists.txt +++ b/src/OpenColorIO/CMakeLists.txt @@ -119,6 +119,7 @@ set(SOURCES ops/lut1d/Lut1DOpCPU_SSE2.cpp ops/lut1d/Lut1DOpCPU_AVX.cpp ops/lut1d/Lut1DOpCPU_AVX2.cpp + ops/lut1d/Lut1DOpCPU_AVX512.cpp ops/lut1d/Lut1DOpData.cpp ops/lut1d/Lut1DOpGPU.cpp ops/lut3d/Lut3DOp.cpp @@ -126,6 +127,7 @@ set(SOURCES ops/lut3d/Lut3DOpCPU_SSE2.cpp ops/lut3d/Lut3DOpCPU_AVX.cpp ops/lut3d/Lut3DOpCPU_AVX2.cpp + ops/lut3d/Lut3DOpCPU_AVX512.cpp ops/lut3d/Lut3DOpData.cpp ops/lut3d/Lut3DOpGPU.cpp ops/matrix/MatrixOpCPU.cpp @@ -212,9 +214,11 @@ if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_USE_SSE2NEON)) set_property(SOURCE ops/lut1d/Lut1DOpCPU_SSE2.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_SSE2_ARGS}) set_property(SOURCE ops/lut1d/Lut1DOpCPU_AVX.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS}) set_property(SOURCE ops/lut1d/Lut1DOpCPU_AVX2.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS}) + set_property(SOURCE ops/lut1d/Lut1DOpCPU_AVX512.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX512_ARGS}) set_property(SOURCE ops/lut3d/Lut3DOpCPU_SSE2.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_SSE2_ARGS}) set_property(SOURCE ops/lut3d/Lut3DOpCPU_AVX.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS}) set_property(SOURCE ops/lut3d/Lut3DOpCPU_AVX2.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS}) + set_property(SOURCE ops/lut3d/Lut3DOpCPU_AVX512.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX512_ARGS}) endif() configure_file(CPUInfoConfig.h.in CPUInfoConfig.h) diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp index 7dfaab8daf..6c618b9fd2 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp @@ -19,6 +19,7 @@ #include "Lut1DOpCPU_SSE2.h" #include "Lut1DOpCPU_AVX.h" #include "Lut1DOpCPU_AVX2.h" +#include "Lut1DOpCPU_AVX512.h" #define L_ADJUST(val) \ @@ -298,6 +299,13 @@ BaseLut1DRenderer::BaseLut1DRenderer(ConstLut1DOpDataRcPtr & lut) m_applyLutFunc = AVX2GetLut1DApplyFunc(inBD, outBD); } #endif + +#if OCIO_USE_AVX512 + if (CPUInfo::instance().hasAVX512()) + { + m_applyLutFunc = AVX512GetLut1DApplyFunc(inBD, outBD); + } +#endif } template @@ -641,7 +649,7 @@ void Lut1DRenderer::apply(const void * inImg, void * outImg, long n out += 4; } } - else if (this->m_applyLutFunc) + else if (this->m_applyLutFunc && numPixels > 1) { const float * lutR = (const float *)this->m_tmpLutR; const float * lutG = (const float *)this->m_tmpLutG; diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX512.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX512.cpp new file mode 100644 index 0000000000..7ca2dbd054 --- /dev/null +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX512.cpp @@ -0,0 +1,150 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright Contributors to the OpenColorIO Project. + +#include "Lut1DOpCPU_AVX512.h" +#if OCIO_USE_AVX512 + +#include +#include + +#include "AVX512.h" + +namespace OCIO_NAMESPACE +{ + +namespace { + + +static inline __m512 apply_lut_avx512(const float *lut, __m512 v, const __m512& scale, const __m512& lut_max) +{ + __m512 zero = _mm512_setzero_ps(); + __m512 one_f = _mm512_set1_ps(1); + + __m512 scaled = _mm512_mul_ps(v, scale); + + // clamp, max first, NAN set to zero + __m512 x = _mm512_min_ps(_mm512_max_ps(scaled, zero), lut_max); + __m512 prev_f = _mm512_floor_ps(x); + __m512 d = _mm512_sub_ps(x, prev_f); + __m512 next_f = _mm512_min_ps(_mm512_add_ps(prev_f, one_f), lut_max); + + __m512i prev_i = _mm512_cvttps_epi32(prev_f); + __m512i next_i = _mm512_cvttps_epi32(next_f); + + __m512 p = _mm512_i32gather_ps(prev_i, lut, sizeof(float)); + __m512 n = _mm512_i32gather_ps(next_i, lut, sizeof(float)); + + // lerp: a + (b - a) * t; + v = _mm512_fmadd_ps(_mm512_sub_ps(n, p), d, p); + + return v; +} + +template +static inline void linear1D(const float *lutR, const float *lutG,const float *lutB, int dim, const void *inImg, void *outImg, long numPixels) +{ + + typedef typename BitDepthInfo::Type InType; + typedef typename BitDepthInfo::Type OutType; + + const InType *src = (const InType*)inImg; + OutType *dst = (OutType*)outImg; + __m512 r,g,b,a, alpha_scale; + + float rgb_scale = 1.0f / (float)BitDepthInfo::maxValue * ((float)dim -1); + const __m512 lut_scale = _mm512_set1_ps(rgb_scale); + const __m512 lut_max = _mm512_set1_ps((float)dim -1); + + if (inBD != outBD) + alpha_scale = _mm512_set1_ps((float)BitDepthInfo::maxValue / (float)BitDepthInfo::maxValue); + + int pixel_count = numPixels / 16 * 16; + int remainder = numPixels - pixel_count; + + for (int i = 0; i < pixel_count; i += 16 ) { + AVX512RGBAPack::Load(src, r, g, b, a); + + r = apply_lut_avx512(lutR, r, lut_scale, lut_max); + g = apply_lut_avx512(lutG, g, lut_scale, lut_max); + b = apply_lut_avx512(lutB, b, lut_scale, lut_max); + + if (inBD != outBD) + a = _mm512_mul_ps(a, alpha_scale); + + AVX512RGBAPack::Store(dst, r, g, b, a); + + src += 64; + dst += 64; + } + + // handler leftovers pixels + if (remainder) { + AVX512RGBAPack::LoadMasked(src, r, g, b, a, remainder); + + r = apply_lut_avx512(lutR, r, lut_scale, lut_max); + g = apply_lut_avx512(lutG, g, lut_scale, lut_max); + b = apply_lut_avx512(lutB, b, lut_scale, lut_max); + + if (inBD != outBD) + a = _mm512_mul_ps(a, alpha_scale); + + AVX512RGBAPack::StoreMasked(dst, r, g, b, a, remainder); + } +} + +template +inline Lut1DOpCPUApplyFunc * GetConvertInBitDepth(BitDepth outBD) +{ + switch(outBD) + { + case BIT_DEPTH_UINT8: + return linear1D; + case BIT_DEPTH_UINT10: + return linear1D; + case BIT_DEPTH_UINT12: + return linear1D; + case BIT_DEPTH_UINT16: + return linear1D; + case BIT_DEPTH_F16: + return linear1D; + case BIT_DEPTH_F32: + return linear1D; + case BIT_DEPTH_UINT14: + case BIT_DEPTH_UINT32: + case BIT_DEPTH_UNKNOWN: + default: + break; + } + + return nullptr; +} + +} // anonymous namespace + +Lut1DOpCPUApplyFunc * AVX512GetLut1DApplyFunc(BitDepth inBD, BitDepth outBD) +{ + + // Lut1DOp only uses interpolation for in float in formats + switch(inBD) + { + case BIT_DEPTH_UINT8: + case BIT_DEPTH_UINT10: + case BIT_DEPTH_UINT12: + case BIT_DEPTH_UINT16: + case BIT_DEPTH_F16: + break; + case BIT_DEPTH_F32: + return GetConvertInBitDepth(outBD); + case BIT_DEPTH_UINT14: + case BIT_DEPTH_UINT32: + case BIT_DEPTH_UNKNOWN: + default: + break; + } + + return nullptr; +} + +} // OCIO_NAMESPACE + +#endif // OCIO_USE_AVX512 \ No newline at end of file diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX512.h b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX512.h new file mode 100644 index 0000000000..2120d7354e --- /dev/null +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX512.h @@ -0,0 +1,23 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright Contributors to the OpenColorIO Project. + +#ifndef INCLUDED_OCIO_LUT1DOP_CPU_AVX512_H +#define INCLUDED_OCIO_LUT1DOP_CPU_AVX512_H + +#include + +#include "CPUInfo.h" + +typedef void (Lut1DOpCPUApplyFunc)(const float *, const float *, const float *, int, const void *, void *, long); + +#if OCIO_USE_AVX512 +namespace OCIO_NAMESPACE +{ + +Lut1DOpCPUApplyFunc * AVX512GetLut1DApplyFunc(BitDepth inBD, BitDepth outBD); + +} // namespace OCIO_NAMESPACE + +#endif // OCIO_USE_AVX512 + +#endif /* INCLUDED_OCIO_LUT1DOP_CPU_AVX512_H */ \ No newline at end of file diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp index cab9f648ba..d7ea8483a0 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp @@ -18,6 +18,7 @@ #include "Lut3DOpCPU_SSE2.h" #include "Lut3DOpCPU_AVX.h" #include "Lut3DOpCPU_AVX2.h" +#include "Lut3DOpCPU_AVX512.h" namespace OCIO_NAMESPACE { @@ -405,6 +406,13 @@ Lut3DTetrahedralRenderer::Lut3DTetrahedralRenderer(ConstLut3DOpDataRcPtr & lut) m_applyLutFunc = applyTetrahedralAVX2; } #endif + + #if OCIO_USE_AVX512 + if (CPUInfo::instance().hasAVX512()) + { + m_applyLutFunc = applyTetrahedralAVX512; + } + #endif } Lut3DTetrahedralRenderer::~Lut3DTetrahedralRenderer() @@ -416,7 +424,7 @@ void Lut3DTetrahedralRenderer::apply(const void * inImg, void * outImg, long num const float * in = (const float *)inImg; float * out = (float *)outImg; - if (m_applyLutFunc) + if (m_applyLutFunc && numPixels > 1) { m_applyLutFunc(m_optLut, m_dim, in, out, numPixels); } diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX512.cpp b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX512.cpp new file mode 100644 index 0000000000..b659dad4a2 --- /dev/null +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX512.cpp @@ -0,0 +1,259 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright Contributors to the OpenColorIO Project. + +#include "Lut3DOpCPU_AVX512.h" +#if OCIO_USE_AVX512 + +#include +#include + +#include "AVX512.h" + +namespace OCIO_NAMESPACE +{ +namespace { + +struct Lut3DContextAVX512 { + const float *lut; + __m512 lutmax; + __m512 lutsize; + __m512 lutsize2; +}; + +struct rgbavec_avx512 { + __m512 r, g, b, a; +}; + +#define gather_rgb_avx512(src, idx) \ + sample_r = _mm512_i32gather_ps(idx, (void * )(src+0), 4); \ + sample_g = _mm512_i32gather_ps(idx, (void * )(src+1), 4); \ + sample_b = _mm512_i32gather_ps(idx, (void * )(src+2), 4) + +static inline rgbavec_avx512 interp_tetrahedral_avx512(const Lut3DContextAVX512 &ctx, __m512& r, __m512& g, __m512& b, __m512& a) +{ + __m512 x0, x1, x2; + __m512 cxxxa; + __m512 cxxxb; + __mmask16 mask; + __m512 sample_r, sample_g, sample_b; + + rgbavec_avx512 result; + + __m512 lut_max = ctx.lutmax; + __m512 lutsize = ctx.lutsize; + __m512 lutsize2 = ctx.lutsize2; + + __m512 one_f = _mm512_set1_ps(1.0f); + __m512 four_f = _mm512_set1_ps(4.0f); + + __m512 prev_r = _mm512_floor_ps(r); + __m512 prev_g = _mm512_floor_ps(g); + __m512 prev_b = _mm512_floor_ps(b); + + // rgb delta values + __m512 d_r = _mm512_sub_ps(r, prev_r); + __m512 d_g = _mm512_sub_ps(g, prev_g); + __m512 d_b = _mm512_sub_ps(b, prev_b); + + __m512 next_r = _mm512_min_ps(lut_max, _mm512_add_ps(prev_r, one_f)); + __m512 next_g = _mm512_min_ps(lut_max, _mm512_add_ps(prev_g, one_f)); + __m512 next_b = _mm512_min_ps(lut_max, _mm512_add_ps(prev_b, one_f)); + + // prescale indices + prev_r = _mm512_mul_ps(prev_r, lutsize2); + next_r = _mm512_mul_ps(next_r, lutsize2); + + prev_g = _mm512_mul_ps(prev_g, lutsize); + next_g = _mm512_mul_ps(next_g, lutsize); + + prev_b = _mm512_mul_ps(prev_b, four_f); + next_b = _mm512_mul_ps(next_b, four_f); + + // This is the tetrahedral blend equation + // red = (1-x0) * c000.r + (x0-x1) * cxxxa.r + (x1-x2) * cxxxb.r + x2 * c111.r; + // The x values are the rgb delta values sorted, x0 >= x1 >= x2 + // c### are samples from the lut, which are indices made with prev_(r,g,b) and next_(r,g,b) values + // 0 = use prev, 1 = use next + // c### = (prev_r or next_r) * (lutsize * lutsize) + (prev_g or next_g) * lutsize + (prev_b or next_b) + + // cxxxa + // always uses 1 next and 2 prev and next is largest delta + // r> == c100 == (r>g && r>b) == (!b>r && r>g) + // g> == c010 == (g>r && g>b) == (!r>g && g>b) + // b> == c001 == (b>r && b>g) == (!g>b && b>r) + + // cxxxb + // always uses 2 next and 1 prev and prev is smallest delta + // r< == c011 == (r<=g && r<=b) == (!r>g && b>r) + // g< == c101 == (g<=r && g<=b) == (!g>b && r>g) + // b< == c110 == (b<=r && b<=g) == (!b>r && g>b) + + // c000 and c111 are const (prev,prev,prev) and (next,next,next) + + __mmask16 gt_r = _mm512_cmp_ps_mask(d_r, d_g, _CMP_GT_OQ); // r>g + __mmask16 gt_g = _mm512_cmp_ps_mask(d_g, d_b, _CMP_GT_OQ); // g>b + __mmask16 gt_b = _mm512_cmp_ps_mask(d_b, d_r, _CMP_GT_OQ); // b>r + + // r> !b>r && r>g + mask = _mm512_kandn(gt_b, gt_r); + cxxxa = _mm512_mask_blend_ps(mask, prev_r, next_r); + + // r< !r>g && b>r + mask = _mm512_kandn(gt_r, gt_b); + cxxxb = _mm512_mask_blend_ps(mask, next_r, prev_r); + + // g> !r>g && g>b + mask = _mm512_kandn(gt_r, gt_g); + cxxxa = _mm512_add_ps(cxxxa, _mm512_mask_blend_ps(mask, prev_g, next_g)); + + // g< !g>b && r>g + mask = _mm512_kandn(gt_g, gt_r); + cxxxb = _mm512_add_ps(cxxxb, _mm512_mask_blend_ps(mask, next_g, prev_g)); + + // b> !g>b && b>r + mask = _mm512_kandn(gt_g, gt_b); + cxxxa = _mm512_add_ps(cxxxa, _mm512_mask_blend_ps(mask, prev_b, next_b)); + + // b< !b>r && g>b + mask = _mm512_kandn(gt_b, gt_g); + cxxxb = _mm512_add_ps(cxxxb, _mm512_mask_blend_ps(mask, next_b, prev_b)); + + __m512 c000 = _mm512_add_ps(_mm512_add_ps(prev_r, prev_g), prev_b); + __m512 c111 = _mm512_add_ps(_mm512_add_ps(next_r, next_g), next_b); + + // sort delta r,g,b x0 >= x1 >= x2 + __m512 rg_min = _mm512_min_ps(d_r, d_g); + __m512 rg_max = _mm512_max_ps(d_r, d_g); + + x2 = _mm512_min_ps(rg_min, d_b); + __m512 mid = _mm512_max_ps(rg_min, d_b); + + x0 = _mm512_max_ps(rg_max, d_b); + x1 = _mm512_min_ps(rg_max, mid); + + // convert indices to int + __m512i c000_idx = _mm512_cvttps_epi32(c000); + __m512i cxxxa_idx = _mm512_cvttps_epi32(cxxxa); + __m512i cxxxb_idx = _mm512_cvttps_epi32(cxxxb); + __m512i c111_idx = _mm512_cvttps_epi32(c111); + + gather_rgb_avx512(ctx.lut, c000_idx); + + // (1-x0) * c000 + __m512 v = _mm512_sub_ps(one_f, x0); + result.r = _mm512_mul_ps(sample_r, v); + result.g = _mm512_mul_ps(sample_g, v); + result.b = _mm512_mul_ps(sample_b, v); + + gather_rgb_avx512(ctx.lut, cxxxa_idx); + + // (x0-x1) * cxxxa + v = _mm512_sub_ps(x0, x1); + result.r = _mm512_fmadd_ps(v, sample_r, result.r); + result.g = _mm512_fmadd_ps(v, sample_g, result.g); + result.b = _mm512_fmadd_ps(v, sample_b, result.b); + + gather_rgb_avx512(ctx.lut, cxxxb_idx); + + // (x1-x2) * cxxxb + v = _mm512_sub_ps(x1, x2); + result.r = _mm512_fmadd_ps(v, sample_r, result.r); + result.g = _mm512_fmadd_ps(v, sample_g, result.g); + result.b = _mm512_fmadd_ps(v, sample_b, result.b); + + gather_rgb_avx512(ctx.lut, c111_idx); + + // x2 * c111 + result.r = _mm512_fmadd_ps(x2, sample_r, result.r); + result.g = _mm512_fmadd_ps(x2, sample_g, result.g); + result.b = _mm512_fmadd_ps(x2, sample_b, result.b); + + result.a = a; + + return result; +} + +template +inline void applyTetrahedralAVX512Func(const float *lut3d, int dim, const void *inImg, void *outImg, int numPixels) +{ + typedef typename BitDepthInfo::Type InType; + typedef typename BitDepthInfo::Type OutType; + + const InType * src = (InType *)inImg; + OutType * dst = (OutType *)outImg; + __m512 r,g,b,a; + rgbavec_avx512 c; + + Lut3DContextAVX512 ctx; + + float lutmax = (float)dim - 1; + __m512 scale = _mm512_set1_ps(lutmax); + __m512 zero = _mm512_setzero_ps(); + + ctx.lut = lut3d; + ctx.lutmax = _mm512_set1_ps(lutmax); + ctx.lutsize = _mm512_set1_ps((float)dim * 4); + ctx.lutsize2 = _mm512_set1_ps((float)dim * dim * 4); + + int pixel_count = numPixels / 16 * 16; + int remainder = numPixels - pixel_count; + + for (int i = 0; i < pixel_count; i += 16 ) + { + AVX512RGBAPack::Load(src, r, g, b, a); + + // scale and clamp values + r = _mm512_mul_ps(r, scale); + g = _mm512_mul_ps(g, scale); + b = _mm512_mul_ps(b, scale); + + r = _mm512_max_ps(r, zero); + g = _mm512_max_ps(g, zero); + b = _mm512_max_ps(b, zero); + + r = _mm512_min_ps(r, ctx.lutmax); + g = _mm512_min_ps(g, ctx.lutmax); + b = _mm512_min_ps(b, ctx.lutmax); + + c = interp_tetrahedral_avx512(ctx, r, g, b, a); + + AVX512RGBAPack::Store(dst, c.r, c.g, c.b, c.a); + + src += 64; + dst += 64; + } + + // handler leftovers pixels + if (remainder) + { + AVX512RGBAPack::LoadMasked(src, r, g, b, a, remainder); + + // scale and clamp values + r = _mm512_mul_ps(r, scale); + g = _mm512_mul_ps(g, scale); + b = _mm512_mul_ps(b, scale); + + r = _mm512_max_ps(r, zero); + g = _mm512_max_ps(g, zero); + b = _mm512_max_ps(b, zero); + + r = _mm512_min_ps(r, ctx.lutmax); + g = _mm512_min_ps(g, ctx.lutmax); + b = _mm512_min_ps(b, ctx.lutmax); + + c = interp_tetrahedral_avx512(ctx, r, g, b, a); + + AVX512RGBAPack::StoreMasked(dst, c.r, c.g, c.b, c.a, remainder); + } +} + +} // anonymous namespace + +void applyTetrahedralAVX512(const float *lut3d, int dim, const float *src, float *dst, int total_pixel_count) +{ + applyTetrahedralAVX512Func(lut3d, dim, src, dst, total_pixel_count); +} + +} // OCIO_NAMESPACE + +#endif // OCIO_USE_AVX512 \ No newline at end of file diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX512.h b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX512.h new file mode 100644 index 0000000000..70ff195cda --- /dev/null +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX512.h @@ -0,0 +1,21 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright Contributors to the OpenColorIO Project. + +#ifndef INCLUDED_OCIO_LUT3DOP_CPU_AVX512_H +#define INCLUDED_OCIO_LUT3DOP_CPU_AVX512_H + +#include + +#include "CPUInfo.h" + +#if OCIO_USE_AVX512 +namespace OCIO_NAMESPACE +{ + +void applyTetrahedralAVX512(const float *lut3d, int dim, const float *src, float *dst, int total_pixel_count); + +} // namespace OCIO_NAMESPACE + +#endif // OCIO_USE_AVX512 + +#endif /* INCLUDED_OCIO_LUT3DOP_CPU_AVX512_H */ \ No newline at end of file diff --git a/tests/cpu/AVX2_tests.cpp b/tests/cpu/AVX2_tests.cpp index e711661841..1d8373fc90 100644 --- a/tests/cpu/AVX2_tests.cpp +++ b/tests/cpu/AVX2_tests.cpp @@ -161,8 +161,9 @@ void testConvert_InBitDepth(OCIO::BitDepth outBD) DEFINE_SIMD_TEST(packed_uint8_to_float_test) { - std::vector inImage(256); - std::vector outImage(256); + size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; + std::vector inImage(maxValue); + std::vector outImage(maxValue); for (unsigned i = 0; i < inImage.size(); i++) { @@ -360,7 +361,7 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) OCIO::AVX2RGBAPack::Load(&pixels[0], r, g, b, a); OCIO::AVX2RGBAPack::Store(&outImageU16[0], r, g, b, a); - for (unsigned i = 0; i < outImageU8.size(); i++) + for (unsigned i = 0; i < outImageU16.size(); i++) { OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU10[i], outImageU16[i], 0, false), GetErrorMessage(resultU10[i], outImageU16[i], @@ -379,7 +380,7 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) OCIO::AVX2RGBAPack::Load(&pixels[0], r, g, b, a); OCIO::AVX2RGBAPack::Store(&outImageU16[0], r, g, b, a); - for (unsigned i = 0; i < outImageU8.size(); i++) + for (unsigned i = 0; i < outImageU16.size(); i++) { OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU12[i], outImageU16[i], 0, false), GetErrorMessage(resultU12[i], outImageU16[i], @@ -398,7 +399,7 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) OCIO::AVX2RGBAPack::Load(&pixels[0], r, g, b, a); OCIO::AVX2RGBAPack::Store(&outImageU16[0], r, g, b, a); - for (unsigned i = 0; i < outImageU8.size(); i++) + for (unsigned i = 0; i < outImageU16.size(); i++) { OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU16[i], outImageU16[i], 0, false), GetErrorMessage(resultU16[i], outImageU16[i], diff --git a/tests/cpu/AVX512_tests.cpp b/tests/cpu/AVX512_tests.cpp new file mode 100644 index 0000000000..8b8e6f9cd7 --- /dev/null +++ b/tests/cpu/AVX512_tests.cpp @@ -0,0 +1,515 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright Contributors to the OpenColorIO Project. + + +#include "CPUInfo.h" +#if OCIO_USE_AVX512 + +#include + +#include + +#include +#include "MathUtils.h" +#include "BitDepthUtils.h" +#include "AVX512.h" +#include "testutils/UnitTest.h" + +namespace OCIO = OCIO_NAMESPACE; + +#define DEFINE_SIMD_TEST(name) \ +void avx512_test_##name() + +namespace +{ + +std::string GetFormatName(OCIO::BitDepth BD) +{ + switch(BD) + { + case OCIO::BIT_DEPTH_UINT8: + return "BIT_DEPTH_UINT8"; + case OCIO::BIT_DEPTH_UINT10: + return "BIT_DEPTH_UINT10"; + case OCIO::BIT_DEPTH_UINT12: + return "BIT_DEPTH_UINT12"; + case OCIO::BIT_DEPTH_UINT16: + return "BIT_DEPTH_UINT16"; + case OCIO::BIT_DEPTH_F16: + return "BIT_DEPTH_F16"; + case OCIO::BIT_DEPTH_F32: + return "BIT_DEPTH_F32"; + case OCIO::BIT_DEPTH_UINT14: + case OCIO::BIT_DEPTH_UINT32: + case OCIO::BIT_DEPTH_UNKNOWN: + default: + break; + } + + return "BIT_DEPTH_UNKNOWN"; +} + +std::string GetErrorMessage(float expected, float actual, OCIO::BitDepth inBD, OCIO::BitDepth outBD) +{ + std::ostringstream oss; + oss << "expected: " << expected << " != " << "actual: " << actual << " : " << GetFormatName(inBD) << " -> " << GetFormatName(outBD); + return oss.str(); +} + +template +typename OCIO::BitDepthInfo::Type scale_unsigned(unsigned i) +{ + return i; +} + +template <> +float scale_unsigned(unsigned i) +{ + return static_cast(i) * 1.0f/65535.0f; +} + +template <> +half scale_unsigned(unsigned i) +{ + return static_cast(1.0f/65535.0f * static_cast(i)); +} + +template +void testConvert_OutBitDepth() +{ + typedef typename OCIO::BitDepthInfo::Type InType; + typedef typename OCIO::BitDepthInfo::Type OutType; + + size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; + + if (OCIO::BitDepthInfo::isFloat) + maxValue = 65536; + + std::vector inImage(maxValue); + std::vector outImage(maxValue); + + for (unsigned i = 0; i < maxValue; i++) + { + inImage[i] = scale_unsigned(i); + } + + float scale = (float)OCIO::BitDepthInfo::maxValue / (float)OCIO::BitDepthInfo::maxValue; + __m512 s = _mm512_set1_ps(scale); + + for (unsigned i = 0; i < inImage.size(); i += 64) + { + __m512 r, g, b, a; + OCIO::AVX512RGBAPack::Load(&inImage[i], r, g, b, a); + r = _mm512_mul_ps(r, s); + g = _mm512_mul_ps(g, s); + b = _mm512_mul_ps(b, s); + a = _mm512_mul_ps(a, s); + OCIO::AVX512RGBAPack::Store(&outImage[i], r, g, b, a); + } + for (unsigned i = 0; i < outImage.size(); i++) + { + float v = (float)inImage[i] * scale; + + if (OCIO::BitDepthInfo::isFloat) + v = (OutType)v; // casts to half if format is half + else + v = rintf(v); + + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(v, (float)outImage[i], 0, false), + GetErrorMessage(v, (float)outImage[i], inBD, outBD)); + } + + // Test Load/Store Masked + for (unsigned pixel_count = 0; pixel_count <= 16; pixel_count++) + { + __m512 r, g, b, a; + // reset all values to zero + for (unsigned i = 0; i < outImage.size(); i++) + { + outImage[i] = 0; + } + + OCIO::AVX512RGBAPack::LoadMasked(&inImage[0], r, g, b, a, pixel_count); + r = _mm512_mul_ps(r, s); + g = _mm512_mul_ps(g, s); + b = _mm512_mul_ps(b, s); + a = _mm512_mul_ps(a, s); + OCIO::AVX512RGBAPack::StoreMasked(&outImage[0], r, g, b, a, pixel_count); + + for (unsigned i = 0; i < outImage.size(); i++) + { + float v = (float)inImage[i] * scale; + + // values geater then the pixel count should not have been written to + if (i >= pixel_count*4) + v = 0.0f; + + if (OCIO::BitDepthInfo::isFloat) + v = (OutType)v; // casts to half if format is half + else + v = rintf(v); + + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(v, (float)outImage[i], 0, false), + GetErrorMessage(v, (float)outImage[i], inBD, outBD)); + } + } +} + +template +void testConvert_InBitDepth(OCIO::BitDepth outBD) +{ + switch(outBD) + { + case OCIO::BIT_DEPTH_UINT8: + return testConvert_OutBitDepth(); + case OCIO::BIT_DEPTH_UINT10: + return testConvert_OutBitDepth(); + case OCIO::BIT_DEPTH_UINT12: + return testConvert_OutBitDepth(); + case OCIO::BIT_DEPTH_UINT16: + return testConvert_OutBitDepth(); + case OCIO::BIT_DEPTH_F16: + return testConvert_OutBitDepth(); + break; + case OCIO::BIT_DEPTH_F32: + return testConvert_OutBitDepth(); + + case OCIO::BIT_DEPTH_UINT14: + case OCIO::BIT_DEPTH_UINT32: + case OCIO::BIT_DEPTH_UNKNOWN: + default: + break; + } +} + +} + +DEFINE_SIMD_TEST(packed_uint8_to_float_test) +{ + size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; + std::vector inImage(maxValue); + std::vector outImage(maxValue); + + for (unsigned i = 0; i < inImage.size(); i++) + { + inImage[i] = i; + } + + for (unsigned i = 0; i < inImage.size(); i += 64) + { + __m512 r, g, b, a; + OCIO::AVX512RGBAPack::Load(&inImage[i], r, g, b, a); + OCIO::AVX512RGBAPack::Store(&outImage[i], r, g, b, a); + } + + for (unsigned i = 0; i < outImage.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer((float)inImage[i], (float)outImage[i], 0, false), + GetErrorMessage((float)inImage[i], (float)outImage[i], + OCIO::BIT_DEPTH_UINT8, OCIO::BIT_DEPTH_F32)); + } +} + +DEFINE_SIMD_TEST(packed_uint10_to_f32_test) +{ + size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; + std::vector inImage(maxValue); + std::vector outImage(maxValue); + + for (unsigned i = 0; i < inImage.size(); i++) + { + inImage[i] = i; + } + + for (unsigned i = 0; i < inImage.size(); i += 64) + { + __m512 r, g, b, a; + OCIO::AVX512RGBAPack::Load(&inImage[i], r, g, b, a); + OCIO::AVX512RGBAPack::Store(&outImage[i], r, g, b, a); + } + + for (unsigned i = 0; i < outImage.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer((float)inImage[i], (float)outImage[i], 0, false), + GetErrorMessage((float)inImage[i], (float)outImage[i], + OCIO::BIT_DEPTH_UINT10, OCIO::BIT_DEPTH_F32)); + } +} + +DEFINE_SIMD_TEST(packed_uint12_to_f32_test) +{ + size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; + std::vector inImage(maxValue); + std::vector outImage(maxValue); + + for (unsigned i = 0; i < inImage.size(); i++) + { + inImage[i] = i; + } + + for (unsigned i = 0; i < inImage.size(); i += 64) + { + __m512 r, g, b, a; + OCIO::AVX512RGBAPack::Load(&inImage[i], r, g, b, a); + OCIO::AVX512RGBAPack::Store(&outImage[i], r, g, b, a); + } + + for (unsigned i = 0; i < outImage.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer((float)inImage[i], (float)outImage[i], 0, false), + GetErrorMessage((float)inImage[i], (float)outImage[i], + OCIO::BIT_DEPTH_UINT12, OCIO::BIT_DEPTH_F32)); + } +} + +DEFINE_SIMD_TEST(packed_uint16_to_f32_test) +{ + size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; + std::vector inImage(maxValue); + std::vector outImage(maxValue); + + for (unsigned i = 0; i < inImage.size(); i++) + { + inImage[i] = i; + } + + for (unsigned i = 0; i < inImage.size(); i += 64) + { + __m512 r, g, b, a; + OCIO::AVX512RGBAPack::Load(&inImage[i], r, g, b, a); + OCIO::AVX512RGBAPack::Store(&outImage[i], r, g, b, a); + } + + for (unsigned i = 0; i < outImage.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer((float)inImage[i], (float)outImage[i], 0, false), + GetErrorMessage((float)inImage[i], (float)outImage[i], + OCIO::BIT_DEPTH_UINT16, OCIO::BIT_DEPTH_F32)); + } +} + +DEFINE_SIMD_TEST(packed_f16_to_f32_test) +{ + size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; + std::vector inImage(maxValue); + std::vector outImage(maxValue); + + uint16_t *u16Image =(uint16_t*)&inImage[0]; + for (unsigned i = 0; i < inImage.size(); i++) + { + u16Image[i] = i; + } + + for (unsigned i = 0; i < inImage.size(); i += 64) + { + __m512 r, g, b, a; + OCIO::AVX512RGBAPack::Load(&inImage[i], r, g, b, a); + OCIO::AVX512RGBAPack::Store(&outImage[i], r, g, b, a); + } + + for (unsigned i = 0; i < outImage.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer((float)inImage[i], (float)outImage[i], 0, false), + GetErrorMessage((float)inImage[i], (float)outImage[i], + OCIO::BIT_DEPTH_F16, OCIO::BIT_DEPTH_F32)); + } +} + +DEFINE_SIMD_TEST(packed_nan_inf_test) +{ + const float qnan = std::numeric_limits::quiet_NaN(); + const float inf = std::numeric_limits::infinity(); + const float maxf = std::numeric_limits::max(); + + __m512 r, g, b, a; + std::vector outImageHalf(64); + std::vector outImageU8(64); + std::vector outImageU16(64); + + const float pixels[64] = { qnan, qnan, qnan, 0.25f, + maxf, -maxf, 3.2f, qnan, + inf, inf, inf, inf, + -inf, -inf, -inf, -inf, + 0.0f, 270.0f, 500.0f, 2.0f, + -0.0f, -1.0f, - 2.0f, -5.0f, + 100000.0f, 200000.0f, -10.0f, -2000.0f, + 65535.0f, 65537.0f, -65536.0f, -65537.0f, + qnan, qnan, qnan, 0.25f, + maxf, -maxf, 3.2f, qnan, + inf, inf, inf, inf, + -inf, -inf, -inf, -inf, + 0.0f, 270.0f, 500.0f, 2.0f, + -0.0f, -1.0f, - 2.0f, -5.0f, + 100000.0f, 200000.0f, -10.0f, -2000.0f, + 65535.0f, 65537.0f, -65536.0f, -65537.0f }; + + + OCIO::AVX512RGBAPack::Load(&pixels[0], r, g, b, a); + OCIO::AVX512RGBAPack::Store(&outImageHalf[0], r, g, b, a); + + for (unsigned i = 0; i < outImageHalf.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer((half)pixels[i], (float)outImageHalf[i], 0, false), + GetErrorMessage((half)pixels[i], (float)outImageHalf[i], + OCIO::BIT_DEPTH_F32, OCIO::BIT_DEPTH_F16)); + } + + + const uint8_t resultU8[64] = { 0, 0, 0, 0, + 255, 0, 3, 0, + 255, 255, 255, 255, + 0, 0, 0, 0, + 0, 255, 255, 2, + 0, 0, 0, 0, + 255, 255, 0, 0, + 255, 255, 0, 0, + 0, 0, 0, 0, + 255, 0, 3, 0, + 255, 255, 255, 255, + 0, 0, 0, 0, + 0, 255, 255, 2, + 0, 0, 0, 0, + 255, 255, 0, 0, + 255, 255, 0, 0 }; + + OCIO::AVX512RGBAPack::Load(&pixels[0], r, g, b, a); + OCIO::AVX512RGBAPack::Store(&outImageU8[0], r, g, b, a); + + for (unsigned i = 0; i < outImageU8.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU8[i], outImageU8[i], 0, false), + GetErrorMessage(resultU8[i], outImageU8[i], + OCIO::BIT_DEPTH_F32, OCIO::BIT_DEPTH_UINT8)); + } + + const uint16_t resultU10[64] = { 0, 0, 0, 0, + 1023, 0, 3, 0, + 1023, 1023, 1023, 1023, + 0, 0, 0, 0, + 0, 270, 500, 2, + 0, 0, 0, 0, + 1023, 1023, 0, 0, + 1023, 1023, 0, 0, + 0, 0, 0, 0, + 1023, 0, 3, 0, + 1023, 1023, 1023, 1023, + 0, 0, 0, 0, + 0, 270, 500, 2, + 0, 0, 0, 0, + 1023, 1023, 0, 0, + 1023, 1023, 0, 0}; + + OCIO::AVX512RGBAPack::Load(&pixels[0], r, g, b, a); + OCIO::AVX512RGBAPack::Store(&outImageU16[0], r, g, b, a); + + for (unsigned i = 0; i < outImageU16.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU10[i], outImageU16[i], 0, false), + GetErrorMessage(resultU10[i], outImageU16[i], + OCIO::BIT_DEPTH_F32, OCIO::BIT_DEPTH_UINT10)); + } + + const uint16_t resultU12[64] = { 0, 0, 0, 0, + 4095, 0, 3, 0, + 4095, 4095, 4095, 4095, + 0, 0, 0, 0, + 0, 270, 500, 2, + 0, 0, 0, 0, + 4095, 4095, 0, 0, + 4095, 4095, 0, 0, + 0, 0, 0, 0, + 4095, 0, 3, 0, + 4095, 4095, 4095, 4095, + 0, 0, 0, 0, + 0, 270, 500, 2, + 0, 0, 0, 0, + 4095, 4095, 0, 0, + 4095, 4095, 0, 0}; + + OCIO::AVX512RGBAPack::Load(&pixels[0], r, g, b, a); + OCIO::AVX512RGBAPack::Store(&outImageU16[0], r, g, b, a); + + for (unsigned i = 0; i < outImageU16.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU12[i], outImageU16[i], 0, false), + GetErrorMessage(resultU12[i], outImageU16[i], + OCIO::BIT_DEPTH_F32, OCIO::BIT_DEPTH_UINT12)); + } + + const uint16_t resultU16[64] = { 0, 0, 0, 0, + 65535, 0, 3, 0, + 65535, 65535, 65535, 65535, + 0, 0, 0, 0, + 0, 270, 500, 2, + 0, 0, 0, 0, + 65535, 65535, 0, 0, + 65535, 65535, 0, 0, + 0, 0, 0, 0, + 65535, 0, 3, 0, + 65535, 65535, 65535, 65535, + 0, 0, 0, 0, + 0, 270, 500, 2, + 0, 0, 0, 0, + 65535, 65535, 0, 0, + 65535, 65535, 0, 0}; + + OCIO::AVX512RGBAPack::Load(&pixels[0], r, g, b, a); + OCIO::AVX512RGBAPack::Store(&outImageU16[0], r, g, b, a); + + for (unsigned i = 0; i < outImageU16.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU16[i], outImageU16[i], 0, false), + GetErrorMessage(resultU16[i], outImageU16[i], + OCIO::BIT_DEPTH_F32, OCIO::BIT_DEPTH_UINT16)); + } +} + +DEFINE_SIMD_TEST(packed_all_test) +{ + const std::vector< OCIO::BitDepth> formats = { + OCIO::BIT_DEPTH_UINT8, + OCIO::BIT_DEPTH_UINT10, + OCIO::BIT_DEPTH_UINT12, + OCIO::BIT_DEPTH_UINT16, + OCIO::BIT_DEPTH_F16, + OCIO::BIT_DEPTH_F32, + }; + + for(unsigned i = 0; i < formats.size(); i++) + { + OCIO::BitDepth inBD = formats[i]; + for(unsigned j = 0; j < formats.size(); j++) + { + OCIO::BitDepth outBD = formats[j]; + switch(inBD) + { + case OCIO::BIT_DEPTH_UINT8: + testConvert_InBitDepth(outBD); + break; + case OCIO::BIT_DEPTH_UINT10: + testConvert_InBitDepth(outBD); + break; + case OCIO::BIT_DEPTH_UINT12: + testConvert_InBitDepth(outBD); + break; + case OCIO::BIT_DEPTH_UINT16: + testConvert_InBitDepth(outBD); + break; + case OCIO::BIT_DEPTH_F16: + testConvert_InBitDepth(outBD); + break; + case OCIO::BIT_DEPTH_F32: + testConvert_InBitDepth(outBD); + break; + case OCIO::BIT_DEPTH_UINT14: + case OCIO::BIT_DEPTH_UINT32: + case OCIO::BIT_DEPTH_UNKNOWN: + break; + default: + break; + } + } + } +} + +#endif // OCIO_USE_AVX \ No newline at end of file diff --git a/tests/cpu/AVX_tests.cpp b/tests/cpu/AVX_tests.cpp index 60bd008373..a88b52a8a1 100644 --- a/tests/cpu/AVX_tests.cpp +++ b/tests/cpu/AVX_tests.cpp @@ -161,8 +161,9 @@ void testConvert_InBitDepth(OCIO::BitDepth outBD) DEFINE_SIMD_TEST(packed_uint8_to_float_test) { - std::vector inImage(256); - std::vector outImage(256); + size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; + std::vector inImage(maxValue); + std::vector outImage(maxValue); for (unsigned i = 0; i < inImage.size(); i++) { @@ -361,7 +362,7 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) OCIO::AVXRGBAPack::Load(&pixels[0], r, g, b, a); OCIO::AVXRGBAPack::Store(&outImageU16[0], r, g, b, a); - for (unsigned i = 0; i < outImageU8.size(); i++) + for (unsigned i = 0; i < outImageU16.size(); i++) { OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU10[i], outImageU16[i], 0, false), GetErrorMessage(resultU10[i], outImageU16[i], @@ -380,7 +381,7 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) OCIO::AVXRGBAPack::Load(&pixels[0], r, g, b, a); OCIO::AVXRGBAPack::Store(&outImageU16[0], r, g, b, a); - for (unsigned i = 0; i < outImageU8.size(); i++) + for (unsigned i = 0; i < outImageU16.size(); i++) { OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU12[i], outImageU16[i], 0, false), GetErrorMessage(resultU12[i], outImageU16[i], @@ -399,7 +400,7 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) OCIO::AVXRGBAPack::Load(&pixels[0], r, g, b, a); OCIO::AVXRGBAPack::Store(&outImageU16[0], r, g, b, a); - for (unsigned i = 0; i < outImageU8.size(); i++) + for (unsigned i = 0; i < outImageU16.size(); i++) { OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU16[i], outImageU16[i], 0, false), GetErrorMessage(resultU16[i], outImageU16[i], diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt index 3f6a65038e..eb4c323875 100755 --- a/tests/cpu/CMakeLists.txt +++ b/tests/cpu/CMakeLists.txt @@ -107,6 +107,10 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) add_ocio_test_variant(${TEST_NAME}_avx2+f16c ${TEST_BINARY} --avx2 --f16c) endif() endif() + + if(OCIO_USE_AVX512) + add_ocio_test_variant(${TEST_NAME}_avx512 ${TEST_BINARY} --avx512) + endif() else() add_ocio_test_variant(${TEST_NAME} ${TEST_BINARY}) endif() @@ -151,10 +155,12 @@ set(SOURCES ops/lut1d/Lut1DOpCPU_SSE2.cpp ops/lut1d/Lut1DOpCPU_AVX.cpp ops/lut1d/Lut1DOpCPU_AVX2.cpp + ops/lut1d/Lut1DOpCPU_AVX512.cpp ops/lut3d/Lut3DOpGPU.cpp ops/lut3d/Lut3DOpCPU_SSE2.cpp ops/lut3d/Lut3DOpCPU_AVX.cpp ops/lut3d/Lut3DOpCPU_AVX2.cpp + ops/lut3d/Lut3DOpCPU_AVX512.cpp ops/matrix/MatrixOpGPU.cpp ops/OpTools.cpp ops/range/RangeOpGPU.cpp @@ -280,6 +286,7 @@ set(TESTS SSE2_tests.cpp AVX_tests.cpp AVX2_tests.cpp + AVX512_tests.cpp transforms/AllocationTransform_tests.cpp transforms/builtins/BuiltinTransformRegistry_tests.cpp transforms/BuiltinTransform_tests.cpp @@ -328,12 +335,15 @@ if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_USE_SSE2NEON)) set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_SSE2_ARGS}) set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS}) set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS}) + set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX512.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX512_ARGS}) set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_SSE2_ARGS}) set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS}) set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS}) + set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX512.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX512_ARGS}) set_property(SOURCE "SSE2_tests.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_SSE2_ARGS}) set_property(SOURCE "AVX_tests.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS}) set_property(SOURCE "AVX2_tests.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS}) + set_property(SOURCE "AVX512_tests.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX512_ARGS}) endif() add_ocio_test(cpu "${SOURCES}" TRUE) diff --git a/tests/cpu/SIMD_tests.cpp b/tests/cpu/SIMD_tests.cpp index 3cd5b76bc4..82ffc02057 100644 --- a/tests/cpu/SIMD_tests.cpp +++ b/tests/cpu/SIMD_tests.cpp @@ -74,4 +74,27 @@ OCIO_ADD_TEST_AVX2(packed_uint16_to_f32_test) OCIO_ADD_TEST_AVX2(packed_nan_inf_test) OCIO_ADD_TEST_AVX2(packed_all_test) +#endif + +#if OCIO_USE_AVX512 + +#define AVX512_CHECK() \ + if (!OCIO::CPUInfo::instance().hasAVX512()) throw SkipException() + +#define OCIO_ADD_TEST_AVX512(name) \ +void avx512_test_##name(); \ +OCIO_ADD_TEST(AVX512, name) \ +{ \ + AVX512_CHECK(); \ + avx512_test_##name(); \ +} + +OCIO_ADD_TEST_AVX512(packed_uint8_to_float_test) +OCIO_ADD_TEST_AVX512(packed_uint10_to_f32_test) +OCIO_ADD_TEST_AVX512(packed_uint12_to_f32_test) +OCIO_ADD_TEST_AVX512(packed_uint16_to_f32_test) +OCIO_ADD_TEST_AVX512(packed_f16_to_f32_test) +OCIO_ADD_TEST_AVX512(packed_nan_inf_test) +OCIO_ADD_TEST_AVX512(packed_all_test) + #endif \ No newline at end of file diff --git a/tests/cpu/SSE2_tests.cpp b/tests/cpu/SSE2_tests.cpp index 3431d04b35..b7d2961cf5 100644 --- a/tests/cpu/SSE2_tests.cpp +++ b/tests/cpu/SSE2_tests.cpp @@ -150,8 +150,9 @@ void testConvert_InBitDepth(OCIO::BitDepth outBD) DEFINE_SIMD_TEST(packed_uint8_to_float_test) { - std::vector inImage(256); - std::vector outImage(256); + size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; + std::vector inImage(maxValue); + std::vector outImage(maxValue); for (unsigned i = 0; i < inImage.size(); i++) { @@ -352,7 +353,7 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) OCIO::SSE2RGBAPack::Store(&outImageU16[i], r, g, b, a); } - for (unsigned i = 0; i < outImageU8.size(); i++) + for (unsigned i = 0; i < outImageU16.size(); i++) { OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU10[i], outImageU16[i], 0, false), GetErrorMessage(resultU10[i], outImageU16[i], @@ -374,7 +375,7 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) OCIO::SSE2RGBAPack::Store(&outImageU16[i], r, g, b, a); } - for (unsigned i = 0; i < outImageU8.size(); i++) + for (unsigned i = 0; i < outImageU16.size(); i++) { OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU12[i], outImageU16[i], 0, false), GetErrorMessage(resultU12[i], outImageU16[i], @@ -396,7 +397,7 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) OCIO::SSE2RGBAPack::Store(&outImageU16[i], r, g, b, a); } - for (unsigned i = 0; i < outImageU8.size(); i++) + for (unsigned i = 0; i < outImageU16.size(); i++) { OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer(resultU16[i], outImageU16[i], 0, false), GetErrorMessage(resultU16[i], outImageU16[i], diff --git a/tests/cpu/UnitTestMain.cpp b/tests/cpu/UnitTestMain.cpp index a29dd3e085..6986d72262 100644 --- a/tests/cpu/UnitTestMain.cpp +++ b/tests/cpu/UnitTestMain.cpp @@ -67,6 +67,7 @@ int main(int argc, const char ** argv) bool sse2 = false; bool avx = false; bool avx2 = false; + bool avx512 = false; bool f16c = false; #endif ArgParse ap; @@ -78,7 +79,8 @@ int main(int argc, const char ** argv) "--sse2", &sse2, "Enable SSE2 Accelerated features", "--avx", &avx, "Enable AVX Accelerated features", "--avx2", &avx2, "Enable AVX2 Accelerated features", - "--f16c", &f16c, "Enable F16C Accelerated features", + "--avx512", &avx512, "Enable AVX512 Accelerated features", + "--f16c", &f16c, "Enable F16C Accelerated features (only used with AVX/AVX2)", #endif "--run_only %s", &filter, "Run only some unit tests\n" "\tex: --run_only \"FileRules/clone\"\n" @@ -101,7 +103,7 @@ int main(int argc, const char ** argv) #if defined(ENABLE_SIMD_USAGE) OCIO::CPUInfo &cpu = OCIO::CPUInfo::instance(); - if (no_accel || sse2 || avx || avx2 || f16c) + if (no_accel || sse2 || avx || avx2 || avx512 || f16c) { unsigned flags = 0; if (sse2) @@ -132,6 +134,16 @@ int main(int argc, const char ** argv) } flags |= X86_CPU_FLAG_AVX2; } + if (avx512) + { + if (!cpu.hasAVX512()) + { + std::cerr << "-avx512 not supported by processor\n"; + GetUnitTests().clear(); + } + flags |= X86_CPU_FLAG_AVX512; + } + if (f16c) { if (!cpu.hasF16C()) @@ -153,6 +165,9 @@ int main(int argc, const char ** argv) if (cpu.hasAVX2()) std::cerr << "+avx2"; + + if (cpu.hasAVX512()) + std::cerr << "+avx512"; if (cpu.hasF16C()) std::cerr << "+f16c";