From e60e6cd04a98d3152734e52f5799a3e65a77b0e7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Thu, 24 Aug 2023 08:38:18 -0400 Subject: [PATCH 01/22] Merging the previous ARM Neon branch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- .github/workflows/ci_workflow.yml | 6 +- CMakeLists.txt | 62 ++++++++- docs/quick_start/installation.rst | 13 +- .../macros/CheckForOpenEXRCompatibility.cmake | 66 +++++++++ .../ocio_check_dependency_version.cmake | 38 ------ share/cmake/modules/FindExtPackages.cmake | 13 +- .../modules/install/InstallOpenEXR.cmake | 2 + .../modules/install/Installsse2neon.cmake | 43 ++++++ share/cmake/utils/CheckSupportARMNeon.cmake | 27 ++++ share/cmake/utils/CheckSupportSSE2.cmake | 54 ++++++-- share/cmake/utils/CompilerFlags.cmake | 16 ++- share/dev/windows/ocio.bat | 2 +- src/OpenColorIO/CMakeLists.txt | 23 +++- src/OpenColorIO/SSE.h | 129 +++++++++++++----- tests/cpu/CMakeLists.txt | 28 +++- tests/cpu/ops/log/LogOpCPU_tests.cpp | 118 ++++++++++------ tests/gpu/CMakeLists.txt | 4 +- tests/osl/CMakeLists.txt | 4 +- 18 files changed, 493 insertions(+), 155 deletions(-) create mode 100644 share/cmake/macros/CheckForOpenEXRCompatibility.cmake delete mode 100644 share/cmake/macros/ocio_check_dependency_version.cmake create mode 100644 share/cmake/modules/install/Installsse2neon.cmake create mode 100644 share/cmake/utils/CheckSupportARMNeon.cmake diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml index a411b62e5e..076bf6e4ed 100644 --- a/.github/workflows/ci_workflow.yml +++ b/.github/workflows/ci_workflow.yml @@ -243,7 +243,7 @@ jobs: -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \ -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \ -DOCIO_BUILD_GPU_TESTS=OFF \ - -DOCIO_USE_SSE=${{ matrix.use-sse }} \ + -DOCIO_USE_SIMD=${{ matrix.use-sse }} \ -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \ -DOCIO_INSTALL_EXT_PACKAGES=ALL \ 
-DOCIO_WARNING_AS_ERROR=ON \ @@ -384,7 +384,7 @@ jobs: -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \ -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \ -DOCIO_BUILD_GPU_TESTS=OFF \ - -DOCIO_USE_SSE=${{ matrix.use-sse }} \ + -DOCIO_USE_SIMD=${{ matrix.use-sse }} \ -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \ -DOCIO_INSTALL_EXT_PACKAGES=ALL \ -DOCIO_WARNING_AS_ERROR=ON \ @@ -532,7 +532,7 @@ jobs: -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \ -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \ -DOCIO_BUILD_GPU_TESTS=OFF \ - -DOCIO_USE_SSE=${{ matrix.use-sse }} \ + -DOCIO_USE_SIMD=${{ matrix.use-sse }} \ -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \ -DOCIO_INSTALL_EXT_PACKAGES=ALL \ -DOCIO_WARNING_AS_ERROR=ON \ diff --git a/CMakeLists.txt b/CMakeLists.txt index a0d650bbe8..6b553e5a7a 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,6 +25,14 @@ if(APPLE AND NOT DEFINED CMAKE_OSX_DEPLOYMENT_TARGET) endif() +############################################################################### +# By default, build the library, tests, tools, and Python binding as universal binaries for macOS. + +if(APPLE AND (NOT DEFINED CMAKE_OSX_ARCHITECTURES OR CMAKE_OSX_ARCHITECTURES STREQUAL "")) + set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Default OS X architectures" FORCE) +endif() + + ############################################################################### # Project definition. @@ -173,8 +181,11 @@ endif() ############################################################################### # Optimization / internal linking preferences - -option(OCIO_USE_SSE "Specify whether to enable SSE CPU performance optimizations" ON) +# TODO Remove OCIO_USE_SSE once it is fully deprecated. +option(OCIO_USE_SSE "Specify whether to enable SSE (supplanted by OCIO_USE_SIMD)" ON) +# TODO Remove mark_as_advanced once OCIO_USE_SSE is fully deprecated. 
+mark_as_advanced(OCIO_USE_SSE) +option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ${OCIO_USE_SSE}) option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF) if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|X86|x86_64|i386|i686)") @@ -197,6 +208,53 @@ message(STATUS "Checking for GPU configuration...") include(CheckSupportGL) +############################################################################### +# Check for ARM neon + +if(OCIO_USE_SIMD) + include(CheckSupportARMNeon) +endif() + + +############################################################################### +# Add sse2neon to the build if ARM NEON intrinsics are supported. + +if(HAVE_NEON AND OCIO_USE_SIMD) + # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is + # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake. + + # Sse2neon is not treated like an imported target. The logic to find sse2neon is here because + # a find module is not suitable for sse2neon's use case. + find_path(sse2neon_INCLUDE_DIR + NAMES + sse2neon.h + HINTS + ${sse2neon_ROOT} + PATH_SUFFIXES + sse2neon + include + sse2neon/include + ) + + # As per instructions on sse2neon's GitHub page, the following compiler flags should be used: + # "-march=armv8-a+fp+simd+crypto+crc". These flags are required for some ARM platforms that do + # not enable floating point calculations or SIMD instructions by default. However, for ARM64 + # (Apple ARM platform) and x86_64 platforms, these features are already enabled by default. + # Therefore, no additional compiler flags are needed. + if (NOT sse2neon_INCLUDE_DIR) + include(Installsse2neon) + else() + # Any changes to the following lines must be replicated in Installsse2neon.cmake as well. + # Create a target for sse2neon (non-imported) + add_library(sse2neon INTERFACE) + # Add the include directories to the target. 
+ target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}") + # Ignore the warnings coming from sse2neon.h as they are false positives. + target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) + endif() +endif() + + ############################################################################### # Define compilation and link flags diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst index 788ae7e16b..fc58c5ec6f 100644 --- a/docs/quick_start/installation.rst +++ b/docs/quick_start/installation.rst @@ -277,7 +277,8 @@ Here are the most common OCIO-specific CMake options (the default values are sho - ``-DOCIO_USE_OIIO_FOR_APPS=OFF`` (Set ON to build tools with OpenImageIO rather than OpenEXR) - ``-DOCIO_BUILD_PYTHON=ON`` (Set to OFF to not build the Python binding) - ``-DOCIO_BUILD_OPENFX=OFF`` (Set to ON to build the OpenFX plug-ins) -- ``-DOCIO_USE_SSE=ON`` (Set to OFF to turn off SSE CPU performance optimizations) +- ``-DOCIO_USE_SSE=ON`` (Deprecated -- please use OCIO_USE_SIMD) +- ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations, such as SSE and NEON) - ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests) - ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests) - ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering) @@ -285,6 +286,16 @@ Here are the most common OCIO-specific CMake options (the default values are sho - ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation) - ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation) +On macOS, the default is to make a universal build +(natively supporting both the Intel and ARM processors). The ``-DCMAKE_OSX_ARCHITECTURES`` option +may be set to just arm64 or x86_64 to override the default value, which is ``arm64;x86_64``. 
+ +When doing a universal build, note that the OCIO dependencies must be built as universal libraries +too. If you are running in OCIO_INSTALL_EXT_PACKAGES=MISSING or NONE mode, your build will fail if +any of your installed libraries are not universal. The easiest way to address this is to set +OCIO_INSTALL_EXT_PACKAGES=ALL in order to let OCIO build everything. Alternatively, you may set +CMAKE_OSX_ARCHITECTURES to just the platform you are targeting. + Several command-line tools (such as ``ocioconvert``) require reading or writing image files. If ``OCIO_USE_OIIO_FOR_APPS=OFF``, these will be built using OpenEXR rather than OpenImageIO and therefore you will be limited to using OpenEXR files with these tools rather than the diff --git a/share/cmake/macros/CheckForOpenEXRCompatibility.cmake b/share/cmake/macros/CheckForOpenEXRCompatibility.cmake new file mode 100644 index 0000000000..bd79eb96e1 --- /dev/null +++ b/share/cmake/macros/CheckForOpenEXRCompatibility.cmake @@ -0,0 +1,66 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project. +# +# Check for compatibility between OpenEXR and OpenImageIO since OCIO requires OpenEXR 3+. +# + +message(STATUS "Checking if the OpenImageIO found is built with OpenEXR 3+...") + +find_path (OpenImageIO_INCLUDE_DIR + NAMES + OpenImageIO/imageio.h + HINTS + ${OpenImageIO_ROOT} + # Assuming that OpenImageIO was installed normally, go back a few folders down + # to get the equivalent of OpenImageIO_ROOT. + ${OpenImageIO_DIR}/../../.. 
+ PATH_SUFFIXES + OpenImageIO/include + include +) + +if (NOT OpenImageIO_INCLUDE_DIR) + message(STATUS "${ColorWarning}Could not find OpenImageIO header to evaluate the OpenEXR version.") + message(STATUS "Please provide the OpenImageIO_DIR variable.") + message(STATUS "If your OpenImageIO's files are located in different root directory, \ +please provide the OpenImageIO_ROOT where the include files are located.${ColorReset}") +endif() + +# Try to figure out version number +set (OIIO_VERSION_HEADER "${OpenImageIO_INCLUDE_DIR}/OpenImageIO/oiioversion.h") +if (EXISTS "${OIIO_VERSION_HEADER}") + file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_MAJOR .*$") + string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_MAJOR ${TMP}) + file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_MINOR .*$") + string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_MINOR ${TMP}) + file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_PATCH .*$") + string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_PATCH ${TMP}) + file (STRINGS "${OIIO_VERSION_HEADER}" TMP REGEX "^#define OIIO_VERSION_TWEAK .*$") + if (TMP) + string (REGEX MATCHALL "[0-9]+" OpenImageIO_VERSION_TWEAK ${TMP}) + else () + set (OpenImageIO_VERSION_TWEAK 0) + endif () + set (OpenImageIO_VERSION "${OpenImageIO_VERSION_MAJOR}.${OpenImageIO_VERSION_MINOR}.${OpenImageIO_VERSION_PATCH}.${OpenImageIO_VERSION_TWEAK}") +endif () + +set (OIIO_IMATH_HEADER "${OpenImageIO_INCLUDE_DIR}/OpenImageIO/Imath.h") +if (EXISTS "${OIIO_IMATH_HEADER}") + file(STRINGS "${OIIO_IMATH_HEADER}" TMP REGEX "^#define OIIO_USING_IMATH .*$") + string(REGEX MATCHALL "[0-9]" OIIO_IMATH_VERSION ${TMP}) + if (OIIO_IMATH_VERSION LESS 3) + message(STATUS "Skipping OpenImageIO built against OpenEXR 2, please use version 3 or greater.") + else() + set(is_OpenEXR_VERSION_valid TRUE) + endif() +endif() + +# clean up variables +unset(OpenImageIO_INCLUDE_DIR) +unset(OIIO_VERSION_HEADER) +unset(OpenImageIO_VERSION_MAJOR) 
+unset(OpenImageIO_VERSION_MINOR) +unset(OpenImageIO_VERSION_PATCH) +unset(OpenImageIO_VERSION_TWEAK) +unset(OIIO_IMATH_HEADER) +unset(OIIO_IMATH_VERSION) \ No newline at end of file diff --git a/share/cmake/macros/ocio_check_dependency_version.cmake b/share/cmake/macros/ocio_check_dependency_version.cmake deleted file mode 100644 index 93abe0d03d..0000000000 --- a/share/cmake/macros/ocio_check_dependency_version.cmake +++ /dev/null @@ -1,38 +0,0 @@ -# SPDX-License-Identifier: BSD-3-Clause -# Copyright Contributors to the OpenColorIO Project. - -################################################################################################### -# ocio_check_dependency_version try to find the specified dependency and validate the version. -# -# Note that a function is used here to scoped-in any variables set by find_package. We do not want -# those variables to be propagated to the caller of the function. -# -# Argument: -# dep_name is the name of the dependency (package). Please note that dep_name is case sensitive. -# -################################################################################################### - -function (ocio_check_dependency_version dep_name output) - cmake_parse_arguments( - # prefix - Must be different than the one used in ocio_handle_dependency.cmake. 
- ocio_cdv - # options - "" - # one value keywords - "MIN_VERSION" - # multi value keywords - "" - # args - ${ARGN}) - - if (dep_name) - find_package(${dep_name} ${ocio_cdv_UNPARSED_ARGUMENTS}) - if (ocio_cdv_MIN_VERSION AND ${dep_name}_VERSION) - if (${${dep_name}_VERSION} VERSION_GREATER_EQUAL ocio_cdv_MIN_VERSION) - set(${output} TRUE PARENT_SCOPE) - else() - set(${output} FALSE PARENT_SCOPE) - endif() - endif() - endif() -endfunction() \ No newline at end of file diff --git a/share/cmake/modules/FindExtPackages.cmake b/share/cmake/modules/FindExtPackages.cmake index 37659eb23d..2625242cf1 100644 --- a/share/cmake/modules/FindExtPackages.cmake +++ b/share/cmake/modules/FindExtPackages.cmake @@ -197,13 +197,10 @@ if((OCIO_BUILD_APPS AND OCIO_USE_OIIO_FOR_APPS) OR OCIO_BUILD_TESTS) # Supported from OIIO 2.4+. Setting this for lower versions doesn't affect anything. set(OPENIMAGEIO_CONFIG_DO_NOT_FIND_IMATH 1) - include(ocio_check_dependency_version) - # Since OpenImageIO will try to find OpenEXR through its OpenImageIOConfig.cmake file, - # let's try to find OpenEXR first and if the version is too old, OCIO will not try to find - # OpenImageIO. - ocio_check_dependency_version( OpenEXR "is_OpenEXR_VERSION_valid" - MIN_VERSION ${OpenEXR_MININUM_VERSION} - CONFIG) + set(is_OpenEXR_VERSION_valid FALSE) + # Check for compatibility between OpenEXR and OpenImageIO. + # Will set is_OpenEXR_VERSION_valid to TRUE if valid. + include(CheckForOpenEXRCompatibility) # Do not try to find OpenImageIO if the version of OpenEXR is too old. 
if (is_OpenEXR_VERSION_valid) @@ -227,8 +224,6 @@ if((OCIO_BUILD_APPS AND OCIO_USE_OIIO_FOR_APPS) OR OCIO_BUILD_TESTS) MIN_VERSION ${OIIO_VERSION} RECOMMENDED_VERSION ${OIIO_RECOMMENDED_VERSION} PROMOTE_TARGET OpenImageIO::OpenImageIO) - else() - message(WARNING "Skipping OpenImageIO because the OpenEXR found by OpenImageIO is too old (under ${OpenEXR_MININUM_VERSION})") endif() endif() diff --git a/share/cmake/modules/install/InstallOpenEXR.cmake b/share/cmake/modules/install/InstallOpenEXR.cmake index be9d1b14fc..44109ea17b 100644 --- a/share/cmake/modules/install/InstallOpenEXR.cmake +++ b/share/cmake/modules/install/InstallOpenEXR.cmake @@ -201,6 +201,7 @@ if(_OpenEXR_TARGET_CREATE) IMPORTED_LOCATION ${IlmThread_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR}" INTERFACE_LINK_LIBRARIES "OpenEXR::IlmThreadConfig;OpenEXR::IlmThreadConfig;OpenEXR::Iex;Threads::Threads" + STATIC_LIBRARY_OPTIONS "-no_warning_for_no_symbols" ) set_target_properties(OpenEXR::IlmThreadConfig PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR};${OpenEXR_INCLUDE_DIR}/OpenEXR" @@ -217,6 +218,7 @@ if(_OpenEXR_TARGET_CREATE) IMPORTED_LOCATION ${OpenEXRCore_LIBRARY} INTERFACE_INCLUDE_DIRECTORIES "${OpenEXR_INCLUDE_DIR}" INTERFACE_LINK_LIBRARIES "OpenEXR::IlmThreadConfig;ZLIB::ZLIB;\$" + STATIC_LIBRARY_OPTIONS "-no_warning_for_no_symbols" ) set_target_properties(OpenEXR::OpenEXRUtil PROPERTIES IMPORTED_LOCATION ${OpenEXRUtil_LIBRARY} diff --git a/share/cmake/modules/install/Installsse2neon.cmake b/share/cmake/modules/install/Installsse2neon.cmake new file mode 100644 index 0000000000..5f0f810ca1 --- /dev/null +++ b/share/cmake/modules/install/Installsse2neon.cmake @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project. 
+# +# Install sse2neon (header-only version) +# https://github.com/DLTcollab/sse2neon +# +# +# Global targets defined by this module: +# sse2neon +############################################################################### + +# Download sse2neon using FetchContent and make it available at configure time. + +include(FetchContent) + +set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/ext/build/sse2neon") +FetchContent_Declare(sse2neon + GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git + GIT_TAG v1.6.0 +) + +# FetchContent_MakeAvailable is not available until CMake 3.14+. +# Using FetchContent_GetProperties and FetchContent_Populate instead. +FetchContent_GetProperties(sse2neon) + +if(NOT sse2neon_POPULATED) + FetchContent_Populate(sse2neon) + + set(_EXT_DIST_INCLUDE "${CMAKE_BINARY_DIR}/ext/dist/${CMAKE_INSTALL_INCLUDEDIR}") + file(COPY "${sse2neon_SOURCE_DIR}/sse2neon.h" DESTINATION "${_EXT_DIST_INCLUDE}/sse2neon") + + # sse2neon_INCLUDE_DIR is used internally for CheckSupportSSE2.cmake and to create sse2neon + # target for OCIO. + set(sse2neon_INCLUDE_DIR "${sse2neon_SOURCE_DIR}") + + # Any changes to the following lines must be replicated in ./CMakeLists.txt as well. + # Create a target for sse2neon (non-imported) + add_library(sse2neon INTERFACE) + # Add the include directories to the target. + target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}") + # Ignore the warnings coming from sse2neon.h as they are false positives. + target_compile_options(sse2neon INTERFACE -Wno-unused-parameter) +endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake new file mode 100644 index 0000000000..5d17854757 --- /dev/null +++ b/share/cmake/utils/CheckSupportARMNeon.cmake @@ -0,0 +1,27 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project. 
+ +# Checks for ARM NEON availability + +include(CheckCXXSourceCompiles) + +set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") + +if(APPLE) + set(CMAKE_OSX_ARCHITECTURES "arm64") +endif() + +set(source_code " +#include <arm_neon.h> +int main() +{ + float32x4_t v = vdupq_n_f32(0); + return 0; +}") + +check_cxx_source_compiles ("${source_code}" HAVE_NEON) + +set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + +unset(_cmake_osx_architectures_orig) +mark_as_advanced(HAVE_NEON) diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake index f30bbb763c..07fecbd7a5 100644 --- a/share/cmake/utils/CheckSupportSSE2.cmake +++ b/share/cmake/utils/CheckSupportSSE2.cmake @@ -3,7 +3,9 @@ include(CheckCXXSourceCompiles) -set(_cmake_required_flags_old "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}") +set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8) # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger @@ -16,8 +18,10 @@ if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8) endif() endif() -check_cxx_source_compiles (" - #include <emmintrin.h> + +macro(check_sse2_availability _check_sse2_header_ _check_output_var_name_) + set(_SSE2_TEST_SOURCE_CODE " + ${_check_sse2_header_} int main () { __m128d a, b; @@ -26,10 +30,44 @@ check_cxx_source_compiles (" b = _mm_add_pd (a,a); _mm_storeu_pd (vals,b); return (0); - }" - HAVE_SSE2) + }") + + check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" ${_check_output_var_name_}) + mark_as_advanced(${_check_output_var_name_}) +endmacro() + +if(NOT HAVE_NEON) + check_sse2_availability("#include <emmintrin.h>" HAVE_SSE2) +elseif(APPLE AND HAVE_NEON) + # Test for both supported architectures + # x86_64 and arm64 + set(ARCHITECTURES_LIST "arm64;x86_64") + + message(STATUS "Checking SSE2 support using SSE2NEON library for arm64 and x86_64 architectures") + 
foreach (current_arch IN LISTS ARCHITECTURES_LIST) + + set (CMAKE_OSX_ARCHITECTURES "${current_arch}") + + if(current_arch STREQUAL arm64) + set(CMAKE_REQUIRED_INCLUDES ${sse2neon_INCLUDE_DIR}) + set(_sse2_header_ "#include ") + set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON") + elseif(current_arch STREQUAL x86_64) + set(_sse2_header_ "#include ") + set(_output_var_name_ "HAVE_SSE2") + endif() + + check_sse2_availability("${_sse2_header_}" ${_output_var_name_}) + + endforeach() +endif() + +set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") +set(CMAKE_REQUIRED_INCLUDES "${_cmake_required_includes_orig}") +set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + +unset(_cmake_required_flags_orig) +unset(_cmake_required_includes_orig) +unset(_cmake_osx_architectures_orig) -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_old}") -unset(_cmake_required_flags_old) -mark_as_advanced(HAVE_SSE2) diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake index d4c24b6436..1511693a51 100644 --- a/share/cmake/utils/CompilerFlags.cmake +++ b/share/cmake/utils/CompilerFlags.cmake @@ -8,6 +8,21 @@ set(PLATFORM_COMPILE_OPTIONS "") set(PLATFORM_LINK_OPTIONS "") +############################################################################### +# Define if SSE2 can be used. + +if(OCIO_USE_SIMD) + include(CheckSupportSSE2) +endif() + +if(NOT HAVE_SSE2 AND NOT HAVE_SSE2_WITH_SSE2NEON) + message(STATUS "Disabling SSE optimizations, as the target doesn't support them") + set(OCIO_USE_SIMD OFF) +endif() + +############################################################################### +# Compile flags + if(USE_MSVC) set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/DUSE_MSVC") @@ -40,7 +55,6 @@ elseif(USE_CLANG) # Use of 'register' specifier must be removed for C++17 support. 
set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-Wno-deprecated-register") - elseif(USE_GCC) set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};-DUSE_GCC") diff --git a/share/dev/windows/ocio.bat b/share/dev/windows/ocio.bat index 7f24bc279b..a4762a97d9 100644 --- a/share/dev/windows/ocio.bat +++ b/share/dev/windows/ocio.bat @@ -206,7 +206,7 @@ if !DO_CONFIGURE!==1 ( -DOCIO_BUILD_TESTS=ON^ -DOCIO_BUILD_GPU_TESTS=ON^ -DOCIO_BUILD_DOCS=OFF^ - -DOCIO_USE_SSE=ON^ + -DOCIO_USE_SIMD=ON^ -DOCIO_WARNING_AS_ERROR=ON^ -DOCIO_BUILD_JAVA=OFF^ "!OCIO_PATH!" diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt index 8ec954afe3..46fbd45670 100755 --- a/src/OpenColorIO/CMakeLists.txt +++ b/src/OpenColorIO/CMakeLists.txt @@ -311,6 +311,10 @@ target_link_libraries(OpenColorIO MINIZIP::minizip-ng ) +if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON) + target_link_libraries(OpenColorIO PRIVATE $) +endif() + if(APPLE) target_link_libraries(OpenColorIO PRIVATE @@ -341,11 +345,20 @@ if(BUILD_SHARED_LIBS OR (OCIO_BUILD_PYTHON AND UNIX)) set_property(TARGET OpenColorIO PROPERTY POSITION_INDEPENDENT_CODE ON) endif() -if(OCIO_USE_SSE) - target_compile_definitions(OpenColorIO - PRIVATE - USE_SSE - ) +if(OCIO_USE_SIMD) + if(HAVE_SSE2) + target_compile_definitions(OpenColorIO + PRIVATE + USE_SSE + ) + endif() + + if(HAVE_SSE2_WITH_SSE2NEON) + target_compile_definitions(OpenColorIO + PRIVATE + USE_SSE2_WITH_SSE2NEON + ) + endif() endif() if(MSVC AND BUILD_TYPE_DEBUG AND BUILD_SHARED_LIBS) diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h index e94eb6b084..549a24dee2 100644 --- a/src/OpenColorIO/SSE.h +++ b/src/OpenColorIO/SSE.h @@ -6,12 +6,19 @@ #define INCLUDED_OCIO_SSE_H -#ifdef USE_SSE - - -#include -#include - +#if defined(USE_SSE) || defined(USE_SSE2_WITH_SSE2NEON) + +// Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). 
+#if !defined(__aarch64__) + #if defined(USE_SSE) + #include <emmintrin.h> + #endif +#elif defined(__aarch64__) + // ARM architecture A64 (ARM64) + #if defined(USE_SSE2_WITH_SSE2NEON) + #include <sse2neon.h> + #endif +#endif #include @@ -20,6 +27,34 @@ namespace OCIO_NAMESPACE { +// Note that it is important for the code below this ifdef to stay in the OCIO_NAMESPACE since +// it is redefining two of the functions from sse2neon. + +#if defined(__aarch64__) + #if defined(USE_SSE2_WITH_SSE2NEON) + // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to + // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior. + + // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were + // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the + // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the + // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as + // the Intel _mm_max_ps / _mm_min_ps since they always return the non-NaN argument + // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in + // the first argument continues to be filtered out. 
+ static inline __m128 _mm_max_ps(__m128 a, __m128 b) + { + return vreinterpretq_m128_f32( + vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + } + static inline __m128 _mm_min_ps(__m128 a, __m128 b) + { + return vreinterpretq_m128_f32( + vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + } + #endif +#endif + // Macros for alignment declarations #define OCIO_SIMD_BYTES 16 #define OCIO_ALIGN(decl) alignas(OCIO_SIMD_BYTES) decl @@ -38,7 +73,7 @@ static const __m128i EBIAS = _mm_set1_epi32(EXP_BIAS); static const __m128 EONE = _mm_set1_ps(1.0f); static const __m128 EZERO = _mm_set1_ps(0.0f); static const __m128 ENEG126 = _mm_set1_ps(-126.0f); -static const __m128 EPOS127 = _mm_set1_ps(127.0f); +static const __m128 EPOS128 = _mm_set1_ps(128.0f); static const __m128 EPOSINF = _mm_set1_ps(std::numeric_limits::infinity()); @@ -65,10 +100,10 @@ inline __m128 isNegativeSpecial(const __m128 x) return _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(x), SIGN_SHIFT)); } -// Select function in SSE version 2 +// Bit-wise select function in SSE version 2 // -// Return the parameter arg_false when the parameter mask is 0x0, -// or the parameter arg_true when the mask is 0xffffffff. +// Return the parameter arg_false bit where the parameter mask is 0x0, +// return the parameter arg_true bit where the mask is 1. // // Algorithm Explanation: // @@ -98,7 +133,11 @@ inline __m128 isNegativeSpecial(const __m128 x) // inline __m128 sseSelect(const __m128& mask, const __m128& arg_true, const __m128& arg_false) { - return _mm_xor_ps( arg_false, _mm_and_ps( mask, _mm_xor_ps( arg_true, arg_false ) ) ); + return _mm_xor_ps( // bit-wise XOR of arg_false, (...) + arg_false, + _mm_and_ps( // bit-wise AND of mask, (...) 
+ mask, + _mm_xor_ps( arg_true, arg_false ) ) ); // bit-wise XOR of arg_true, arg_false } // Coefficients of Chebyshev (minimax) degree 5 polynomial @@ -118,6 +157,10 @@ static const __m128 PNEXP2 = _mm_set1_ps((float)2.414427569091865207710e-1); static const __m128 PNEXP1 = _mm_set1_ps((float)6.930038344665415134202e-1); static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644); +// Note: The above polynomials have been chosen to achieve a precision of +// approximately 15 bits of mantissa. + + // log2 function in SSE version 2 // // The function log2() is evaluated by performing argument @@ -125,12 +168,14 @@ static const __m128 PNEXP0 = _mm_set1_ps((float)1.000002593370603213644); // over a restricted range. inline __m128 sseLog2(__m128 x) { - // y = log2( x ) = log2( 2^exposant * mantissa ) - // = exposant + log2( mantissa ) + // y = log2( x ) = log2( 2^exponent * mantissa ) + // = exponent + log2( mantissa ) __m128 mantissa - = _mm_or_ps( - _mm_andnot_ps(_mm_castsi128_ps(EMASK), x), EONE); + = _mm_or_ps( // OR with EONE + _mm_andnot_ps( // NOT(EMASK) AND x + _mm_castsi128_ps(EMASK), x), // reinterpret cast int to float + EONE); __m128 log2 = _mm_add_ps( @@ -154,14 +199,15 @@ inline __m128 sseLog2(__m128 x) PNLOG0); __m128i exponent - = _mm_sub_epi32( - _mm_srli_epi32( - _mm_and_si128(_mm_castps_si128(x), + = _mm_sub_epi32( // subtract EBIAS + _mm_srli_epi32( // right-shift by EXP_SHIFT + _mm_and_si128(_mm_castps_si128(x), // bit-wise AND with EMASK EMASK), EXP_SHIFT), EBIAS); - log2 = _mm_add_ps(log2, _mm_cvtepi32_ps(exponent)); + log2 = _mm_add_ps(log2, + _mm_cvtepi32_ps(exponent)); // convert exponent to float return log2; } @@ -180,24 +226,30 @@ inline __m128 sseExp2(__m128 x) // Compute the largest integer not greater than x, i.e., floor(x) // Note: cvttps_epi32 simply cast the float value to int. That means cvttps_epi32(-2.7) = -2 // rather than -3, hence for negative numbers we need to add -1. 
This ensures that "fraction" - // is always in the range [0, 1). + // is always in the range [0, 1). Note that _mm_castps_si128(0xFFFFFFFF) is -1. + // If x is outside the INT_MIN to INT_MAX range, _mm_cvttps_epi32 will return 0x80000000 + // (i.e. INT_MIN, just the sign bit set), which Intel calls the "integer indefinite" value. + // When 1 is subtracted from INT_MIN, it gives INT_MAX. So floor_x is wrong for values + // outside [INT_MIN, INT_MAX] but it's ignored thanks to the checks at the bottom. + // It's also wrong for x=NaN, but again it's ok since the polynomial returns NaN and + // hence the output is NaN, regardless of floor_x. __m128i floor_x - = _mm_add_epi32( - _mm_cvttps_epi32(x), - _mm_castps_si128( - _mm_cmpnle_ps(EZERO, x))); + = _mm_add_epi32( // add a pair of integer arguments + _mm_cvttps_epi32(x), // convert float to int via truncation + _mm_castps_si128( // reinterpret cast float to int + _mm_cmpnle_ps(EZERO, x))); // NOT( EZERO <= x ) ? 0xFFFFFFFF : 0 // Compute exp2(floor_x) by moving floor_x to the exponent bits of the floating-point number. __m128 zf - = _mm_castsi128_ps( - _mm_slli_epi32( - _mm_add_epi32(floor_x, EBIAS), + = _mm_castsi128_ps( // reinterpret cast int to float + _mm_slli_epi32( // left shift by EXP_SHIFT + _mm_add_epi32(floor_x, EBIAS), // add a pair of integer arguments EXP_SHIFT)); - __m128 iexp = _mm_cvtepi32_ps(floor_x); - __m128 fraction = _mm_sub_ps(x, iexp); + __m128 iexp = _mm_cvtepi32_ps(floor_x); // convert floor_x to float + __m128 fraction = _mm_sub_ps(x, iexp); // x - iexp - // Compute exp2(fraction) using a polynomial approximation + // Compute exp2(fraction) using a polynomial approximation. 
 __m128 mexp = _mm_add_ps( _mm_mul_ps( @@ -215,19 +267,26 @@ inline __m128 sseExp2(__m128 x) fraction), PNEXP0); - __m128 exp2 = _mm_mul_ps(zf, mexp); + __m128 exp2 = _mm_mul_ps(zf, mexp); // zf * mexp // Handle underflow: // If the (unbiased) exponent of zf is less than -126, the result is smaller than // the smallest representable floating-point number and an underflow computation is // potentially happening. When this happens, force the result to zero. - exp2 = _mm_andnot_ps(_mm_cmplt_ps(iexp, ENEG126), exp2); + // Note that as described above, floor_x is inaccurate, so the test here uses x. + exp2 = _mm_andnot_ps( // NOT(...) AND exp2 + _mm_cmplt_ps(x, ENEG126), // x < ENEG126 ? 0xFFFFFFFF : 0 + exp2); // Handle overflow: // If the (unbiased) exponent of zf is greater than 127, the result is larger than // the largest representable floating-point number and an overflow computation is // potentially happening. When this happens, force the result to positive infinity. - exp2 = sseSelect(_mm_cmpgt_ps(iexp, EPOS127), EPOSINF, exp2); + // Note that as described above, floor_x is inaccurate, so the test here uses x. + exp2 = sseSelect( // (...) is a mask to select EPOSINF, exp2 + _mm_cmpge_ps(x, EPOS128), // x >= EPOS128 ? 0xFFFFFFFF : 0 + EPOSINF, + exp2); return exp2; } @@ -586,7 +645,7 @@ inline void sseSinCos(const float x, float& sin_x, float& cos_x) } // namespace OCIO_NAMESPACE -#endif +#endif // USE_SSE || USE_SSE2_WITH_SSE2NEON -#endif +#endif // INCLUDED_OCIO_SSE_H diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt index fdc200a7c4..5d43331da6 100755 --- a/tests/cpu/CMakeLists.txt +++ b/tests/cpu/CMakeLists.txt @@ -26,6 +26,10 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) xxHash ) + if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON) + target_link_libraries(${TEST_BINARY} PRIVATE sse2neon) + endif() + if(APPLE) # Frameworks needed to access the ICC monitor profile. 
target_link_libraries(${TEST_BINARY} @@ -43,12 +47,23 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) "${PROJECT_BINARY_DIR}/generated_include" ) endif(PRIVATE_INCLUDES) - if(OCIO_USE_SSE) - target_compile_definitions(${TEST_BINARY} - PRIVATE - USE_SSE - ) - endif(OCIO_USE_SSE) + + if(OCIO_USE_SIMD) + if (HAVE_SSE2) + target_compile_definitions(${TEST_BINARY} + PRIVATE + USE_SSE + ) + endif() + + if(HAVE_SSE2_WITH_SSE2NEON) + target_compile_definitions(${TEST_BINARY} + PRIVATE + USE_SSE2_WITH_SSE2NEON + ) + endif() + endif(OCIO_USE_SIMD) + if(WIN32) # A windows application linking to eXpat static libraries must # have the global macro XML_STATIC defined @@ -66,6 +81,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) ) endif() endif(WIN32) + set_target_properties(${TEST_BINARY} PROPERTIES COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}" LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}" diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp index 79649a23ae..06e98fe0da 100644 --- a/tests/cpu/ops/log/LogOpCPU_tests.cpp +++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp @@ -9,7 +9,6 @@ namespace OCIO = OCIO_NAMESPACE; - constexpr float qnan = std::numeric_limits::quiet_NaN(); constexpr float inf = std::numeric_limits::infinity(); @@ -23,6 +22,7 @@ void TestLog(float logBase) 0.f, 0.f, 0.f, inf, -inf, -inf, -inf, 0.f, 0.f, 0.f, 0.f, -inf }; + float rgba[32] = {}; OCIO::ConstLogOpDataRcPtr logOp = std::make_shared( @@ -52,16 +52,25 @@ void TestLog(float logBase) expected = logf(std::max(minValue, (float)expected)) / logf(logBase); } + // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f, 0.f, + // 0.2f, 0.f, .99f, 128.f, + // ... } OCIO_CHECK_CLOSE(result, expected, error); } const float resMin = logf(minValue) / logf(logBase); + + // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}. 
OCIO_CHECK_CLOSE(rgba[8], resMin, error); OCIO_CHECK_EQUAL(rgba[11], 0.0f); + + // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}. OCIO_CHECK_CLOSE(rgba[12], resMin, error); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15])); + // SSE implementation of sseLog2 & sseExp2 do not behave like CPU. // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below. + // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}. #ifdef USE_SSE if (logBase == 10.0f) { @@ -75,10 +84,16 @@ void TestLog(float logBase) OCIO_CHECK_EQUAL(rgba[16], inf); #endif OCIO_CHECK_EQUAL(rgba[19], 0.0f); + + // Evaluating output for input rgbaImage[20-23] = {0., 0., 0., inf}. OCIO_CHECK_CLOSE(rgba[20], resMin, error); OCIO_CHECK_EQUAL(rgba[23], inf); + + // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}. OCIO_CHECK_CLOSE(rgba[24], resMin, error); OCIO_CHECK_EQUAL(rgba[27], 0.0f); + + // Evaluating output for input rgbaImage[28-31] = {0., 0., 0., -inf}. OCIO_CHECK_CLOSE(rgba[28], resMin, error); OCIO_CHECK_EQUAL(rgba[31], -inf); } @@ -127,30 +142,33 @@ void TestAntiLog(float logBase) // LogOpCPU implementation uses optimized logarithm approximation // cannot use strict comparison. + // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f, 0.f, + // 0.2f, 0.f, .99f, 128.f, + // ... } OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f)); } -#ifdef USE_SSE - OCIO_CHECK_EQUAL(rgba[8], inf); -#else + + // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}. OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8])); -#endif OCIO_CHECK_EQUAL(rgba[11], 0.0f); + + // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}. OCIO_CHECK_CLOSE(rgba[12], 1.0f, rtol); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15])); -#ifdef USE_SSE - OCIO_CHECK_EQUAL(rgba[16], 0.0f); // sseExp2(inf) is 0 -#else + + // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}. 
OCIO_CHECK_EQUAL(rgba[16], inf); -#endif OCIO_CHECK_EQUAL(rgba[19], 0.0f); + + // Evaluating output for input rgbaImage[20-23] = {0., 0., 0., inf}. OCIO_CHECK_CLOSE(rgba[20], 1.0f, rtol); OCIO_CHECK_EQUAL(rgba[23], inf); -#ifdef USE_SSE - OCIO_CHECK_EQUAL(rgba[24], inf); -#else + + // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}. OCIO_CHECK_EQUAL(rgba[24], 0.0f); -#endif OCIO_CHECK_EQUAL(rgba[27], 0.0f); + + // Evaluating output for input rgbaImage[28-31] = {0., 0., 0., -inf}. OCIO_CHECK_CLOSE(rgba[28], 1.0f, rtol); OCIO_CHECK_EQUAL(rgba[31], -inf); } @@ -263,39 +281,35 @@ OCIO_ADD_TEST(LogOpCPU, log2lin_test) // LogOpCPU implementation uses optimized logarithm approximation // cannot use strict comparison. + // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f, 0.f, + // 0.2f, 0.f, .99f, 128.f, + // ... } OCIO_CHECK_ASSERT(OCIO::EqualWithSafeRelError(result, expected, rtol, 1.0f)); } const float res0 = ComputeLog2LinEval(0.0f, redP); -#ifdef USE_SSE - OCIO_CHECK_EQUAL(rgba[8], inf); -#else + // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}. OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[8])); -#endif - OCIO_CHECK_EQUAL(rgba[11], 0.0f); + // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}. OCIO_CHECK_CLOSE(rgba[12], res0, rtol); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15])); -#ifdef USE_SSE - OCIO_CHECK_CLOSE(rgba[16], -0.003041422227f, rtol); -#else + // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}. OCIO_CHECK_EQUAL(rgba[16], inf); -#endif OCIO_CHECK_EQUAL(rgba[19], 0.0f); + // Evaluating output for input rgbaImage[20-23] = {0., 0., 0., inf}. OCIO_CHECK_CLOSE(rgba[20], res0, rtol); OCIO_CHECK_EQUAL(rgba[23], inf); -#ifdef USE_SSE - OCIO_CHECK_EQUAL(rgba[24], inf); -#else + // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}. 
OCIO_CHECK_CLOSE(rgba[24], ComputeLog2LinEval(-inf, redP), rtol); -#endif OCIO_CHECK_EQUAL(rgba[27], 0.0f); + // Evaluating output for input rgbaImage[28-31] = {0., 0., 0., -inf}. OCIO_CHECK_CLOSE(rgba[28], res0, rtol); OCIO_CHECK_EQUAL(rgba[31], -inf); } @@ -399,18 +413,24 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test) // LogOpCPU implementation uses optimized logarithm approximation // cannot use strict comparison + // Evaluating output for input rgbaImage[0-7] = { 0.0367126f, 0.5f, 1.f, 0.f, + // 0.2f, 0.f, .99f, 128.f, + // ... } OCIO_CHECK_CLOSE(result, expected, error); } const float res0 = ComputeLin2LogEval(0.0f, redP); const float resMin = ComputeLin2LogEval(-100.0f, redP); + // Evaluating output for input rgbaImage[8-11] = {qnan, qnan, qnan, 0.}. OCIO_CHECK_CLOSE(rgba[8], resMin, error); OCIO_CHECK_EQUAL(rgba[11], 0.0f); + // Evaluating output for input rgbaImage[12-15] = {0., 0., 0., qnan.}. OCIO_CHECK_CLOSE(rgba[12], res0, error); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15])); + // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}. #ifdef USE_SSE OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error); #else @@ -418,17 +438,20 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test) #endif OCIO_CHECK_EQUAL(rgba[19], 0.0f); + // Evaluating output for input rgbaImage[20-23] = {0., 0., 0., inf}. OCIO_CHECK_CLOSE(rgba[20], res0, error); OCIO_CHECK_EQUAL(rgba[23], inf); + // Evaluating output for input rgbaImage[24-27] = {-inf, -inf, -inf, 0.}. OCIO_CHECK_CLOSE(rgba[24], resMin, error); OCIO_CHECK_EQUAL(rgba[27], 0.0f); + // Evaluating output for input rgbaImage[28-31] = {0., 0., 0., -inf}. 
OCIO_CHECK_CLOSE(rgba[28], res0, error); OCIO_CHECK_EQUAL(rgba[31], -inf); } -OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) +OCIO_ADD_TEST(LogOpCPU, cameralin2log_test) { constexpr int numPixels = 3; constexpr int numValues = 4 * numPixels; @@ -460,18 +483,21 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) const float error = 1e-7f; #endif // USE_SSE + // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }. OCIO_CHECK_CLOSE(rgba[0], -0.168771237955f, error); OCIO_CHECK_CLOSE(rgba[1], -0.048771237955f, error); OCIO_CHECK_CLOSE(rgba[2], -0.036771237955f, error); + + // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }. OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error); #ifdef USE_SSE OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error); #else OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error); #endif // USE_SSE - OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error); + // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }. #ifdef USE_SSE OCIO_CHECK_EQUAL(rgba[8], -inf); OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error); @@ -492,9 +518,12 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) OCIO::ConstOpCPURcPtr pRendererNoLS = OCIO::GetLogRenderer(lognols, true); pRendererNoLS->apply(rgbaImage, rgba_nols, numPixels); + // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }. OCIO_CHECK_CLOSE(rgba_nols[0], -0.325512374199f, error); OCIO_CHECK_CLOSE(rgba_nols[1], -0.127141806077f, error); OCIO_CHECK_CLOSE(rgba_nols[2], -0.107304749265f, error); + + // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }. 
OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error); #ifdef USE_SSE OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error); @@ -502,8 +531,9 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error); #endif // USE_SSE OCIO_CHECK_CLOSE(rgba_nols[6], 0.68141615509f, error); - OCIO_CHECK_EQUAL(rgba_nols[8], -inf); + // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }. + OCIO_CHECK_EQUAL(rgba_nols[8], -inf); #ifdef USE_SSE OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10])); @@ -527,12 +557,18 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) #else const float error2 = 1e-7f; #endif // USE_SSE + + // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }. OCIO_CHECK_CLOSE(rgba_nobreak[0], -24.6f, error2); OCIO_CHECK_CLOSE(rgba_nobreak[1], -0.264385618977f, error2); OCIO_CHECK_CLOSE(rgba_nobreak[2], -0.20700938942f, error2); + + // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }. OCIO_CHECK_CLOSE(rgba_nobreak[4], 0.028548034423f, error2); OCIO_CHECK_CLOSE(rgba_nobreak[5], 0.170878935551f, error2); OCIO_CHECK_CLOSE(rgba_nobreak[6], 0.68141615509, error2); + + // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }. OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2); #ifdef USE_SSE OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2); @@ -542,17 +578,11 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) OCIO_CHECK_CLOSE(rgba_nobreak[10], -24.6f, error2); } -OCIO_ADD_TEST(LogOpCPU, cameralin2log_test) +OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) { // Inverse of previous test. 
- const float rgbaImage[12] = { -0.168771237955f, - -0.048771237955f, - -0.036771237955f, - 0.f, - 0.047228762045f, - 0.170878935551f, - 0.68141615509f, - 0.f, + const float rgbaImage[12] = { -0.168771237955f, -0.048771237955f, -0.036771237955f, 0.f, + 0.047228762045f, 0.170878935551f, 0.68141615509f, 0.f, -inf, inf, qnan, 0.0f }; float rgba[12] = {}; @@ -571,18 +601,22 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test) #else const float error = 1e-7f; #endif // USE_SSE + + // Evaluating output for input rgbaImage[0-2] = + // { -0.168771237955f, -0.048771237955f, -0.036771237955f, ... }. OCIO_CHECK_CLOSE(rgba[0], -0.1f, error); OCIO_CHECK_CLOSE(rgba[1], 0.0f, error); OCIO_CHECK_CLOSE(rgba[2], 0.01f, error); + + // Evaluating output for input rgbaImage[4-6] = + // { 0.047228762045f, 0.170878935551f, 0.68141615509f, ... }. OCIO_CHECK_CLOSE(rgba[4], 0.08f, error); OCIO_CHECK_CLOSE(rgba[5], 0.16f, error); OCIO_CHECK_CLOSE(rgba[6], 1.16f, 10.0f * error); + + // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }. 
OCIO_CHECK_EQUAL(rgba[8], -inf); -#ifdef USE_SSE - OCIO_CHECK_CLOSE(rgba[9], -0.0454545f, error); // sseExp2(inf) is 0 -#else OCIO_CHECK_EQUAL(rgba[9], inf); -#endif OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10])); } diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt index ca045ade4b..0a2da9ddcb 100644 --- a/tests/gpu/CMakeLists.txt +++ b/tests/gpu/CMakeLists.txt @@ -26,12 +26,12 @@ set(SOURCES add_executable(test_gpu_exec ${SOURCES}) -if(OCIO_USE_SSE) +if(OCIO_USE_SIMD) target_compile_definitions(test_gpu_exec PRIVATE USE_SSE ) -endif(OCIO_USE_SSE) +endif(OCIO_USE_SIMD) set_target_properties(test_gpu_exec PROPERTIES COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}" diff --git a/tests/osl/CMakeLists.txt b/tests/osl/CMakeLists.txt index cdd95fb0f3..17addff1d4 100644 --- a/tests/osl/CMakeLists.txt +++ b/tests/osl/CMakeLists.txt @@ -18,12 +18,12 @@ set(SOURCES add_executable(test_osl_exec ${SOURCES}) -if(OCIO_USE_SSE) +if(OCIO_USE_SIMD) target_compile_definitions(test_osl_exec PRIVATE USE_SSE ) -endif(OCIO_USE_SSE) +endif(OCIO_USE_SIMD) set_target_properties(test_osl_exec PROPERTIES COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}" From 00e06c414b5fd8527900d229de2592069e188b6f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Mon, 21 Aug 2023 09:18:32 -0400 Subject: [PATCH 02/22] Testing each SIMD variants using a small code snippet and first pass on unifying the way related cmake variables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- CMakeLists.txt | 44 ++++++---- share/cmake/utils/CheckSupportARMNeon.cmake | 14 +-- share/cmake/utils/CheckSupportAVX.cmake | 47 ++++++++++ share/cmake/utils/CheckSupportAVX2.cmake | 55 ++++++++++++ share/cmake/utils/CheckSupportAVX512.cmake | 43 +++++++++ share/cmake/utils/CheckSupportF16C.cmake | 25 ++++++ share/cmake/utils/CheckSupportSSE2.cmake | 87 ++++++++----------- share/cmake/utils/CheckSupportSSE3.cmake | 43 +++++++++ 
share/cmake/utils/CheckSupportSSE4.cmake | 43 +++++++++ share/cmake/utils/CheckSupportSSE42.cmake | 44 ++++++++++ .../utils/CheckSupportSSEUsingSSE2NEON.cmake | 62 +++++++++++++ share/cmake/utils/CheckSupportSSSE3.cmake | 43 +++++++++ share/cmake/utils/CheckSupportX86SIMD.cmake | 87 ++++++++++--------- share/cmake/utils/CompilerFlags.cmake | 60 ++++++------- src/OpenColorIO/CMakeLists.txt | 11 +-- src/OpenColorIO/CPUInfoConfig.h.in | 6 ++ src/OpenColorIO/SSE.h | 6 +- src/OpenColorIO/SSE2.h | 13 ++- tests/cpu/CMakeLists.txt | 11 +-- 19 files changed, 575 insertions(+), 169 deletions(-) create mode 100644 share/cmake/utils/CheckSupportAVX.cmake create mode 100644 share/cmake/utils/CheckSupportAVX2.cmake create mode 100644 share/cmake/utils/CheckSupportAVX512.cmake create mode 100644 share/cmake/utils/CheckSupportF16C.cmake create mode 100644 share/cmake/utils/CheckSupportSSE3.cmake create mode 100644 share/cmake/utils/CheckSupportSSE4.cmake create mode 100644 share/cmake/utils/CheckSupportSSE42.cmake create mode 100644 share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake create mode 100644 share/cmake/utils/CheckSupportSSSE3.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 6b553e5a7a..6be5c81475 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -181,24 +181,29 @@ endif() ############################################################################### # Optimization / internal linking preferences -# TODO Remove OCIO_USE_SSE once it is fully deprecated. -option(OCIO_USE_SSE "Specify whether to enable SSE (supplanted by OCIO_USE_SIMD)" ON) -# TODO Remove mark_as_advanced once OCIO_USE_SSE is fully deprecated. -mark_as_advanced(OCIO_USE_SSE) -option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ${OCIO_USE_SSE}) +option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ON) option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." 
OFF) if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|X86|x86_64|i386|i686)") - option(OCIO_USE_SSE2 "Specify whether to enable SSE2 CPU performance optimizations" ON) - option(OCIO_USE_SSE3 "Specify whether to enable SSE3 CPU performance optimizations" ON) - option(OCIO_USE_SSSE3 "Specify whether to enable SSSE3 CPU performance optimizations" ON) - option(OCIO_USE_SSE4 "Specify whether to enable SSE4 CPU performance optimizations" ON) - option(OCIO_USE_SSE42 "Specify whether to enable SSE4.2 CPU performance optimizations" ON) - option(OCIO_USE_AVX "Specify whether to enable AVX CPU performance optimizations" ON) - option(OCIO_USE_AVX2 "Specify whether to enable AVX2 CPU performance optimizations" ON) - option(OCIO_USE_AVX512 "Specify whether to enable AVX512 CPU performance optimizations" ON) - option(OCIO_USE_F16C "Specify whether to enable F16C CPU performance optimizations" ON) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS ON) set(OCIO_ARCH_X86 1) +else() + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS OFF) + set(OCIO_ARCH_X86 0) +endif() + +option(OCIO_USE_SSE2 "Specify whether to enable SSE2 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) +option(OCIO_USE_SSE3 "Specify whether to enable SSE3 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) +option(OCIO_USE_SSSE3 "Specify whether to enable SSSE3 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) +option(OCIO_USE_SSE4 "Specify whether to enable SSE4 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) +option(OCIO_USE_SSE42 "Specify whether to enable SSE4.2 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) +option(OCIO_USE_AVX "Specify whether to enable AVX CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) +option(OCIO_USE_AVX2 "Specify whether to enable AVX2 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) +option(OCIO_USE_AVX512 "Specify whether to enable AVX512 CPU performance optimizations" 
${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) +option(OCIO_USE_F16C "Specify whether to enable F16C CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) + +if (APPLE) + option(OCIO_USE_SSE2NEON "Specify whether to enable SSE CPU performance optimizations on Apple ARM using SSE2NEON" ON) endif() ############################################################################### @@ -209,17 +214,24 @@ include(CheckSupportGL) ############################################################################### -# Check for ARM neon +# Check for ARM neon here because we need to know if ARM NEON is supported +# quickly. Once we know that ARM NEON is supported, we can add sse2neon library +# to the build. if(OCIO_USE_SIMD) include(CheckSupportARMNeon) + + if (NOT COMPILER_SUPPORTS_ARM_NEON) + # Force it to OFF since ARM Neon is not supported. + set(OCIO_USE_SSE2NEON OFF) + endif() endif() ############################################################################### # Add sse2neon to the build if ARM NEON intrinsics are supported. -if(HAVE_NEON AND OCIO_USE_SIMD) +if(OCIO_USE_SIMD AND OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_ARM_NEON) # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake. 
diff --git a/share/cmake/utils/CheckSupportARMNeon.cmake b/share/cmake/utils/CheckSupportARMNeon.cmake index 5d17854757..efa76b9095 100644 --- a/share/cmake/utils/CheckSupportARMNeon.cmake +++ b/share/cmake/utils/CheckSupportARMNeon.cmake @@ -12,16 +12,16 @@ if(APPLE) endif() set(source_code " -#include -int main() -{ - float32x4_t v = vdupq_n_f32(0); - return 0; + #include + int main() + { + float32x4_t v = vdupq_n_f32(0); + return 0; }") -check_cxx_source_compiles ("${source_code}" HAVE_NEON) +check_cxx_source_compiles ("${source_code}" COMPILER_SUPPORTS_ARM_NEON) set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") unset(_cmake_osx_architectures_orig) -mark_as_advanced(HAVE_NEON) +mark_as_advanced(COMPILER_SUPPORTS_ARM_NEON) diff --git a/share/cmake/utils/CheckSupportAVX.cmake b/share/cmake/utils/CheckSupportAVX.cmake new file mode 100644 index 0000000000..60605f9066 --- /dev/null +++ b/share/cmake/utils/CheckSupportAVX.cmake @@ -0,0 +1,47 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project. + +include(CheckCXXSourceCompiles) + +set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") + +if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") + set(__universal_build 1) + set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") +endif() + +if(MSVC) + set(CMAKE_REQUIRED_FLAGS "/w /arch:AVX") +elseif(USE_GCC OR USE_CLANG) + set(CMAKE_REQUIRED_FLAGS "-w -mavx") +endif() + +if (APPLE AND __universal_build) + # Force the test to build under x86_64 + set(CMAKE_OSX_ARCHITECTURES "x86_64") + # Apple has an automatic translation layer from SSE/AVX to ARM Neon. 
+endif() + +set(AVX_CODE " + #include + int main() + { + // Create two arrays of floats + float a[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; + float b[8] = {2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0}; + + _mm256_add_ps(_mm256_load_ps(a), _mm256_load_ps(b)); + return 0; + } +") +check_cxx_source_compiles("${AVX_CODE}" COMPILER_SUPPORTS_AVX) + +set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") +unset(_cmake_required_flags_orig) + +if(__universal_build) + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + unset(_cmake_osx_architectures_orig) + unset(__universal_build) +endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportAVX2.cmake b/share/cmake/utils/CheckSupportAVX2.cmake new file mode 100644 index 0000000000..95ac6b4361 --- /dev/null +++ b/share/cmake/utils/CheckSupportAVX2.cmake @@ -0,0 +1,55 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project. + +include(CheckCXXSourceCompiles) + +set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") + +if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") + set(__universal_build 1) + set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") +endif() + +if(MSVC) + set(CMAKE_REQUIRED_FLAGS "/w /arch:AVX2") +elseif(USE_GCC OR USE_CLANG) + set(CMAKE_REQUIRED_FLAGS "-w -mavx2 -mfma -mf16c") +endif() + +if (APPLE AND __universal_build) + # Force the test to build under x86_64 + set(CMAKE_OSX_ARCHITECTURES "x86_64") + # Apple has an automatic translation layer from SSE/AVX to ARM Neon.
+endif() + +set(AVX2_CODE " + #include + + int main() + { + __m256i a = _mm256_set_epi32(1, 2, 3, 4, 5, 6, 7, 8); + __m256i b = _mm256_set_epi32(8, 7, 6, 5, 4, 3, 2, 1); + __m256i result = _mm256_add_epi32(a, b); + + __m256 result_f16c = _mm256_cvtph_ps(_mm_set_epi16(1, 2, 3, 4, 5, 6, 7, 8)); + + __m256 result_fma = _mm256_fmadd_ps( + _mm256_set_ps(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0), + _mm256_set_ps(8.0, 7.0, 6.0, 5.0, 4.0, 3.0, 2.0, 1.0), + _mm256_set_ps(1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0) + ); + + return 0; + } +") +check_cxx_source_compiles("${AVX2_CODE}" COMPILER_SUPPORTS_AVX2) + +set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") +unset(_cmake_required_flags_orig) + +if(__universal_build) + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + unset(_cmake_osx_architectures_orig) + unset(__universal_build) +endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportAVX512.cmake b/share/cmake/utils/CheckSupportAVX512.cmake new file mode 100644 index 0000000000..ea9007904a --- /dev/null +++ b/share/cmake/utils/CheckSupportAVX512.cmake @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project. + +include(CheckCXXSourceCompiles) + +set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") + +if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") + set(__universal_build 1) + set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") +endif() + +if(MSVC) + set(CMAKE_REQUIRED_FLAGS "/w /arch:AVX512") +elseif(USE_GCC OR USE_CLANG) + set(CMAKE_REQUIRED_FLAGS "-w -mavx512f") +endif() + +if (APPLE AND __universal_build) + # Force the test to build under x86_64 + set(CMAKE_OSX_ARCHITECTURES "x86_64") + # Apple has an automatic translation layer from SSE/AVX to ARM Neon.
+endif() + +set(AVX512_CODE " + #include + + int main() { + __m512i vec = _mm512_set1_epi32(42); + return 0; + } +") +check_cxx_source_compiles("${AVX512_CODE}" COMPILER_SUPPORTS_AVX512) + +set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") +unset(_cmake_required_flags_orig) + +if(__universal_build) + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + unset(_cmake_osx_architectures_orig) + unset(__universal_build) +endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportF16C.cmake b/share/cmake/utils/CheckSupportF16C.cmake new file mode 100644 index 0000000000..524b69a44d --- /dev/null +++ b/share/cmake/utils/CheckSupportF16C.cmake @@ -0,0 +1,25 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project. + +include(CheckCXXSourceCompiles) + +set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") + +# MSVC doesn't have flags +if(USE_GCC OR USE_CLANG) + set(CMAKE_REQUIRED_FLAGS "-w -mf16c") +endif() + +set(F16C_CODE " + #include + + int main() + { + _mm_cvtph_ps(_mm_set1_epi16(0x3C00)); + return 0; + } +") +check_cxx_source_compiles("${F16C_CODE}" COMPILER_SUPPORTS_F16C) + +set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") +unset(_cmake_required_flags_orig) \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake index 07fecbd7a5..e8c6e181ff 100644 --- a/share/cmake/utils/CheckSupportSSE2.cmake +++ b/share/cmake/utils/CheckSupportSSE2.cmake @@ -4,70 +4,53 @@ include(CheckCXXSourceCompiles) set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") -set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}") -set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") -if(NOT CMAKE_SIZE_OF_VOID_P EQUAL 8) - # As CheckCXXCompilerFlag implicitly uses CMAKE_CXX_FLAGS some custom flags could trigger - # unrelated warnings causing a detection failure.
So, the code disables all warnings to focus - # on the SSE2 detection. - if(USE_MSVC) - set(CMAKE_REQUIRED_FLAGS "/w /arch:SSE2") - elseif(USE_GCC OR USE_CLANG) - set(CMAKE_REQUIRED_FLAGS "-w -msse2") +if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") + set(__universal_build 1) + set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") +endif() + +if(MSVC) + # x86_64 always has SSE2 + if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "8") + # Simulate the same message we would get by using check_cxx_source_compiles. + message(STATUS "x86_64 always support SSE2 - COMPILER_SUPPORTS_SSE2 - Success") + # By setting the variable to 1, the check_cxx_source_compiles will be skipped automatically. + set(COMPILER_SUPPORTS_SSE2 1) + else() + check_cxx_compiler_flag("/arch:SSE2" COMPILER_SUPPORTS_SSE2) endif() +elseif(USE_GCC OR USE_CLANG) + set(CMAKE_REQUIRED_FLAGS "-w -msse2") +endif() + +if (APPLE AND __universal_build) + # Force the test to build under x86_64 + set(CMAKE_OSX_ARCHITECTURES "x86_64") + # Apple has an automatic translation layer from SSE/AVX to ARM Neon.
endif() +set(SSE2_CODE " + #include -macro(check_sse2_availability _check_sse2_header_ _check_output_var_name_) - set(_SSE2_TEST_SOURCE_CODE " - ${_check_sse2_header_} - int main () - { + int main() + { __m128d a, b; double vals[2] = {0}; a = _mm_loadu_pd (vals); b = _mm_add_pd (a,a); _mm_storeu_pd (vals,b); return (0); - }") - - check_cxx_source_compiles ("${_SSE2_TEST_SOURCE_CODE}" ${_check_output_var_name_}) - mark_as_advanced(${_check_output_var_name_}) -endmacro() - -if(NOT HAVE_NEON) - check_sse2_availability("#include " HAVE_SSE2) -elseif(APPLE AND HAVE_NEON) - # Test for both supported architectures - # x86_64 and arm64 - set(ARCHITECTURES_LIST "arm64;x86_64") - - message(STATUS "Checking SSE2 support using SSE2NEON library for arm64 and x86_64 architectures") - foreach (current_arch IN LISTS ARCHITECTURES_LIST) - - set (CMAKE_OSX_ARCHITECTURES "${current_arch}") - - if(current_arch STREQUAL arm64) - set(CMAKE_REQUIRED_INCLUDES ${sse2neon_INCLUDE_DIR}) - set(_sse2_header_ "#include ") - set(_output_var_name_ "HAVE_SSE2_WITH_SSE2NEON") - elseif(current_arch STREQUAL x86_64) - set(_sse2_header_ "#include ") - set(_output_var_name_ "HAVE_SSE2") - endif() - - check_sse2_availability("${_sse2_header_}" ${_output_var_name_}) - - endforeach() -endif() + } +") +check_cxx_source_compiles("${SSE2_CODE}" COMPILER_SUPPORTS_SSE2) set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -set(CMAKE_REQUIRED_INCLUDES "${_cmake_required_includes_orig}") -set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") - unset(_cmake_required_flags_orig) -unset(_cmake_required_includes_orig) -unset(_cmake_osx_architectures_orig) - +if(__universal_build) + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + unset(_cmake_osx_architectures_orig) + unset(__universal_build) +endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportSSE3.cmake b/share/cmake/utils/CheckSupportSSE3.cmake new file mode 100644 index 0000000000..2ccb156553 ---
/dev/null +++ b/share/cmake/utils/CheckSupportSSE3.cmake @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project. + +include(CheckCXXSourceCompiles) + +set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") + +if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") + set(__universal_build 1) + set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") +endif() + +# MSVC doesn't have flags +if(USE_GCC OR USE_CLANG) + set(CMAKE_REQUIRED_FLAGS "-w -msse3") +endif() + +if (APPLE AND __universal_build) + # Force the test to build under x86_64 + set(CMAKE_OSX_ARCHITECTURES "x86_64") + # Apple has an automatic translation layer from SSE/AVX to ARM Neon. +endif() + +set(SSE3_CODE " + #include + + int main() + { + _mm_addsub_ps(_mm_setzero_ps(), _mm_setzero_ps()); + return 0; + } +") +check_cxx_source_compiles("${SSE3_CODE}" COMPILER_SUPPORTS_SSE3) + +set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") +unset(_cmake_required_flags_orig) + +if(__universal_build) + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + unset(_cmake_osx_architectures_orig) + unset(__universal_build) +endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportSSE4.cmake b/share/cmake/utils/CheckSupportSSE4.cmake new file mode 100644 index 0000000000..c45c9eab13 --- /dev/null +++ b/share/cmake/utils/CheckSupportSSE4.cmake @@ -0,0 +1,43 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project.
+ +include(CheckCXXSourceCompiles) + +set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") + +if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") + set(__universal_build 1) + set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") +endif() + +# MSVC doesn't have flags +if(USE_GCC OR USE_CLANG) + set(CMAKE_REQUIRED_FLAGS "-w -msse4") +endif() + +if (APPLE AND __universal_build) + # Force the test to build under x86_64 + set(CMAKE_OSX_ARCHITECTURES "x86_64") + # Apple has an automatic translation layer from SSE/AVX to ARM Neon. +endif() + +set(SSE4_CODE " + #include + + int main() + { + _mm_blend_epi16(_mm_setzero_si128(), _mm_setzero_si128(), 0); + return 0; + } +") +check_cxx_source_compiles("${SSE4_CODE}" COMPILER_SUPPORTS_SSE4) + +set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") +unset(_cmake_required_flags_orig) + +if(__universal_build) + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + unset(_cmake_osx_architectures_orig) + unset(__universal_build) +endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportSSE42.cmake b/share/cmake/utils/CheckSupportSSE42.cmake new file mode 100644 index 0000000000..6f7486cc1a --- /dev/null +++ b/share/cmake/utils/CheckSupportSSE42.cmake @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: BSD-3-Clause +# Copyright Contributors to the OpenColorIO Project.
+
+include(CheckCXXSourceCompiles)
+
+set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}")
+
+if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64"
+    OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64"))
+    set(__universal_build 1)
+    set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
+endif()
+
+# MSVC doesn't have a flag for this; only GCC/Clang get one.
+if(USE_GCC OR USE_CLANG)
+    set(CMAKE_REQUIRED_FLAGS "-w -msse4.2")
+endif()
+
+if (APPLE AND __universal_build)
+    # Force the test to build under x86_64 only (restored after the check).
+    set(CMAKE_OSX_ARCHITECTURES "x86_64")
+    # Apple has an automatic translation layer from SSE/AVX to ARM Neon.
+endif()
+
+set(SSE42_CODE "
+    #include <nmmintrin.h>
+
+    int main()
+    {
+        _mm_cmpgt_epi64(_mm_set_epi64x(5, 10), _mm_set_epi64x(8, 5));
+        return 0;
+    }
+")
+check_cxx_source_compiles("${SSE42_CODE}" COMPILER_SUPPORTS_SSE42)
+
+
+set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}")
+unset(_cmake_required_flags_orig)
+
+if(__universal_build)
+    set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
+    unset(_cmake_osx_architectures_orig)
+    unset(__universal_build)
+endif()
\ No newline at end of file
diff --git a/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake b/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake
new file mode 100644
index 0000000000..2259ad2f0a
--- /dev/null
+++ b/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake
@@ -0,0 +1,62 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+include(CheckCXXSourceCompiles)
+
+set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}")
+set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}")
+set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
+
+# Compiling using CMAKE_OSX_ARCHITECTURES="arm64" will return SUCCESS.
+# Compiling using CMAKE_OSX_ARCHITECTURES="x86_64" will return FAILED.
+# Compiling using CMAKE_OSX_ARCHITECTURES="arm64;x86_64" will return FAILED.
+
+if(APPLE AND COMPILER_SUPPORTS_ARM_NEON)
+
+    set(CMAKE_REQUIRED_INCLUDES ${sse2neon_INCLUDE_DIR})
+
+    if("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR
+       "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")
+        # Universal build (both arm64 and x86_64 requested).
+        # Force the test to build under arm64
+        set(CMAKE_OSX_ARCHITECTURES "arm64")
+    endif()
+
+    set(SSE2NEON_CODE "
+        #include <sse2neon.h>
+
+        int main()
+        {
+            // SSE2
+            __m128d a, b;
+            double vals[2] = {0};
+            a = _mm_loadu_pd (vals);
+            b = _mm_add_pd (a,a);
+            _mm_storeu_pd (vals,b);
+
+            // SSE3
+            _mm_addsub_ps(_mm_setzero_ps(), _mm_setzero_ps());
+
+            // SSSE3
+            _mm_shuffle_epi8(_mm_setzero_si128(), _mm_setzero_si128());
+
+            // SSE4
+            _mm_blend_epi16(_mm_setzero_si128(), _mm_setzero_si128(), 0);
+
+            // SSE 4.2
+            _mm_cmpgt_epi64(_mm_set_epi64x(5, 10), _mm_set_epi64x(8, 5));
+
+            return (0);
+        }
+    ")
+    check_cxx_source_compiles("${SSE2NEON_CODE}" COMPILER_SUPPORTS_SSE_WITH_SSE2NEON)
+endif()
+
+set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}")
+set(CMAKE_REQUIRED_INCLUDES "${_cmake_required_includes_orig}")
+set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
+
+unset(_cmake_required_flags_orig)
+unset(_cmake_required_includes_orig)
+unset(_cmake_osx_architectures_orig)
+
diff --git a/share/cmake/utils/CheckSupportSSSE3.cmake b/share/cmake/utils/CheckSupportSSSE3.cmake
new file mode 100644
index 0000000000..5171e6f594
--- /dev/null
+++ b/share/cmake/utils/CheckSupportSSSE3.cmake
@@ -0,0 +1,43 @@
+# SPDX-License-Identifier: BSD-3-Clause
+# Copyright Contributors to the OpenColorIO Project.
+
+include(CheckCXXSourceCompiles)
+
+set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}")
+
+if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64"
+    OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64"))
+    set(__universal_build 1)
+    set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
+endif()
+
+# MSVC doesn't have a flag for this; only GCC/Clang get one.
+if(USE_GCC OR USE_CLANG)
+    set(CMAKE_REQUIRED_FLAGS "-w -mssse3")
+endif()
+
+if (APPLE AND __universal_build)
+    # Force the test to build under x86_64 only (restored after the check).
+    set(CMAKE_OSX_ARCHITECTURES "x86_64")
+    # Apple has an automatic translation layer from SSE/AVX to ARM Neon.
+endif()
+
+set(SSSE3_CODE "
+    #include <tmmintrin.h>
+
+    int main()
+    {
+        _mm_shuffle_epi8(_mm_setzero_si128(), _mm_setzero_si128());
+        return 0;
+    }
+")
+check_cxx_source_compiles("${SSSE3_CODE}" COMPILER_SUPPORTS_SSSE3)
+
+set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}")
+unset(_cmake_required_flags_orig)
+
+if(__universal_build)
+    set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
+    unset(_cmake_osx_architectures_orig)
+    unset(__universal_build)
+endif()
\ No newline at end of file
diff --git a/share/cmake/utils/CheckSupportX86SIMD.cmake b/share/cmake/utils/CheckSupportX86SIMD.cmake index 7a73fac73e..c8390b2938 100644 --- a/share/cmake/utils/CheckSupportX86SIMD.cmake +++ b/share/cmake/utils/CheckSupportX86SIMD.cmake @@ -5,86 +5,95 @@ ############################################################################### # Check if compiler supports X86 SIMD extensions +# Please note that some compilers could ignore unknown compiler flags and +# return SUCCESS even if the options are not supported. +# We could test the SSE with a small snippet of code. + +# Using a small code snippet to test each set. It is more robust than using only compiler +# flags because some compilers might ignore the flags and check_cxx_compiler_flag could +# return a false positive.
+ +message(STATUS "...AVX") +include(CheckSupportAVX) +include(CheckSupportAVX2) +include(CheckSupportAVX512) + +include(CheckSupportSSE42) +include(CheckSupportSSE4) +include(CheckSupportSSSE3) +include(CheckSupportSSE3) +include(CheckSupportSSE2) +include(CheckSupportF16C) + if(MSVC) - # x86_64 always has SSE2 - if("${CMAKE_SIZEOF_VOID_P}" STREQUAL "8") - set(COMPILER_SUPPORTS_SSE2 1) - else() - check_cxx_compiler_flag("/arch:SSE2" COMPILER_SUPPORTS_SSE2) + if (COMPILER_SUPPORTS_SSE2) set(OCIO_SSE2_ARGS "/arch:SSE2") endif() - check_cxx_compiler_flag("/arch:AVX" COMPILER_SUPPORTS_AVX) - check_cxx_compiler_flag("/arch:AVX2" COMPILER_SUPPORTS_AVX2) - check_cxx_compiler_flag("/arch:AVX512" COMPILER_SUPPORTS_AVX512) - # MSVC doesn't have flags for these, if AVX available assume they are too - set(COMPILER_SUPPORTS_SSE42 ${COMPILER_SUPPORTS_AVX}) - set(COMPILER_SUPPORTS_SSE4 ${COMPILER_SUPPORTS_AVX}) - set(COMPILER_SUPPORTS_SSSE3 ${COMPILER_SUPPORTS_AVX}) - set(COMPILER_SUPPORTS_SSE3 ${COMPILER_SUPPORTS_AVX}) - set(COMPILER_SUPPORTS_F16C ${COMPILER_SUPPORTS_AVX}) - - set(OCIO_AVX_ARGS "/arch:AVX") - set(OCIO_AVX2_ARGS "/arch:AVX2") + if (COMPILER_SUPPORTS_AVX) + set(OCIO_AVX_ARGS "/arch:AVX") + endif() + + if (COMPILER_SUPPORTS_AVX2) + set(OCIO_AVX2_ARGS "/arch:AVX2") + endif() else() - check_cxx_compiler_flag("-msse2" COMPILER_SUPPORTS_SSE2) - check_cxx_compiler_flag("-msse3" COMPILER_SUPPORTS_SSE3) - check_cxx_compiler_flag("-mssse3" COMPILER_SUPPORTS_SSSE3) - check_cxx_compiler_flag("-msse4" COMPILER_SUPPORTS_SSE4) - check_cxx_compiler_flag("-msse4.2" COMPILER_SUPPORTS_SSE42) - check_cxx_compiler_flag("-mavx" COMPILER_SUPPORTS_AVX) - check_cxx_compiler_flag("-mavx2 -mfma -mf16c" CCOMPILER_SUPPORTS_AVX2) - check_cxx_compiler_flag("-mavx512f" COMPILER_SUPPORTS_AVX512) - check_cxx_compiler_flag("-mf16c" COMPILER_SUPPORTS_F16C) - - set(OCIO_SSE2_ARGS "-msse2") - set(OCIO_AVX_ARGS "-mavx") - set(OCIO_AVX2_ARGS "-mavx2" "-mfma") + if (COMPILER_SUPPORTS_SSE2) + 
set(OCIO_SSE2_ARGS "-msse2") + endif() + + if (COMPILER_SUPPORTS_AVX) + set(OCIO_AVX_ARGS "-mavx") + endif() + + if (COMPILER_SUPPORTS_AVX2) + set(OCIO_AVX2_ARGS "-mavx2" "-mfma") + endif() endif() if(${OCIO_USE_AVX512} AND NOT ${COMPILER_SUPPORTS_AVX512}) message(STATUS "OCIO_USE_AVX512 requested but compiler does not support, disabling") - set(OCIO_USE_AVX512 0) + set(OCIO_USE_AVX512 OFF) endif() if(${OCIO_USE_AVX2} AND NOT ${COMPILER_SUPPORTS_AVX2}) message(STATUS "OCIO_USE_AVX2 requested but compiler does not support, disabling") - set(OCIO_USE_AVX2 0) + set(OCIO_USE_AVX2 OFF) endif() if(${OCIO_USE_AVX} AND NOT ${COMPILER_SUPPORTS_AVX}) message(STATUS "OCIO_USE_AVX requested but compiler does not support, disabling") - set(OCIO_USE_AVX 0) + set(OCIO_USE_AVX OFF) endif() if(${OCIO_USE_SSE42} AND NOT ${COMPILER_SUPPORTS_SSE42}) message(STATUS "OCIO_USE_SSE42 requested but compiler does not support, disabling") - set(OCIO_USE_SSE42 0) + set(OCIO_USE_SSE42 OFF) endif() if(${OCIO_USE_SSE4} AND NOT ${COMPILER_SUPPORTS_SSE4}) message(STATUS "OCIO_USE_SSE4 requested but compiler does not support, disabling") - set(OCIO_USE_SSE4 0) + set(OCIO_USE_SSE4 OFF) endif() if(${OCIO_USE_SSSE3} AND NOT ${COMPILER_SUPPORTS_SSSE3}) message(STATUS "OCIO_USE_SSSE3 requested but compiler does not support, disabling") - set(OCIO_USE_SSSE3 0) + set(OCIO_USE_SSSE3 OFF) endif() if(${OCIO_USE_SSE3} AND NOT ${COMPILER_SUPPORTS_SSE3}) message(STATUS "OCIO_USE_SSE3 requested but compiler does not support, disabling") - set(OCIO_USE_SSE3 0) + set(OCIO_USE_SSE3 OFF) endif() if(${OCIO_USE_SSE2} AND NOT ${COMPILER_SUPPORTS_SSE2}) message(STATUS "OCIO_USE_SSE2 requested but compiler does not support, disabling") - set(OCIO_USE_SSE2 0) + set(OCIO_USE_SSE2 OFF) endif() if(${OCIO_USE_F16C} AND NOT ${COMPILER_SUPPORTS_F16C}) message(STATUS "OCIO_USE_F16C requested but compiler does not support, disabling") - set(OCIO_USE_F16C 0) + set(OCIO_USE_F16C OFF) endif() if(${OCIO_USE_F16C}) diff --git 
a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake index 1511693a51..364c6f2109 100644 --- a/share/cmake/utils/CompilerFlags.cmake +++ b/share/cmake/utils/CompilerFlags.cmake @@ -9,17 +9,38 @@ set(PLATFORM_COMPILE_OPTIONS "") set(PLATFORM_LINK_OPTIONS "") ############################################################################### -# Define if SSE2 can be used. +# Verify SIMD compatibility if(OCIO_USE_SIMD) - include(CheckSupportSSE2) -endif() + if (OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_ARM_NEON) + include(CheckSupportSSEUsingSSE2NEON) + + if(NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) + set(OCIO_USE_SSE2NEON OFF) + endif() + endif() -if(NOT HAVE_SSE2 AND NOT HAVE_SSE2_WITH_SSE2NEON) - message(STATUS "Disabling SSE optimizations, as the target doesn't support them") - set(OCIO_USE_SIMD OFF) + include(CheckSupportX86SIMD) +else() + set(OCIO_USE_SSE2 OFF) + set(OCIO_USE_SSE3 OFF) + set(OCIO_USE_SSSE3 OFF) + set(OCIO_USE_SSE4 OFF) + set(OCIO_USE_SSE42 OFF) + set(OCIO_USE_AVX OFF) + set(OCIO_USE_AVX2 OFF) + set(OCIO_USE_AVX512 OFF) + set(OCIO_USE_F16C OFF) + + set(OCIO_USE_SSE2NEON OFF) endif() +#TODOCED Does not make sense anymore as we have AVX and AVX2 support now. +# if(NOT COMPILER_SUPPORTS_SSE2 AND NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) +# message(STATUS "Disabling SSE optimizations, as the target doesn't support them") +# set(OCIO_USE_SIMD OFF) +# endif() + ############################################################################### # Compile flags @@ -104,33 +125,6 @@ set_unless_defined(CMAKE_CXX_VISIBILITY_PRESET hidden) set_unless_defined(CMAKE_VISIBILITY_INLINES_HIDDEN YES) -############################################################################### -# Define if SSE2 can be used. 
- - -message(STATUS "") -message(STATUS "Checking for SSE2 support...") -include(CheckSupportSSE2) - -if(NOT HAVE_SSE2) - message(STATUS "Disabling SSE optimizations, as the target doesn't support them") - set(OCIO_USE_SSE OFF) -endif(NOT HAVE_SSE2) - -if(OCIO_USE_SSE) - include(CheckSupportX86SIMD) -else() - set(OCIO_USE_SSE2 OFF) - set(OCIO_USE_SSE3 OFF) - set(OCIO_USE_SSSE3 OFF) - set(OCIO_USE_SSE4 OFF) - set(OCIO_USE_SSE42 OFF) - set(OCIO_USE_AVX OFF) - set(OCIO_USE_AVX2 OFF) - set(OCIO_USE_AVX512 OFF) - set(OCIO_USE_F16C OFF) -endif() - ############################################################################### # Define RPATH. diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt index 46fbd45670..34da7156f4 100755 --- a/src/OpenColorIO/CMakeLists.txt +++ b/src/OpenColorIO/CMakeLists.txt @@ -311,7 +311,7 @@ target_link_libraries(OpenColorIO MINIZIP::minizip-ng ) -if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON) +if(OCIO_USE_SIMD AND OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) target_link_libraries(OpenColorIO PRIVATE $) endif() @@ -346,19 +346,12 @@ if(BUILD_SHARED_LIBS OR (OCIO_BUILD_PYTHON AND UNIX)) endif() if(OCIO_USE_SIMD) - if(HAVE_SSE2) + if(COMPILER_SUPPORTS_SSE2) target_compile_definitions(OpenColorIO PRIVATE USE_SSE ) endif() - - if(HAVE_SSE2_WITH_SSE2NEON) - target_compile_definitions(OpenColorIO - PRIVATE - USE_SSE2_WITH_SSE2NEON - ) - endif() endif() if(MSVC AND BUILD_TYPE_DEBUG AND BUILD_SHARED_LIBS) diff --git a/src/OpenColorIO/CPUInfoConfig.h.in b/src/OpenColorIO/CPUInfoConfig.h.in index 725094c3ad..cae9cb3003 100644 --- a/src/OpenColorIO/CPUInfoConfig.h.in +++ b/src/OpenColorIO/CPUInfoConfig.h.in @@ -13,3 +13,9 @@ #cmakedefine01 OCIO_USE_AVX2 #cmakedefine01 OCIO_USE_AVX512 #cmakedefine01 OCIO_USE_F16C + + +#cmakedefine01 APPLE +#if APPLE + #cmakedefine01 OCIO_USE_SSE2NEON +#endif \ No newline at end of file diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h index 549a24dee2..a56bf9c8fd 100644 --- 
a/src/OpenColorIO/SSE.h +++ b/src/OpenColorIO/SSE.h @@ -6,7 +6,7 @@ #define INCLUDED_OCIO_SSE_H -#if defined(USE_SSE) || defined(USE_SSE2_WITH_SSE2NEON) +#if defined(USE_SSE) || defined(OCIO_USE_SSE2NEON) // Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). #if !defined(__aarch64__) @@ -15,7 +15,7 @@ #endif #elif defined(__aarch64__) // ARM architecture A64 (ARM64) - #if defined(USE_SSE2_WITH_SSE2NEON) + #if defined(OCIO_USE_SSE2NEON) #include #endif #endif @@ -31,7 +31,7 @@ namespace OCIO_NAMESPACE // it is redefining two of the functions from sse2neon. #if defined(__aarch64__) - #if defined(USE_SSE2_WITH_SSE2NEON) + #if defined(OCIO_USE_SSE2NEON) // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior. diff --git a/src/OpenColorIO/SSE2.h b/src/OpenColorIO/SSE2.h index 8f7592cab7..21a2064e46 100644 --- a/src/OpenColorIO/SSE2.h +++ b/src/OpenColorIO/SSE2.h @@ -8,7 +8,18 @@ #include "CPUInfo.h" #ifdef OCIO_USE_SSE2 -#include +// Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). 
+#if !defined(__aarch64__) + #if defined(USE_SSE) + #include + #endif +#elif defined(__aarch64__) && defined(USE_SSE2_WITH_SSE2NEON) + // ARM architecture A64 (ARM64) + #if defined(USE_SSE2_WITH_SSE2NEON) + #include + #endif +#endif + #include #include diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt index 5d43331da6..e5b602e3d4 100755 --- a/tests/cpu/CMakeLists.txt +++ b/tests/cpu/CMakeLists.txt @@ -26,7 +26,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) xxHash ) - if(OCIO_USE_SIMD AND HAVE_SSE2_WITH_SSE2NEON) + if(OCIO_USE_SIMD AND OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) target_link_libraries(${TEST_BINARY} PRIVATE sse2neon) endif() @@ -49,19 +49,12 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) endif(PRIVATE_INCLUDES) if(OCIO_USE_SIMD) - if (HAVE_SSE2) + if (COMPILER_SUPPORTS_SSE2) target_compile_definitions(${TEST_BINARY} PRIVATE USE_SSE ) endif() - - if(HAVE_SSE2_WITH_SSE2NEON) - target_compile_definitions(${TEST_BINARY} - PRIVATE - USE_SSE2_WITH_SSE2NEON - ) - endif() endif(OCIO_USE_SIMD) if(WIN32) From 036ed40cf12d89e8a921480efbd487bf3cbe3227 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Mon, 21 Aug 2023 12:55:21 -0400 Subject: [PATCH 03/22] Removing the usage of USE_SSE in favor of the new OCIO_USE_SSE2 as they serve the same purpose MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- share/cmake/utils/CheckSupportX86SIMD.cmake | 1 - src/OpenColorIO/CMakeLists.txt | 9 ---- src/OpenColorIO/CPUInfoConfig.h.in | 6 +-- src/OpenColorIO/SSE.h | 10 ++-- src/OpenColorIO/SSE2.h | 10 ++-- src/OpenColorIO/ops/cdl/CDLOpCPU.cpp | 24 +++++----- .../ExposureContrastOpCPU.cpp | 10 ++-- src/OpenColorIO/ops/gamma/GammaOpCPU.cpp | 48 +++++++++---------- .../gradingprimary/GradingPrimaryOpCPU.cpp | 28 +++++------ .../gradingrgbcurve/GradingRGBCurveOpCPU.cpp | 6 +-- .../ops/gradingtone/GradingToneOpCPU.cpp | 6 +-- 
src/OpenColorIO/ops/log/LogOpCPU.cpp | 44 ++++++++--------- src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp | 2 +- src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp | 10 ++-- src/OpenColorIO/ops/matrix/MatrixOpCPU.cpp | 4 +- tests/cpu/CMakeLists.txt | 9 ---- tests/cpu/SSE_tests.cpp | 2 +- tests/cpu/UnitTestUtils.h | 1 + .../cpu/ops/allocation/AllocationOp_tests.cpp | 2 +- tests/cpu/ops/cdl/CDLOp_tests.cpp | 14 +++--- tests/cpu/ops/gamma/GammaOpCPU_tests.cpp | 20 ++++---- .../GradingPrimaryOpCPU_tests.cpp | 8 ++-- .../GradingRGBCurveOpCPU_tests.cpp | 8 ++-- .../gradingtone/GradingToneOpCPU_tests.cpp | 8 ++-- tests/cpu/ops/log/LogOpCPU_tests.cpp | 36 +++++++------- tests/cpu/ops/log/LogOp_tests.cpp | 4 +- tests/gpu/CMakeLists.txt | 7 --- tests/gpu/GPUUnitTest.h | 2 + tests/gpu/GammaOp_test.cpp | 20 ++++---- tests/gpu/LogOp_test.cpp | 2 +- tests/osl/CMakeLists.txt | 7 --- 31 files changed, 166 insertions(+), 202 deletions(-) diff --git a/share/cmake/utils/CheckSupportX86SIMD.cmake b/share/cmake/utils/CheckSupportX86SIMD.cmake index c8390b2938..7de95aab1d 100644 --- a/share/cmake/utils/CheckSupportX86SIMD.cmake +++ b/share/cmake/utils/CheckSupportX86SIMD.cmake @@ -13,7 +13,6 @@ # flags because some compilers might ignore the flags and check_cxx_compiler_flag could # return a false positive. 
-message(STATUS "...AVX") include(CheckSupportAVX) include(CheckSupportAVX2) include(CheckSupportAVX512) diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt index 34da7156f4..c49f61a9b4 100755 --- a/src/OpenColorIO/CMakeLists.txt +++ b/src/OpenColorIO/CMakeLists.txt @@ -345,15 +345,6 @@ if(BUILD_SHARED_LIBS OR (OCIO_BUILD_PYTHON AND UNIX)) set_property(TARGET OpenColorIO PROPERTY POSITION_INDEPENDENT_CODE ON) endif() -if(OCIO_USE_SIMD) - if(COMPILER_SUPPORTS_SSE2) - target_compile_definitions(OpenColorIO - PRIVATE - USE_SSE - ) - endif() -endif() - if(MSVC AND BUILD_TYPE_DEBUG AND BUILD_SHARED_LIBS) set_target_properties(OpenColorIO PROPERTIES PDB_NAME ${PROJECT_NAME}${OCIO_LIBNAME_SUFFIX}_${OpenColorIO_VERSION_MAJOR}_${OpenColorIO_VERSION_MINOR} diff --git a/src/OpenColorIO/CPUInfoConfig.h.in b/src/OpenColorIO/CPUInfoConfig.h.in index cae9cb3003..68fdb55831 100644 --- a/src/OpenColorIO/CPUInfoConfig.h.in +++ b/src/OpenColorIO/CPUInfoConfig.h.in @@ -14,8 +14,4 @@ #cmakedefine01 OCIO_USE_AVX512 #cmakedefine01 OCIO_USE_F16C - -#cmakedefine01 APPLE -#if APPLE - #cmakedefine01 OCIO_USE_SSE2NEON -#endif \ No newline at end of file +#cmakedefine01 OCIO_USE_SSE2NEON \ No newline at end of file diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h index a56bf9c8fd..049c8ad20b 100644 --- a/src/OpenColorIO/SSE.h +++ b/src/OpenColorIO/SSE.h @@ -5,17 +5,17 @@ #ifndef INCLUDED_OCIO_SSE_H #define INCLUDED_OCIO_SSE_H - -#if defined(USE_SSE) || defined(OCIO_USE_SSE2NEON) +#include "CPUInfoConfig.h" +#if OCIO_USE_SSE2 || OCIO_USE_SSE2NEON // Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). 
#if !defined(__aarch64__) - #if defined(USE_SSE) + #if OCIO_USE_SSE2 #include #endif #elif defined(__aarch64__) // ARM architecture A64 (ARM64) - #if defined(OCIO_USE_SSE2NEON) + #if OCIO_USE_SSE2NEON #include #endif #endif @@ -645,7 +645,7 @@ inline void sseSinCos(const float x, float& sin_x, float& cos_x) } // namespace OCIO_NAMESPACE -#endif // USE_SSE +#endif // OCIO_USE_SSE2 #endif // INCLUDED_OCIO_SSE_H diff --git a/src/OpenColorIO/SSE2.h b/src/OpenColorIO/SSE2.h index 21a2064e46..15d76d8908 100644 --- a/src/OpenColorIO/SSE2.h +++ b/src/OpenColorIO/SSE2.h @@ -6,16 +6,14 @@ #define INCLUDED_OCIO_SSE2_H #include "CPUInfo.h" -#ifdef OCIO_USE_SSE2 +#if OCIO_USE_SSE2 // Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). #if !defined(__aarch64__) - #if defined(USE_SSE) - #include - #endif -#elif defined(__aarch64__) && defined(USE_SSE2_WITH_SSE2NEON) + #include +#elif defined(__aarch64__) // ARM architecture A64 (ARM64) - #if defined(USE_SSE2_WITH_SSE2NEON) + #if OCIO_USE_SSE2NEON #include #endif #endif diff --git a/src/OpenColorIO/ops/cdl/CDLOpCPU.cpp b/src/OpenColorIO/ops/cdl/CDLOpCPU.cpp index 467e2db828..982e992f14 100644 --- a/src/OpenColorIO/ops/cdl/CDLOpCPU.cpp +++ b/src/OpenColorIO/ops/cdl/CDLOpCPU.cpp @@ -99,7 +99,7 @@ void RenderParams::update(ConstCDLOpDataRcPtr & cdl) } -#ifdef USE_SSE +#if OCIO_USE_SSE2 static const __m128 LumaWeights = _mm_setr_ps(0.2126f, 0.7152f, 0.0722f, 0.0); @@ -170,7 +170,7 @@ inline void ApplySaturation(__m128& pix, const __m128 saturation) pix = _mm_add_ps(luma, _mm_mul_ps(saturation, _mm_sub_ps(pix, luma))); } -#endif // USE_SSE +#endif // OCIO_USE_SSE2 inline void ApplyScale(float * pix, const float scale) { @@ -283,7 +283,7 @@ class CDLRendererFwd : public CDLOpCPU virtual void apply(const void * inImg, void * outImg, long numPixels) const; }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 template class CDLRendererFwdSSE : public CDLRendererFwd { @@ -309,7 +309,7 @@ class CDLRendererRev : public 
CDLOpCPU virtual void apply(const void * inImg, void * outImg, long numPixels) const; }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 template class CDLRendererRevSSE : public CDLRendererRev { @@ -329,7 +329,7 @@ CDLOpCPU::CDLOpCPU(ConstCDLOpDataRcPtr & cdl) m_renderParams.update(cdl); } -#ifdef USE_SSE +#if OCIO_USE_SSE2 void LoadRenderParams(const RenderParams & renderParams, __m128 & slope, __m128 & offset, @@ -343,7 +343,7 @@ void LoadRenderParams(const RenderParams & renderParams, } #endif -#ifdef USE_SSE +#if OCIO_USE_SSE2 template void CDLRendererFwdSSE::apply(const void * inImg, void * outImg, long numPixels) const { @@ -406,7 +406,7 @@ void CDLRendererFwd::apply(const void * inImg, void * outImg, long numPix } } -#ifdef USE_SSE +#if OCIO_USE_SSE2 template void CDLRendererRevSSE::apply(const void * inImg, void * outImg, long numPixels) const { @@ -472,31 +472,31 @@ void CDLRendererRev::apply(const void * inImg, void * outImg, long numPix // clamp (when needed). So by default, the following will only get called when power is not 1. 
ConstOpCPURcPtr GetCDLCPURenderer(ConstCDLOpDataRcPtr & cdl, bool fastPower) { -#ifndef USE_SSE +#if OCIO_USE_SSE2 == 0 std::ignore = fastPower; #endif switch(cdl->getStyle()) { case CDLOpData::CDL_V1_2_FWD: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastPower) return std::make_shared>(cdl); else #endif return std::make_shared>(cdl); case CDLOpData::CDL_NO_CLAMP_FWD: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastPower) return std::make_shared>(cdl); else #endif return std::make_shared>(cdl); case CDLOpData::CDL_V1_2_REV: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastPower) return std::make_shared>(cdl); else #endif return std::make_shared>(cdl); case CDLOpData::CDL_NO_CLAMP_REV: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastPower) return std::make_shared>(cdl); else #endif diff --git a/src/OpenColorIO/ops/exposurecontrast/ExposureContrastOpCPU.cpp b/src/OpenColorIO/ops/exposurecontrast/ExposureContrastOpCPU.cpp index 800fc4035d..3a0fc67754 100644 --- a/src/OpenColorIO/ops/exposurecontrast/ExposureContrastOpCPU.cpp +++ b/src/OpenColorIO/ops/exposurecontrast/ExposureContrastOpCPU.cpp @@ -174,7 +174,7 @@ void ECLinearRenderer::apply(const void * inImg, void * outImg, long numPixels) } else { -#ifdef USE_SSE +#if OCIO_USE_SSE2 __m128 contrast = _mm_set1_ps(contrastVal); __m128 exposure_over_pivot = _mm_set1_ps(exposureVal / m_pivot); __m128 piv = _mm_set1_ps(m_pivot); @@ -274,7 +274,7 @@ void ECLinearRevRenderer::apply(const void * inImg, void * outImg, long numPixel } else { -#ifdef USE_SSE +#if OCIO_USE_SSE2 __m128 inv_contrast = _mm_set1_ps(invContrastVal); const float pivotOverExposureVal = m_pivot * invExposureVal; @@ -384,7 +384,7 @@ void ECVideoRenderer::apply(const void * inImg, void * outImg, long numPixels) c } else { -#ifdef USE_SSE +#if OCIO_USE_SSE2 __m128 contrast = _mm_set1_ps(contrastVal); __m128 exposure_over_pivot = _mm_set1_ps(exposureVal / m_pivot); __m128 piv = _mm_set1_ps(m_pivot); @@ -489,7 +489,7 @@ void ECVideoRevRenderer::apply(const void * inImg, void * 
outImg, long numPixels } else { -#ifdef USE_SSE +#if OCIO_USE_SSE2 __m128 inv_contrast = _mm_set1_ps(invContrastVal); __m128 pivot_over_exposure = _mm_set1_ps(pivotOverExposureVal); __m128 inv_pivot = _mm_set1_ps(invPivotVal); @@ -578,7 +578,7 @@ void ECLogarithmicRenderer::apply(const void * inImg, void * outImg, long numPix const float * in = (float *)inImg; float * out = (float *)outImg; -#ifdef USE_SSE +#if OCIO_USE_SSE2 // Equation is: // out = ( (in + expos) - pivot ) * contrast + pivot // Rearrange as: diff --git a/src/OpenColorIO/ops/gamma/GammaOpCPU.cpp b/src/OpenColorIO/ops/gamma/GammaOpCPU.cpp index 84b06c701e..ace0006ca8 100644 --- a/src/OpenColorIO/ops/gamma/GammaOpCPU.cpp +++ b/src/OpenColorIO/ops/gamma/GammaOpCPU.cpp @@ -40,7 +40,7 @@ class GammaBasicOpCPU : public OpCPU float m_alpGamma; }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class GammaBasicOpCPUSSE : public GammaBasicOpCPU { public: @@ -63,7 +63,7 @@ class GammaBasicMirrorOpCPU : public GammaBasicOpCPU void apply(const void * inImg, void * outImg, long numPixels) const override; }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class GammaBasicMirrorOpCPUSSE : public GammaBasicMirrorOpCPU { public: @@ -86,7 +86,7 @@ class GammaBasicPassThruOpCPU : public GammaBasicOpCPU void apply(const void * inImg, void * outImg, long numPixels) const override; }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class GammaBasicPassThruOpCPUSSE : public GammaBasicPassThruOpCPU { public: @@ -122,7 +122,7 @@ class GammaMoncurveOpCPUFwd : public GammaMoncurveOpCPU void update(ConstGammaOpDataRcPtr & gamma); }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class GammaMoncurveOpCPUFwdSSE : public GammaMoncurveOpCPUFwd { public: @@ -147,7 +147,7 @@ class GammaMoncurveOpCPURev : public GammaMoncurveOpCPU }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class GammaMoncurveOpCPURevSSE : public GammaMoncurveOpCPURev { public: @@ -171,7 +171,7 @@ class GammaMoncurveMirrorOpCPUFwd : public GammaMoncurveOpCPU void update(ConstGammaOpDataRcPtr & gamma); }; -#ifdef USE_SSE 
+#if OCIO_USE_SSE2 class GammaMoncurveMirrorOpCPUFwdSSE : public GammaMoncurveMirrorOpCPUFwd { public: @@ -195,7 +195,7 @@ class GammaMoncurveMirrorOpCPURev : public GammaMoncurveOpCPU void update(ConstGammaOpDataRcPtr & gamma); }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class GammaMoncurveMirrorOpCPURevSSE : public GammaMoncurveMirrorOpCPURev { public: @@ -210,7 +210,7 @@ class GammaMoncurveMirrorOpCPURevSSE : public GammaMoncurveMirrorOpCPURev ConstOpCPURcPtr GetGammaRenderer(ConstGammaOpDataRcPtr & gamma, bool fastPower) { -#ifndef USE_SSE +#if OCIO_USE_SSE2 == 0 std::ignore = fastPower; #endif @@ -218,7 +218,7 @@ ConstOpCPURcPtr GetGammaRenderer(ConstGammaOpDataRcPtr & gamma, bool fastPower) { case GammaOpData::MONCURVE_FWD: { -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastPower) return std::make_shared(gamma); else #endif @@ -228,7 +228,7 @@ ConstOpCPURcPtr GetGammaRenderer(ConstGammaOpDataRcPtr & gamma, bool fastPower) case GammaOpData::MONCURVE_REV: { -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastPower) return std::make_shared(gamma); else #endif @@ -238,7 +238,7 @@ ConstOpCPURcPtr GetGammaRenderer(ConstGammaOpDataRcPtr & gamma, bool fastPower) case GammaOpData::MONCURVE_MIRROR_FWD: { -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastPower) return std::make_shared(gamma); else #endif @@ -248,7 +248,7 @@ ConstOpCPURcPtr GetGammaRenderer(ConstGammaOpDataRcPtr & gamma, bool fastPower) case GammaOpData::MONCURVE_MIRROR_REV: { -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastPower) return std::make_shared(gamma); else #endif @@ -259,7 +259,7 @@ ConstOpCPURcPtr GetGammaRenderer(ConstGammaOpDataRcPtr & gamma, bool fastPower) case GammaOpData::BASIC_FWD: case GammaOpData::BASIC_REV: { -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastPower) return std::make_shared(gamma); else #endif @@ -269,7 +269,7 @@ ConstOpCPURcPtr GetGammaRenderer(ConstGammaOpDataRcPtr & gamma, bool fastPower) case GammaOpData::BASIC_MIRROR_FWD: case GammaOpData::BASIC_MIRROR_REV: { -#ifdef USE_SSE +#if OCIO_USE_SSE2 if 
(fastPower) return std::make_shared(gamma); else #endif @@ -279,7 +279,7 @@ ConstOpCPURcPtr GetGammaRenderer(ConstGammaOpDataRcPtr & gamma, bool fastPower) case GammaOpData::BASIC_PASS_THRU_FWD: case GammaOpData::BASIC_PASS_THRU_REV: { -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastPower) return std::make_shared(gamma); else #endif @@ -317,7 +317,7 @@ void GammaBasicOpCPU::update(ConstGammaOpDataRcPtr & gamma) m_alpGamma = (float)(forward ? gamma->getAlphaParams()[0] : 1. / gamma->getAlphaParams()[0]); } -#ifdef USE_SSE +#if OCIO_USE_SSE2 void GammaBasicOpCPUSSE::apply(const void * inImg, void * outImg, long numPixels) const { const float * in = (const float *)inImg; @@ -337,7 +337,7 @@ void GammaBasicOpCPUSSE::apply(const void * inImg, void * outImg, long numPixels out += 4; } } -#endif // USE_SSE +#endif // OCIO_USE_SSE2 void GammaBasicOpCPU::apply(const void * inImg, void * outImg, long numPixels) const { @@ -366,7 +366,7 @@ GammaBasicMirrorOpCPU::GammaBasicMirrorOpCPU(ConstGammaOpDataRcPtr & gamma) { } -#ifdef USE_SSE +#if OCIO_USE_SSE2 void GammaBasicMirrorOpCPUSSE::apply(const void * inImg, void * outImg, long numPixels) const { const float * in = (const float *)inImg; @@ -418,7 +418,7 @@ GammaBasicPassThruOpCPU::GammaBasicPassThruOpCPU(ConstGammaOpDataRcPtr & gamma) { } -#ifdef USE_SSE +#if OCIO_USE_SSE2 void GammaBasicPassThruOpCPUSSE::apply(const void * inImg, void * outImg, long numPixels) const { const float * in = (const float *)inImg; @@ -480,7 +480,7 @@ void GammaMoncurveOpCPUFwd::update(ConstGammaOpDataRcPtr & gamma) ComputeParamsFwd(gamma->getAlphaParams(), m_alpha); } -#ifdef USE_SSE +#if OCIO_USE_SSE2 void GammaMoncurveOpCPUFwdSSE::apply(const void * inImg, void * outImg, long numPixels) const { const float * in = (const float *)inImg; @@ -520,7 +520,7 @@ void GammaMoncurveOpCPUFwdSSE::apply(const void * inImg, void * outImg, long num out += 4; } } -#endif // USE_SSE +#endif // OCIO_USE_SSE2 void GammaMoncurveOpCPUFwd::apply(const void * inImg, void * 
outImg, long numPixels) const { @@ -569,7 +569,7 @@ void GammaMoncurveOpCPURev::update(ConstGammaOpDataRcPtr & gamma) ComputeParamsRev(gamma->getAlphaParams(), m_alpha); } -#ifdef USE_SSE +#if OCIO_USE_SSE2 void GammaMoncurveOpCPURevSSE::apply(const void * inImg, void * outImg, long numPixels) const { const float * in = (const float *)inImg; @@ -658,7 +658,7 @@ void GammaMoncurveMirrorOpCPUFwd::update(ConstGammaOpDataRcPtr & gamma) ComputeParamsFwd(gamma->getAlphaParams(), m_alpha); } -#ifdef USE_SSE +#if OCIO_USE_SSE2 void GammaMoncurveMirrorOpCPUFwdSSE::apply(const void * inImg, void * outImg, long numPixels) const { const float * in = (const float *)inImg; @@ -754,7 +754,7 @@ void GammaMoncurveMirrorOpCPURev::update(ConstGammaOpDataRcPtr & gamma) ComputeParamsRev(gamma->getAlphaParams(), m_alpha); } -#ifdef USE_SSE +#if OCIO_USE_SSE2 void GammaMoncurveMirrorOpCPURevSSE::apply(const void * inImg, void * outImg, long numPixels) const { const float * in = (const float *)inImg; diff --git a/src/OpenColorIO/ops/gradingprimary/GradingPrimaryOpCPU.cpp b/src/OpenColorIO/ops/gradingprimary/GradingPrimaryOpCPU.cpp index 5596794d0f..0398f75d6c 100644 --- a/src/OpenColorIO/ops/gradingprimary/GradingPrimaryOpCPU.cpp +++ b/src/OpenColorIO/ops/gradingprimary/GradingPrimaryOpCPU.cpp @@ -119,7 +119,7 @@ class GradingPrimaryVidRevOpCPU : public GradingPrimaryVidFwdOpCPU /////////////////////////////////////////////////////////////////////////////// -#ifdef USE_SSE +#if OCIO_USE_SSE2 inline void ApplyContrast(__m128 & pix, const __m128 contrast, const __m128 pivot) { @@ -239,7 +239,7 @@ inline void ApplyClamp(float * pix, float clampMin, float clampMax) // pix[0] = Clamp(pix[0], clampMin, clampMax); // Default values that should not clamp will change clamp. 
} -#endif // USE_SSE +#endif // OCIO_USE_SSE2 /////////////////////////////////////////////////////////////////////////////// @@ -269,7 +269,7 @@ void GradingPrimaryLogFwdOpCPU::apply(const void * inImg, void * outImg, long nu const bool isGammaIdentity = comp.isGammaIdentity(); -#ifdef USE_SSE +#if OCIO_USE_SSE2 const __m128 brightness = _mm_set_ps(0.f, comp.getBrightness()[2], comp.getBrightness()[1], comp.getBrightness()[0]); @@ -421,7 +421,7 @@ void GradingPrimaryLogFwdOpCPU::apply(const void * inImg, void * outImg, long nu out += 4; } } -#endif // USE_SSE +#endif // OCIO_USE_SSE2 } GradingPrimaryLogRevOpCPU::GradingPrimaryLogRevOpCPU(ConstGradingPrimaryOpDataRcPtr & gp) @@ -448,7 +448,7 @@ void GradingPrimaryLogRevOpCPU::apply(const void * inImg, void * outImg, long nu const bool isGammaIdentity = comp.isGammaIdentity(); -#ifdef USE_SSE +#if OCIO_USE_SSE2 const __m128 brightnessInv = _mm_set_ps(0.f, comp.getBrightness()[2], comp.getBrightness()[1], comp.getBrightness()[0]); @@ -595,7 +595,7 @@ void GradingPrimaryLogRevOpCPU::apply(const void * inImg, void * outImg, long nu out += 4; } } -#endif // USE_SSE +#endif // OCIO_USE_SSE2 } GradingPrimaryLinFwdOpCPU::GradingPrimaryLinFwdOpCPU(ConstGradingPrimaryOpDataRcPtr & gp) @@ -622,7 +622,7 @@ void GradingPrimaryLinFwdOpCPU::apply(const void * inImg, void * outImg, long nu const bool isContrastIdentity = comp.isContrastIdentity(); -#ifdef USE_SSE +#if OCIO_USE_SSE2 const __m128 offset = _mm_set_ps(0.f, comp.getOffset()[2], comp.getOffset()[1], comp.getOffset()[0]); @@ -769,7 +769,7 @@ void GradingPrimaryLinFwdOpCPU::apply(const void * inImg, void * outImg, long nu out += 4; } } -#endif // USE_SSE +#endif // OCIO_USE_SSE2 } GradingPrimaryLinRevOpCPU::GradingPrimaryLinRevOpCPU(ConstGradingPrimaryOpDataRcPtr & gp) @@ -796,7 +796,7 @@ void GradingPrimaryLinRevOpCPU::apply(const void * inImg, void * outImg, long nu const bool isContrastIdentity = comp.isContrastIdentity(); -#ifdef USE_SSE +#if OCIO_USE_SSE2 const 
__m128 offsetInv = _mm_set_ps(0.f, comp.getOffset()[2], comp.getOffset()[1], comp.getOffset()[0]); @@ -937,7 +937,7 @@ void GradingPrimaryLinRevOpCPU::apply(const void * inImg, void * outImg, long nu out += 4; } } -#endif // USE_SSE +#endif // OCIO_USE_SSE2 } GradingPrimaryVidFwdOpCPU::GradingPrimaryVidFwdOpCPU(ConstGradingPrimaryOpDataRcPtr & gp) @@ -964,7 +964,7 @@ void GradingPrimaryVidFwdOpCPU::apply(const void * inImg, void * outImg, long nu const bool isGammaIdentity = comp.isGammaIdentity(); -#ifdef USE_SSE +#if OCIO_USE_SSE2 const __m128 offset = _mm_set_ps(0.f, comp.getOffset()[2], comp.getOffset()[1], comp.getOffset()[0]); @@ -1117,7 +1117,7 @@ void GradingPrimaryVidFwdOpCPU::apply(const void * inImg, void * outImg, long nu } } -#endif // USE_SSE +#endif // OCIO_USE_SSE2 } GradingPrimaryVidRevOpCPU::GradingPrimaryVidRevOpCPU(ConstGradingPrimaryOpDataRcPtr & gp) @@ -1144,7 +1144,7 @@ void GradingPrimaryVidRevOpCPU::apply(const void * inImg, void * outImg, long nu const bool isGammaIdentity = comp.isGammaIdentity(); -#ifdef USE_SSE +#if OCIO_USE_SSE2 const __m128 offsetInv = _mm_set_ps(0.f, comp.getOffset()[2], comp.getOffset()[1], comp.getOffset()[0]); @@ -1287,7 +1287,7 @@ void GradingPrimaryVidRevOpCPU::apply(const void * inImg, void * outImg, long nu out += 4; } } -#endif // USE_SSE +#endif // OCIO_USE_SSE2 } } // Anonymous namespace diff --git a/src/OpenColorIO/ops/gradingrgbcurve/GradingRGBCurveOpCPU.cpp b/src/OpenColorIO/ops/gradingrgbcurve/GradingRGBCurveOpCPU.cpp index 90dcf1bba4..11a31b0215 100644 --- a/src/OpenColorIO/ops/gradingrgbcurve/GradingRGBCurveOpCPU.cpp +++ b/src/OpenColorIO/ops/gradingrgbcurve/GradingRGBCurveOpCPU.cpp @@ -158,7 +158,7 @@ namespace LogLinConstants static constexpr float gain = 363.034608563f; static constexpr float offs = -7.f; static constexpr float ybrk = -5.5f; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const __m128 mxbrk = _mm_set1_ps(xbrk); const __m128 mshift = _mm_set1_ps(shift); const __m128 mm = _mm_set1_ps(m); @@ 
-175,7 +175,7 @@ namespace LogLinConstants inline void LinLog(const float * in, float * out) { -#ifdef USE_SSE +#if OCIO_USE_SSE2 __m128 pix = _mm_loadu_ps(in); __m128 flag = _mm_cmpgt_ps(pix, LogLinConstants::mxbrk); @@ -206,7 +206,7 @@ inline void LinLog(const float * in, float * out) inline void LogLin(float * out) { -#ifdef USE_SSE +#if OCIO_USE_SSE2 __m128 pix = _mm_loadu_ps(out); __m128 flag = _mm_cmpgt_ps(pix, LogLinConstants::mybrk); diff --git a/src/OpenColorIO/ops/gradingtone/GradingToneOpCPU.cpp b/src/OpenColorIO/ops/gradingtone/GradingToneOpCPU.cpp index e84ef15923..9e6d751b56 100644 --- a/src/OpenColorIO/ops/gradingtone/GradingToneOpCPU.cpp +++ b/src/OpenColorIO/ops/gradingtone/GradingToneOpCPU.cpp @@ -1062,7 +1062,7 @@ namespace LogLinConstants static constexpr float gain = 363.034608563f; static constexpr float offs = -7.f; static constexpr float ybrk = -5.5f; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const __m128 mxbrk = _mm_set1_ps(xbrk); const __m128 mshift = _mm_set1_ps(shift); const __m128 mm = _mm_set1_ps(m); @@ -1079,7 +1079,7 @@ namespace LogLinConstants inline void LinLog(const float * in, float * out) { -#ifdef USE_SSE +#if OCIO_USE_SSE2 __m128 pix = _mm_loadu_ps(in); __m128 flag = _mm_cmpgt_ps(pix, LogLinConstants::mxbrk); @@ -1110,7 +1110,7 @@ inline void LinLog(const float * in, float * out) inline void LogLin(float * out) { -#ifdef USE_SSE +#if OCIO_USE_SSE2 __m128 pix = _mm_loadu_ps(out); __m128 flag = _mm_cmpgt_ps(pix, LogLinConstants::mybrk); diff --git a/src/OpenColorIO/ops/log/LogOpCPU.cpp b/src/OpenColorIO/ops/log/LogOpCPU.cpp index dac776a4b7..bed3c9d5a1 100644 --- a/src/OpenColorIO/ops/log/LogOpCPU.cpp +++ b/src/OpenColorIO/ops/log/LogOpCPU.cpp @@ -4,7 +4,7 @@ #include #include #include -#ifndef USE_SSE +#if OCIO_USE_SSE2 == 0 #include #endif @@ -66,7 +66,7 @@ class Log2LinRenderer : public L2LBaseRenderer float m_minv[3]; }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class Log2LinRendererSSE : public Log2LinRenderer { public: @@ -93,7 +93,7 
@@ class Lin2LogRenderer : public L2LBaseRenderer float m_kb[3]; }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class Lin2LogRendererSSE : public Lin2LogRenderer { public: @@ -136,7 +136,7 @@ class CameraLog2LinRenderer : public CameraL2LBaseRenderer float m_minuslino[3]; }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class CameraLog2LinRendererSSE : public CameraLog2LinRenderer { public: @@ -164,7 +164,7 @@ class CameraLin2LogRenderer : public CameraL2LBaseRenderer float m_linb[3]; }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class CameraLin2LogRendererSSE : public CameraLin2LogRenderer { public: @@ -186,7 +186,7 @@ class LogRenderer : public LogOpCPU float m_logScale; }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class LogRendererSSE : public LogRenderer { public: @@ -208,7 +208,7 @@ class AntiLogRenderer : public LogOpCPU float m_log2_base; }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 class AntiLogRendererSSE : public AntiLogRenderer { public: @@ -223,7 +223,7 @@ static constexpr float LOG10_2 = ((float) 0.3010299956639811952137388947245); ConstOpCPURcPtr GetLogRenderer(ConstLogOpDataRcPtr & log, bool fastExp) { -#ifndef USE_SSE +#if OCIO_USE_SSE2 == 0 std::ignore = fastExp; #endif @@ -233,14 +233,14 @@ ConstOpCPURcPtr GetLogRenderer(ConstLogOpDataRcPtr & log, bool fastExp) switch (dir) { case TRANSFORM_DIR_FORWARD: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastExp) return std::make_shared(log, 1.0f); else #endif return std::make_shared(log, 1.0f); break; case TRANSFORM_DIR_INVERSE: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastExp) return std::make_shared(log, 1.0f); else #endif @@ -253,14 +253,14 @@ ConstOpCPURcPtr GetLogRenderer(ConstLogOpDataRcPtr & log, bool fastExp) switch (dir) { case TRANSFORM_DIR_FORWARD: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastExp) return std::make_shared(log, LOG10_2); else #endif return std::make_shared(log, LOG10_2); break; case TRANSFORM_DIR_INVERSE: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastExp) return std::make_shared(log, LOG2_10); else #endif @@ -275,14 +275,14 @@ 
ConstOpCPURcPtr GetLogRenderer(ConstLogOpDataRcPtr & log, bool fastExp) switch (dir) { case TRANSFORM_DIR_FORWARD: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastExp) return std::make_shared(log); else #endif return std::make_shared(log); break; case TRANSFORM_DIR_INVERSE: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastExp) return std::make_shared(log); else #endif @@ -295,14 +295,14 @@ ConstOpCPURcPtr GetLogRenderer(ConstLogOpDataRcPtr & log, bool fastExp) switch (dir) { case TRANSFORM_DIR_FORWARD: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastExp) return std::make_shared(log); else #endif return std::make_shared(log); break; case TRANSFORM_DIR_INVERSE: -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (fastExp) return std::make_shared(log); else #endif @@ -413,7 +413,7 @@ void LogRenderer::apply(const void * inImg, void * outImg, long numPixels) const } } -#ifdef USE_SSE +#if OCIO_USE_SSE2 LogRendererSSE::LogRendererSSE(ConstLogOpDataRcPtr & log, float logScale) : LogRenderer(log, logScale) { @@ -481,7 +481,7 @@ void AntiLogRenderer::apply(const void * inImg, void * outImg, long numPixels) c } } -#ifdef USE_SSE +#if OCIO_USE_SSE2 AntiLogRendererSSE::AntiLogRendererSSE(ConstLogOpDataRcPtr & log, float log2base) : AntiLogRenderer(log, log2base) { @@ -571,7 +571,7 @@ void Log2LinRenderer::apply(const void * inImg, void * outImg, long numPixels) c } } -#ifdef USE_SSE +#if OCIO_USE_SSE2 Log2LinRendererSSE::Log2LinRendererSSE(ConstLogOpDataRcPtr & log) : Log2LinRenderer(log) { @@ -673,7 +673,7 @@ void Lin2LogRenderer::apply(const void * inImg, void * outImg, long numPixels) c } } -#ifdef USE_SSE +#if OCIO_USE_SSE2 Lin2LogRendererSSE::Lin2LogRendererSSE(ConstLogOpDataRcPtr & log) : Lin2LogRenderer(log) { @@ -801,7 +801,7 @@ void CameraLog2LinRenderer::apply(const void * inImg, void * outImg, long numPix } } -#ifdef USE_SSE +#if OCIO_USE_SSE2 CameraLog2LinRendererSSE::CameraLog2LinRendererSSE(ConstLogOpDataRcPtr & log) : CameraLog2LinRenderer(log) { @@ -919,7 +919,7 @@ void 
CameraLin2LogRenderer::apply(const void * inImg, void * outImg, long numPix } } -#ifdef USE_SSE +#if OCIO_USE_SSE2 CameraLin2LogRendererSSE::CameraLin2LogRendererSSE(ConstLogOpDataRcPtr & log) : CameraLin2LogRenderer(log) { diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp index 196137cec2..7dfaab8daf 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp @@ -898,7 +898,7 @@ void Lut1DRendererHueAdjust::apply(const void * inImg, void * outIm = orig_chroma == 0.f ? 0.f : (RGB[mid] - RGB[min]) / orig_chroma; -#ifdef USE_SSE +#if OCIO_USE_SSE2 __m128 idx = _mm_mul_ps(_mm_set_ps(in[3], RGB[2], diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp index e4bb6f4715..80fe753c0c 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp @@ -193,7 +193,7 @@ class InvLut3DRenderer : public OpCPU }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 //---------------------------------------------------------------------------- // RGB channel ordering. @@ -316,7 +316,7 @@ BaseLut3DRenderer::BaseLut3DRenderer(ConstLut3DOpDataRcPtr & lut) BaseLut3DRenderer::~BaseLut3DRenderer() { -#ifdef USE_SSE +#if OCIO_USE_SSE2 Platform::AlignedFree(m_optLut); #else free(m_optLut); @@ -329,7 +329,7 @@ void BaseLut3DRenderer::updateData(ConstLut3DOpDataRcPtr & lut) m_step = ((float)m_dim - 1.0f); -#ifdef USE_SSE +#if OCIO_USE_SSE2 Platform::AlignedFree(m_optLut); m_components = 4; #else @@ -339,7 +339,7 @@ void BaseLut3DRenderer::updateData(ConstLut3DOpDataRcPtr & lut) m_optLut = createOptLut(lut->getArray().getValues()); } -#ifdef USE_SSE +#if OCIO_USE_SSE2 // Creates a LUT aligned to a 16 byte boundary with RGB and 0 for alpha // in order to be able to load the LUT using _mm_load_ps. 
float* BaseLut3DRenderer::createOptLut(const Array::Values& lut) const @@ -629,7 +629,7 @@ void Lut3DRenderer::apply(const void * inImg, void * outImg, long numPixels) con const float * in = (const float *)inImg; float * out = (float *)outImg; -#ifdef USE_SSE +#if OCIO_USE_SSE2 __m128 step = _mm_set1_ps(m_step); __m128 maxIdx = _mm_set1_ps((float)(m_dim - 1)); diff --git a/src/OpenColorIO/ops/matrix/MatrixOpCPU.cpp b/src/OpenColorIO/ops/matrix/MatrixOpCPU.cpp index d539355539..a30e0d68ec 100644 --- a/src/OpenColorIO/ops/matrix/MatrixOpCPU.cpp +++ b/src/OpenColorIO/ops/matrix/MatrixOpCPU.cpp @@ -211,7 +211,7 @@ void MatrixWithOffsetRenderer::apply(const void * inImg, void * outImg, long num const float * in = (const float *)inImg; float * out = (float *)outImg; -#ifdef USE_SSE +#if OCIO_USE_SSE2 // Matrix decomposition per _column. __m128 m0 = _mm_set_ps(m_column1[3], m_column1[2], @@ -325,7 +325,7 @@ void MatrixRenderer::apply(const void * inImg, void * outImg, long numPixels) co const float * in = (const float *)inImg; float * out = (float *)outImg; -#ifdef USE_SSE +#if OCIO_USE_SSE2 // Matrix decomposition per _column. __m128 m0 = _mm_set_ps(m_column1[3], m_column1[2], diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt index e5b602e3d4..ebe3bf20c1 100755 --- a/tests/cpu/CMakeLists.txt +++ b/tests/cpu/CMakeLists.txt @@ -48,15 +48,6 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) ) endif(PRIVATE_INCLUDES) - if(OCIO_USE_SIMD) - if (COMPILER_SUPPORTS_SSE2) - target_compile_definitions(${TEST_BINARY} - PRIVATE - USE_SSE - ) - endif() - endif(OCIO_USE_SIMD) - if(WIN32) # A windows application linking to eXpat static libraries must # have the global macro XML_STATIC defined diff --git a/tests/cpu/SSE_tests.cpp b/tests/cpu/SSE_tests.cpp index 52b9fee38e..a08e25f69c 100644 --- a/tests/cpu/SSE_tests.cpp +++ b/tests/cpu/SSE_tests.cpp @@ -1,7 +1,7 @@ // SPDX-License-Identifier: BSD-3-Clause // Copyright Contributors to the OpenColorIO Project. 
-#ifdef USE_SSE +#if OCIO_USE_SSE2 #include diff --git a/tests/cpu/UnitTestUtils.h b/tests/cpu/UnitTestUtils.h index 0fa467e9d0..4c65b0f521 100644 --- a/tests/cpu/UnitTestUtils.h +++ b/tests/cpu/UnitTestUtils.h @@ -19,6 +19,7 @@ #include "Op.h" #include "Platform.h" #include "pystring/pystring.h" +#include "CPUInfoConfig.h" namespace OCIO_NAMESPACE { diff --git a/tests/cpu/ops/allocation/AllocationOp_tests.cpp b/tests/cpu/ops/allocation/AllocationOp_tests.cpp index ce2fe8ab8e..b48bf14bde 100644 --- a/tests/cpu/ops/allocation/AllocationOp_tests.cpp +++ b/tests/cpu/ops/allocation/AllocationOp_tests.cpp @@ -64,7 +64,7 @@ OCIO_ADD_TEST(AllocationOps, create) OCIO::ConstOpRcPtr defaultLogOp = ops[0]; OCIO::ConstOpRcPtr defaultFitOp = ops[1]; -#ifndef USE_SSE +#if OCIO_USE_SSE2 == 0 const float error = 1e-6f; #else const float error = 2e-5f; diff --git a/tests/cpu/ops/cdl/CDLOp_tests.cpp b/tests/cpu/ops/cdl/CDLOp_tests.cpp index 20e3e73380..f4e2220511 100644 --- a/tests/cpu/ops/cdl/CDLOp_tests.cpp +++ b/tests/cpu/ops/cdl/CDLOp_tests.cpp @@ -301,7 +301,7 @@ OCIO_ADD_TEST(CDLOp, apply_clamp_fwd) CDL_DATA_1::slope, CDL_DATA_1::offset, CDL_DATA_1::power, CDL_DATA_1::saturation, OCIO::CDLOpData::CDL_V1_2_FWD, -#ifdef USE_SSE +#if OCIO_USE_SSE2 4e-6f); #else 2e-6f); @@ -340,7 +340,7 @@ OCIO_ADD_TEST(CDLOp, apply_clamp_rev) CDL_DATA_1::slope, CDL_DATA_1::offset, CDL_DATA_1::power, CDL_DATA_1::saturation, OCIO::CDLOpData::CDL_V1_2_REV, -#ifdef USE_SSE +#if OCIO_USE_SSE2 9e-6f); #else 1e-5f); @@ -379,7 +379,7 @@ OCIO_ADD_TEST(CDLOp, apply_noclamp_fwd) CDL_DATA_1::slope, CDL_DATA_1::offset, CDL_DATA_1::power, CDL_DATA_1::saturation, OCIO::CDLOpData::CDL_NO_CLAMP_FWD, -#ifdef USE_SSE +#if OCIO_USE_SSE2 2e-5f); #else 2e-6f); @@ -418,7 +418,7 @@ OCIO_ADD_TEST(CDLOp, apply_noclamp_rev) CDL_DATA_1::slope, CDL_DATA_1::offset, CDL_DATA_1::power, CDL_DATA_1::saturation, OCIO::CDLOpData::CDL_NO_CLAMP_REV, -#ifdef USE_SSE +#if OCIO_USE_SSE2 3e-5f); #else 1e-6f); @@ -459,7 +459,7 @@ 
OCIO_ADD_TEST(CDLOp, apply_clamp_fwd_2) CDL_DATA_2::slope, CDL_DATA_2::offset, CDL_DATA_2::power, CDL_DATA_2::saturation, OCIO::CDLOpData::CDL_V1_2_FWD, -#ifdef USE_SSE +#if OCIO_USE_SSE2 7e-6f); #else 1e-6f); @@ -534,7 +534,7 @@ OCIO_ADD_TEST(CDLOp, apply_clamp_fwd_3) CDL_DATA_3::slope, CDL_DATA_3::offset, CDL_DATA_3::power, CDL_DATA_3::saturation, OCIO::CDLOpData::CDL_V1_2_FWD, -#ifdef USE_SSE +#if OCIO_USE_SSE2 2e-5f); #else 1e-6f); @@ -601,7 +601,7 @@ OCIO_ADD_TEST(CDLOp, apply_noclamp_fwd_3) CDL_DATA_3::slope, CDL_DATA_3::offset, CDL_DATA_3::power, CDL_DATA_3::saturation, OCIO::CDLOpData::CDL_NO_CLAMP_FWD, -#ifdef USE_SSE +#if OCIO_USE_SSE2 5e-6f); #else 1e-6f); diff --git a/tests/cpu/ops/gamma/GammaOpCPU_tests.cpp b/tests/cpu/ops/gamma/GammaOpCPU_tests.cpp index 275639a0e6..467e1029e0 100644 --- a/tests/cpu/ops/gamma/GammaOpCPU_tests.cpp +++ b/tests/cpu/ops/gamma/GammaOpCPU_tests.cpp @@ -75,7 +75,7 @@ OCIO_ADD_TEST(GammaOpCPU, apply_basic_style_fwd) // In OCIO v2, the behavior does *not* depend on the gamma. 
const std::vector gammaVals = { 1.2, 2.12, 1., 1.05 }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float expected_32f[numPixels*4] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.00005f, 0.48297336f, @@ -139,7 +139,7 @@ OCIO_ADD_TEST(GammaOpCPU, apply_basic_style_rev) const std::vector gammaVals = { 1.2, 2.12, 1.123, 1.05 }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float expected_32f[numPixels*4] = { 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.00014792f, 0.51678240f, @@ -205,7 +205,7 @@ OCIO_ADD_TEST(GammaOpCPU, apply_basic_mirror_style_fwd) const std::vector gammaVals = { 1.2, 2.12, 1.123, 1.05 }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float expected_32f[numPixels * 4] = { 0.00010933f, 0.00001323f, 0.03458935f, 0.73928129f, -0.00010933f, -0.00001323f, -0.03458935f, -0.73928129f, @@ -297,7 +297,7 @@ OCIO_ADD_TEST(GammaOpCPU, apply_basic_mirror_style_rev) const std::vector gammaVals = { 1.2, 2.12, 1.123, 1.05 }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float expected_32f[numPixels * 4] = { 0.00177476f, 0.08215060f, 0.06941742f, 0.76033723f, -0.00177476f, -0.08215060f, -0.06941742f, -0.76033723f, @@ -388,7 +388,7 @@ OCIO_ADD_TEST(GammaOpCPU, apply_basic_pass_thru_style_fwd) const std::vector gammaVals = { 1.2, 2.12, 1.123, 1.05 }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float expected_32f[numPixels * 4] = { 0.00010933f, 0.00001323f, 0.03458935f, 0.73928129f, input_32f[04], input_32f[05], input_32f[06], input_32f[07], @@ -467,7 +467,7 @@ OCIO_ADD_TEST(GammaOpCPU, apply_basic_pass_thru_style_rev) const std::vector gammaVals = { 1.2, 2.12, 1.123, 1.05 }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float expected_32f[numPixels * 4] = { 0.00177476f, 0.08215060f, 0.06941742f, 0.76033723f, input_32f[04], input_32f[05], input_32f[06], input_32f[07], @@ -542,7 +542,7 @@ OCIO_ADD_TEST(GammaOpCPU, apply_moncurve_style_fwd) 1.005f, 1.05f, 1.5f, -0.25f, -inf, inf, qnan, 0.0f }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float expected_32f[numPixels*4] = { -0.07738016f, -0.33144456f, -0.25f, 0.0f, 
-0.00019345f, 0.0f, 0.00005f, 0.49101364f, @@ -597,7 +597,7 @@ OCIO_ADD_TEST(GammaOpCPU, apply_moncurve_style_rev) 1.005f, 1.05f, 1.5f, -0.25f, -inf, inf, qnan, 0.0f }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float expected_32f[numPixels*4] = { -6.18606853f, -1.69711625f, -0.25f, 0.0f, -0.01546517f, 0.0f, 0.00005f, 0.50915080f, @@ -654,7 +654,7 @@ OCIO_ADD_TEST(GammaOpCPU, apply_moncurve_mirror_style_fwd) -1.005f, -1.05f, -1.5f, -1.0f, -inf, inf, qnan, 0.0f }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float expected_32f[numPixels * 4] = { 0.00003869f, 0.00220963f, 0.04081632f, 0.73652046f, -0.00003869f, -0.00220963f, -0.04081632f, -0.73652046f, @@ -715,7 +715,7 @@ OCIO_ADD_TEST(GammaOpCPU, apply_moncurve_mirror_style_rev) -1.005f, -1.05f, -1.5f, -1.0f, -inf, inf, qnan, 0.0f }; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float expected_32f[numPixels * 4] = { 0.00309303f, 0.01131410f, 0.06125000f, 0.76366448f, -0.00309303f, -0.01131410f, -0.06125000f, -0.76366448f, diff --git a/tests/cpu/ops/gradingprimary/GradingPrimaryOpCPU_tests.cpp b/tests/cpu/ops/gradingprimary/GradingPrimaryOpCPU_tests.cpp index b0ea847d94..14535386f8 100644 --- a/tests/cpu/ops/gradingprimary/GradingPrimaryOpCPU_tests.cpp +++ b/tests/cpu/ops/gradingprimary/GradingPrimaryOpCPU_tests.cpp @@ -13,11 +13,11 @@ namespace { void ValidateImage(const float * expected, const float * res, long numPix, unsigned line) { -#ifdef USE_SSE +#if OCIO_USE_SSE2 static constexpr float error = 1e-4f; #else static constexpr float error = 1e-6f; -#endif // USE_SSE +#endif // OCIO_USE_SSE2 for (long i = 0; i < numPix; ++i) { @@ -25,11 +25,11 @@ void ValidateImage(const float * expected, const float * res, long numPix, unsig { if (OCIO::IsNan(expected[i * 4 + j])) { -#ifdef USE_SSE +#if OCIO_USE_SSE2 // Do not test nan in SSE mode. 
#else OCIO_CHECK_ASSERT(OCIO::IsNan(res[i * 4 + j])); -#endif // USE_SSE +#endif // OCIO_USE_SSE2 } else if (expected[i * 4 + j] != res[i * 4 + j]) { diff --git a/tests/cpu/ops/gradingrgbcurve/GradingRGBCurveOpCPU_tests.cpp b/tests/cpu/ops/gradingrgbcurve/GradingRGBCurveOpCPU_tests.cpp index 5882f33f58..cd9005cd3f 100644 --- a/tests/cpu/ops/gradingrgbcurve/GradingRGBCurveOpCPU_tests.cpp +++ b/tests/cpu/ops/gradingrgbcurve/GradingRGBCurveOpCPU_tests.cpp @@ -13,11 +13,11 @@ namespace { void ValidateImage(const float * expected, const float * res, long numPix, unsigned line) { -#ifdef USE_SSE +#if OCIO_USE_SSE2 static constexpr float error = 5e-4f; #else static constexpr float error = 2e-5f; -#endif // USE_SSE +#endif // OCIO_USE_SSE2 for (long i = 0; i < numPix; ++i) { @@ -25,11 +25,11 @@ void ValidateImage(const float * expected, const float * res, long numPix, unsig { if (OCIO::IsNan(expected[i * 4 + j])) { -#ifdef USE_SSE +#if OCIO_USE_SSE2 // Do not test nan in SSE mode. #else OCIO_CHECK_ASSERT(OCIO::IsNan(res[i * 4 + j])); -#endif // USE_SSE +#endif // OCIO_USE_SSE2 } else if (expected[i * 4 + j] != res[i * 4 + j]) { diff --git a/tests/cpu/ops/gradingtone/GradingToneOpCPU_tests.cpp b/tests/cpu/ops/gradingtone/GradingToneOpCPU_tests.cpp index edbbeb7341..625084f4d8 100644 --- a/tests/cpu/ops/gradingtone/GradingToneOpCPU_tests.cpp +++ b/tests/cpu/ops/gradingtone/GradingToneOpCPU_tests.cpp @@ -14,11 +14,11 @@ namespace { void ValidateImage(const float * expected, const float * res, long numPix, unsigned line) { -#ifdef USE_SSE +#if OCIO_USE_SSE2 static constexpr float error = 2e-4f; #else static constexpr float error = 1e-6f; -#endif // USE_SSE +#endif // OCIO_USE_SSE2 for (long i = 0; i < numPix; ++i) { @@ -26,11 +26,11 @@ void ValidateImage(const float * expected, const float * res, long numPix, unsig { if (OCIO::IsNan(expected[i * 4 + j])) { -#ifdef USE_SSE +#if OCIO_USE_SSE2 // Do not test nan in SSE mode. 
#else OCIO_CHECK_ASSERT(OCIO::IsNan(res[i * 4 + j])); -#endif // USE_SSE +#endif // OCIO_USE_SSE2 } else if (expected[i * 4 + j] != res[i * 4 + j]) { diff --git a/tests/cpu/ops/log/LogOpCPU_tests.cpp b/tests/cpu/ops/log/LogOpCPU_tests.cpp index 06e98fe0da..9826f6a1fa 100644 --- a/tests/cpu/ops/log/LogOpCPU_tests.cpp +++ b/tests/cpu/ops/log/LogOpCPU_tests.cpp @@ -35,11 +35,11 @@ void TestLog(float logBase) // LogOpCPU implementation uses optimized logarithm approximation // cannot use strict comparison. -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float error = 5e-5f; #else const float error = 1e-5f; -#endif // USE_SSE +#endif // OCIO_USE_SSE2 for (unsigned i = 0; i < 8; ++i) { @@ -71,7 +71,7 @@ void TestLog(float logBase) // SSE implementation of sseLog2 & sseExp2 do not behave like CPU. // TODO: Address issues with Inf/NaN handling demonstrated by many of the test results below. // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}. -#ifdef USE_SSE +#if OCIO_USE_SSE2 if (logBase == 10.0f) { OCIO_CHECK_CLOSE(rgba[16], 38.53184509f, error); @@ -431,7 +431,7 @@ OCIO_ADD_TEST(LogOpCPU, lin2log_test) OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[15])); // Evaluating output for input rgbaImage[16-19] = {inf, inf, inf, 0.}. -#ifdef USE_SSE +#if OCIO_USE_SSE2 OCIO_CHECK_CLOSE(rgba[16], 10.08598328f, error); #else OCIO_CHECK_EQUAL(rgba[16], inf); @@ -477,11 +477,11 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test) OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true); pRenderer->apply(rgbaImage, rgba, numPixels); -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float error = 1e-6f; #else const float error = 1e-7f; -#endif // USE_SSE +#endif // OCIO_USE_SSE2 // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }. OCIO_CHECK_CLOSE(rgba[0], -0.168771237955f, error); @@ -490,15 +490,15 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test) // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }. 
OCIO_CHECK_CLOSE(rgba[4], 0.047228762045f, error); -#ifdef USE_SSE +#if OCIO_USE_SSE2 OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, 10.0f * error); #else OCIO_CHECK_CLOSE(rgba[5], 0.170878935551f, error); -#endif // USE_SSE +#endif // OCIO_USE_SSE2 OCIO_CHECK_CLOSE(rgba[6], 0.68141615509f, error); // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }. -#ifdef USE_SSE +#if OCIO_USE_SSE2 OCIO_CHECK_EQUAL(rgba[8], -inf); OCIO_CHECK_CLOSE(rgba[9], 26.2f, 10.0f * error); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba[10])); @@ -525,22 +525,22 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test) // Evaluating output for input rgbaImage[4-6] = { 0.08f, 0.16f, 1.16f, ... }. OCIO_CHECK_CLOSE(rgba_nols[4], 0.031554648421f, error); -#ifdef USE_SSE +#if OCIO_USE_SSE2 OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, 10.0f * error); #else OCIO_CHECK_CLOSE(rgba_nols[5], 0.170878935551f, error); -#endif // USE_SSE +#endif // OCIO_USE_SSE2 OCIO_CHECK_CLOSE(rgba_nols[6], 0.68141615509f, error); // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }. OCIO_CHECK_EQUAL(rgba_nols[8], -inf); -#ifdef USE_SSE +#if OCIO_USE_SSE2 OCIO_CHECK_CLOSE(rgba_nols[9], 26.2f, 10.0f * error); OCIO_CHECK_ASSERT(OCIO::IsNan(rgba_nols[10])); #else OCIO_CHECK_EQUAL(rgba_nols[9], inf); OCIO_CHECK_CLOSE(rgba_nols[10], -24.6f, error); -#endif // USE_SSE +#endif // OCIO_USE_SSE2 float rgba_nobreak[numValues] = {}; @@ -552,11 +552,11 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test) OCIO::ConstOpCPURcPtr pRendererNoBreak = OCIO::GetLogRenderer(lognobreak, true); pRendererNoBreak->apply(rgbaImage, rgba_nobreak, numPixels); -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float error2 = 1e-5f; #else const float error2 = 1e-7f; -#endif // USE_SSE +#endif // OCIO_USE_SSE2 // Evaluating output for input rgbaImage[0-2] = { -0.1f, 0.f, 0.01f, ... }. 
OCIO_CHECK_CLOSE(rgba_nobreak[0], -24.6f, error2); @@ -570,7 +570,7 @@ OCIO_ADD_TEST(LogOpCPU, cameralin2log_test) // Evaluating output for input rgbaImage[8-10] = { -inf, inf, qnan, ... }. OCIO_CHECK_CLOSE(rgba_nobreak[8], -24.6f, error2); -#ifdef USE_SSE +#if OCIO_USE_SSE2 OCIO_CHECK_CLOSE(rgba_nobreak[9], 26.2f, error2); #else OCIO_CHECK_EQUAL(rgba_nobreak[9], inf); @@ -596,11 +596,11 @@ OCIO_ADD_TEST(LogOpCPU, cameralog2lin_test) OCIO::ConstOpCPURcPtr pRenderer = OCIO::GetLogRenderer(logOp, true); pRenderer->apply(rgbaImage, rgba, 3); -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float error = 1e-6f; #else const float error = 1e-7f; -#endif // USE_SSE +#endif // OCIO_USE_SSE2 // Evaluating output for input rgbaImage[0-2] = // { -0.168771237955f, -0.048771237955f, -0.036771237955f, ... }. diff --git a/tests/cpu/ops/log/LogOp_tests.cpp b/tests/cpu/ops/log/LogOp_tests.cpp index 49d415b2bd..3b9d02553d 100644 --- a/tests/cpu/ops/log/LogOp_tests.cpp +++ b/tests/cpu/ops/log/LogOp_tests.cpp @@ -186,11 +186,11 @@ OCIO_ADD_TEST(LogOp, inverse) ops[1]->apply(data, 3); -#ifndef USE_SSE +#if OCIO_USE_SSE2 == 0 const float error = 1e-3f; #else const float error = 1e-2f; -#endif // !USE_SSE +#endif // !OCIO_USE_SSE2 for(int i=0; i<12; ++i) { diff --git a/tests/gpu/CMakeLists.txt b/tests/gpu/CMakeLists.txt index 0a2da9ddcb..4e16243bfd 100644 --- a/tests/gpu/CMakeLists.txt +++ b/tests/gpu/CMakeLists.txt @@ -26,13 +26,6 @@ set(SOURCES add_executable(test_gpu_exec ${SOURCES}) -if(OCIO_USE_SIMD) - target_compile_definitions(test_gpu_exec - PRIVATE - USE_SSE - ) -endif(OCIO_USE_SIMD) - set_target_properties(test_gpu_exec PROPERTIES COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}" LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}" diff --git a/tests/gpu/GPUUnitTest.h b/tests/gpu/GPUUnitTest.h index 00bd4f89e5..cf66798032 100644 --- a/tests/gpu/GPUUnitTest.h +++ b/tests/gpu/GPUUnitTest.h @@ -9,6 +9,8 @@ #include #include +#include "CPUInfoConfig.h" + class OCIOGPUTest; using OCIOTestFuncCallback = 
std::function; diff --git a/tests/gpu/GammaOp_test.cpp b/tests/gpu/GammaOp_test.cpp index f553d1b9f5..9db1075a9b 100644 --- a/tests/gpu/GammaOp_test.cpp +++ b/tests/gpu/GammaOp_test.cpp @@ -85,7 +85,7 @@ OCIO_ADD_GPU_TEST(ExponentOp, forward) const double exp[4] = { 2.6, 1.0, 1.8, 1.1 }; AddExponent(test, OCIO::TRANSFORM_DIR_FORWARD, exp, OCIO::NEGATIVE_CLAMP, -#ifdef USE_SSE +#if OCIO_USE_SSE2 5e-4f #else 1e-5f @@ -98,7 +98,7 @@ OCIO_ADD_GPU_TEST(ExponentOp, forward_mirror) const double exp[4] = { 2.6, 1.0, 1.8, 1.1 }; AddExponent(test, OCIO::TRANSFORM_DIR_FORWARD, exp, OCIO::NEGATIVE_MIRROR, -#ifdef USE_SSE +#if OCIO_USE_SSE2 5e-4f // TODO: Only related to the ssePower optimization ? #else 1e-5f @@ -111,7 +111,7 @@ OCIO_ADD_GPU_TEST(ExponentOp, forward_pass_thru) const double exp[4] = { 2.6, 1.0, 1.8, 1.1 }; AddExponent(test, OCIO::TRANSFORM_DIR_FORWARD, exp, OCIO::NEGATIVE_PASS_THRU, -#ifdef USE_SSE +#if OCIO_USE_SSE2 5e-4f // TODO: Only related to the ssePower optimization ? #else 1e-5f @@ -144,7 +144,7 @@ OCIO_ADD_GPU_TEST(ExponentOp, inverse) const double exp[4] = { 2.6, 1.0, 1.8, 1.1 }; AddExponent(test, OCIO::TRANSFORM_DIR_INVERSE, exp, OCIO::NEGATIVE_CLAMP, -#ifdef USE_SSE +#if OCIO_USE_SSE2 5e-4f // TODO: Only related to the ssePower optimization ? #else g_epsilon @@ -158,7 +158,7 @@ OCIO_ADD_GPU_TEST(ExponentOp, inverse_mirror) const double exp[4] = { 2.6, 1.0, 1.8, 1.1 }; AddExponent(test, OCIO::TRANSFORM_DIR_INVERSE, exp, OCIO::NEGATIVE_MIRROR, -#ifdef USE_SSE +#if OCIO_USE_SSE2 5e-4f // TODO: Only related to the ssePower optimization ? #else g_epsilon @@ -172,7 +172,7 @@ OCIO_ADD_GPU_TEST(ExponentOp, inverse_pass_thru) const double exp[4] = { 2.6, 1.0, 1.8, 1.1 }; AddExponent(test, OCIO::TRANSFORM_DIR_INVERSE, exp, OCIO::NEGATIVE_PASS_THRU, -#ifdef USE_SSE +#if OCIO_USE_SSE2 5e-4f // TODO: Only related to the ssePower optimization ? 
#else g_epsilon @@ -188,7 +188,7 @@ OCIO_ADD_GPU_TEST(ExponentWithLinearOp, forward) { AddExponentWithLinear(test, OCIO::TRANSFORM_DIR_FORWARD, gammaVals, offsetVals, OCIO::NEGATIVE_LINEAR, -#ifdef USE_SSE +#if OCIO_USE_SSE2 1e-4f // Note: Related to the ssePower optimization ! #else 5e-6f @@ -201,7 +201,7 @@ OCIO_ADD_GPU_TEST(ExponentWithLinearOp, mirror_forward) { AddExponentWithLinear(test, OCIO::TRANSFORM_DIR_FORWARD, gammaVals, offsetVals, OCIO::NEGATIVE_MIRROR, -#ifdef USE_SSE +#if OCIO_USE_SSE2 1e-4f // Note: Related to the ssePower optimization ! #else 5e-6f @@ -214,7 +214,7 @@ OCIO_ADD_GPU_TEST(ExponentWithLinearOp, inverse) { AddExponentWithLinear(test, OCIO::TRANSFORM_DIR_INVERSE, gammaVals, offsetVals, OCIO::NEGATIVE_LINEAR, -#ifdef USE_SSE +#if OCIO_USE_SSE2 5e-5f // Note: Related to the ssePower optimization ! #else 5e-7f @@ -227,7 +227,7 @@ OCIO_ADD_GPU_TEST(ExponentWithLinearOp, mirror_inverse) { AddExponentWithLinear(test, OCIO::TRANSFORM_DIR_INVERSE, gammaVals, offsetVals, OCIO::NEGATIVE_MIRROR, -#ifdef USE_SSE +#if OCIO_USE_SSE2 5e-5f // Note: Related to the ssePower optimization ! 
#else 5e-7f diff --git a/tests/gpu/LogOp_test.cpp b/tests/gpu/LogOp_test.cpp index 3fedfd9f64..b2090437b0 100644 --- a/tests/gpu/LogOp_test.cpp +++ b/tests/gpu/LogOp_test.cpp @@ -10,7 +10,7 @@ namespace OCIO = OCIO_NAMESPACE; -#ifdef USE_SSE +#if OCIO_USE_SSE2 const float g_epsilon = 1e-4f; const float g_epsilon_inverse = 1e-3f; #else diff --git a/tests/osl/CMakeLists.txt b/tests/osl/CMakeLists.txt index 17addff1d4..f380bccaab 100644 --- a/tests/osl/CMakeLists.txt +++ b/tests/osl/CMakeLists.txt @@ -18,13 +18,6 @@ set(SOURCES add_executable(test_osl_exec ${SOURCES}) -if(OCIO_USE_SIMD) - target_compile_definitions(test_osl_exec - PRIVATE - USE_SSE - ) -endif(OCIO_USE_SIMD) - set_target_properties(test_osl_exec PROPERTIES COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS}" LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}" From a134556a27c7705f053ce15d48f4f7629e55ed51 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Wed, 23 Aug 2023 13:30:02 -0400 Subject: [PATCH 04/22] Using try_compile instead of check_cxx_source_compiles as it was giving false positives. Stubbing cpuinfo for Apple ARM platform. Handling Apple M1 correctly and adding support for SSE2NEON. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- CMakeLists.txt | 74 +++++++++++++++---- share/cmake/utils/CheckSupportAVX.cmake | 39 +++++----- share/cmake/utils/CheckSupportAVX2.cmake | 35 ++++----- share/cmake/utils/CheckSupportAVX512.cmake | 35 ++++----- share/cmake/utils/CheckSupportF16C.cmake | 15 +++- share/cmake/utils/CheckSupportSSE2.cmake | 19 ++++- share/cmake/utils/CheckSupportSSE3.cmake | 19 ++++- share/cmake/utils/CheckSupportSSE4.cmake | 19 ++++- share/cmake/utils/CheckSupportSSE42.cmake | 18 ++++- share/cmake/utils/CheckSupportSSSE3.cmake | 19 ++++- share/cmake/utils/CompilerFlags.cmake | 20 +++-- src/OpenColorIO/CMakeLists.txt | 4 +- src/OpenColorIO/CPUInfo.cpp | 42 ++++++++++- src/OpenColorIO/CPUInfo.h | 43 ++++++++++- src/OpenColorIO/CPUInfoConfig.h.in | 1 + src/OpenColorIO/SSE2.h | 29 +++++++- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp | 1 - src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp | 2 - tests/cpu/CMakeLists.txt | 16 ++-- tests/cpu/SSE2_tests.cpp | 1 - tests/cpu/UnitTestMain.cpp | 6 +- 21 files changed, 337 insertions(+), 120 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6be5c81475..334886614b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,27 +184,71 @@ endif() option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ON) option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." 
OFF) +if (APPLE) + execute_process(COMMAND "system_profiler" "SPHardwareDataType" OUTPUT_VARIABLE SYSTEM_INFO) + if(SYSTEM_INFO MATCHES "^.*Chip: Apple.*") + set(OCIO_APPLE_CHIP_PLATFORM "M") + elseif(SYSTEM_INFO MATCHES "^.* Processor Name: Intel*") + set(OCIO_APPLE_CHIP_PLATFORM "INTEL") + endif() + + if (OCIO_APPLE_CHIP_PLATFORM MATCHES "M") + option(OCIO_USE_SSE2NEON "Specify whether to enable SSE CPU performance optimizations on Apple ARM using SSE2NEON" ON) + endif() +endif() + if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|X86|x86_64|i386|i686)") - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS ON) + # Note that CMAKE_SYSTEM_PROCESSOR will return x86_64 on an Apple M1 MacBook using a Rosetta + # terminal. We do not support the compilation of OCIO using a Rosetta terminal on an Apple M1 + # MacBook. The user should compile OCIO using a native terminal (ARM) and use CMAKE_OSX_ARCHITECTURES="x86_64" + # to compile for intel-based Macbook or do a universal build with CMAKE_OSX_ARCHITECTURES="arm64;x86_64". + # Also note that Rosetta does not support any AVXs or F16C instructions. + + # Assumes an Intel-based computer or an Intel-based Apple MacBook. set(OCIO_ARCH_X86 1) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX ON) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C ON) +elseif(APPLE AND OCIO_APPLE_CHIP_PLATFORM MATCHES "M" + AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64" + AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64")) + # Assumes an Apple ARM MacBook. + set(OCIO_APPLE_M1_ARCH_X86 1) + + # OCIO translate SSE using SSE2NEON library. + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) + # OCIO does not have any process to translate from AVXs to ARM Neon. + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) + # Not supported. 
+ set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) +elseif(APPLE AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64" + AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64") + # Assumes an Apple ARM MacBook. + + # OCIO translate SSE using SSE2NEON library. + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) + # OCIO does not have any process to translate from AVXs to ARM Neon. + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) + # Not supported. + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) else() - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS OFF) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE OFF) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) set(OCIO_ARCH_X86 0) endif() -option(OCIO_USE_SSE2 "Specify whether to enable SSE2 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) -option(OCIO_USE_SSE3 "Specify whether to enable SSE3 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) -option(OCIO_USE_SSSE3 "Specify whether to enable SSSE3 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) -option(OCIO_USE_SSE4 "Specify whether to enable SSE4 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) -option(OCIO_USE_SSE42 "Specify whether to enable SSE4.2 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) -option(OCIO_USE_AVX "Specify whether to enable AVX CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) -option(OCIO_USE_AVX2 "Specify whether to enable AVX2 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) -option(OCIO_USE_AVX512 "Specify whether to enable AVX512 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) -option(OCIO_USE_F16C "Specify whether to enable F16C CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS}) - -if (APPLE) - option(OCIO_USE_SSE2NEON "Specify whether to enable SSE CPU performance optimizations on Apple ARM using SSE2NEON" ON) -endif() +option(OCIO_USE_SSE2 "Specify whether to enable SSE2 CPU 
performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE}) +option(OCIO_USE_SSE3 "Specify whether to enable SSE3 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE}) +option(OCIO_USE_SSSE3 "Specify whether to enable SSSE3 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE}) +option(OCIO_USE_SSE4 "Specify whether to enable SSE4 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE}) +option(OCIO_USE_SSE42 "Specify whether to enable SSE4.2 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE}) +option(OCIO_USE_AVX "Specify whether to enable AVX CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX}) +option(OCIO_USE_AVX2 "Specify whether to enable AVX2 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX}) +option(OCIO_USE_AVX512 "Specify whether to enable AVX512 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX}) +option(OCIO_USE_F16C "Specify whether to enable F16C CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C}) ############################################################################### # GPU configuration diff --git a/share/cmake/utils/CheckSupportAVX.cmake b/share/cmake/utils/CheckSupportAVX.cmake index 60605f9066..89b7688997 100644 --- a/share/cmake/utils/CheckSupportAVX.cmake +++ b/share/cmake/utils/CheckSupportAVX.cmake @@ -1,32 +1,20 @@ # SPDX-License-Identifier: BSD-3-Clause # Copyright Contributors to the OpenColorIO Project. 
-include(CheckCXXSourceCompiles) - set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") - set(__universal_build 1) - set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") -endif() - if(MSVC) set(CMAKE_REQUIRED_FLAGS "/w /arch:AVX") elseif(USE_GCC OR USE_CLANG) set(CMAKE_REQUIRED_FLAGS "-w -mavx") endif() -if (APPLE AND __universal_build) - # Force the test to build under x86_64 - set(CMAKE_OSX_ARCHITECTURES "x86_64") - # Apple has an automatic translation layer from SSE/AVX to ARM Neon. -endif() - set(AVX_CODE " + #include #include + int main() - { + { // Create two arrays of floats float a[8] = {1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}; float b[8] = {2.0, 4.0, 6.0, 8.0, 10.0, 12.0, 14.0, 16.0}; @@ -35,13 +23,20 @@ set(AVX_CODE " return 0; } ") -check_cxx_source_compiles("${AVX_CODE}" COMPILER_SUPPORTS_AVX) + +file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/avx_test.cpp" "${AVX_CODE}") + +message(STATUS "Performing Test COMPILER_SUPPORTS_AVX") +try_compile(COMPILER_SUPPORTS_AVX + "${CMAKE_BINARY_DIR}/CMakeTmp" + "${CMAKE_BINARY_DIR}/CMakeTmp/avx_test.cpp" +) + +if(COMPILER_SUPPORTS_AVX) + message(STATUS "Performing Test COMPILER_SUPPORTS_AVX - Success") +else() + message(STATUS "Performing Test COMPILER_SUPPORTS_AVX - Failed") +endif() set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") unset(_cmake_required_flags_orig) - -if(__universal_build) - set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") - unset(_cmake_osx_architectures_orig) - unset(__universal_build) -endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportAVX2.cmake b/share/cmake/utils/CheckSupportAVX2.cmake index 95ac6b4361..611394f314 100644 --- a/share/cmake/utils/CheckSupportAVX2.cmake +++ b/share/cmake/utils/CheckSupportAVX2.cmake @@ -5,24 +5,12 @@ include(CheckCXXSourceCompiles) set(_cmake_required_flags_orig 
"${CMAKE_REQUIRED_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") - set(__universal_build 1) - set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") -endif() - if(MSVC) set(CMAKE_REQUIRED_FLAGS "/w /arch:AVX2") elseif(USE_GCC OR USE_CLANG) set(CMAKE_REQUIRED_FLAGS "-w -mavx2 -mfma -mf16c") endif() -if (APPLE AND __universal_build) - # Force the test to build under x86_64 - set(CMAKE_OSX_ARCHITECTURES "x86_64") - # Apple has an automatic translation layer from SSE/AVX to ARM Neon. -endif() - set(AVX2_CODE " #include @@ -43,13 +31,20 @@ set(AVX2_CODE " return 0; } ") -check_cxx_source_compiles("${AVX2_CODE}" COMPILER_SUPPORTS_AVX2) -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -unset(_cmake_required_flags_orig) +file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/avx2_test.cpp" "${AVX2_CODE}") -if(__universal_build) - set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") - unset(_cmake_osx_architectures_orig) - unset(__universal_build) -endif() \ No newline at end of file +message(STATUS "Performing Test COMPILER_SUPPORTS_AVX2") +try_compile(COMPILER_SUPPORTS_AVX2 + "${CMAKE_BINARY_DIR}/CMakeTmp" + "${CMAKE_BINARY_DIR}/CMakeTmp/avx2_test.cpp" +) + +if(COMPILER_SUPPORTS_AVX2) + message(STATUS "Performing Test COMPILER_SUPPORTS_AVX2 - Success") +else() + message(STATUS "Performing Test COMPILER_SUPPORTS_AVX2 - Failed") +endif() + +set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") +unset(_cmake_required_flags_orig) \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportAVX512.cmake b/share/cmake/utils/CheckSupportAVX512.cmake index ea9007904a..c3989e0fab 100644 --- a/share/cmake/utils/CheckSupportAVX512.cmake +++ b/share/cmake/utils/CheckSupportAVX512.cmake @@ -5,24 +5,12 @@ include(CheckCXXSourceCompiles) set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR 
"${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") - set(__universal_build 1) - set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") -endif() - if(MSVC) set(CMAKE_REQUIRED_FLAGS "/w /arch:AVX512") elseif(USE_GCC OR USE_CLANG) set(CMAKE_REQUIRED_FLAGS "-w -mavx512f") endif() -if (APPLE AND __universal_build) - # Force the test to build under x86_64 - set(CMAKE_OSX_ARCHITECTURES "x86_64") - # Apple has an automatic translation layer from SSE/AVX to ARM Neon. -endif() - set(AVX512_CODE " #include @@ -31,13 +19,20 @@ set(AVX512_CODE " return 0; } ") -check_cxx_source_compiles("${AVX512_CODE}" COMPILER_SUPPORTS_AVX512) -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -unset(_cmake_required_flags_orig) +file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/avx512_test.cpp" "${AVX512_CODE}") -if(__universal_build) - set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") - unset(_cmake_osx_architectures_orig) - unset(__universal_build) -endif() \ No newline at end of file +message(STATUS "Performing Test COMPILER_SUPPORTS_AVX512") +try_compile(COMPILER_SUPPORTS_AVX512 + "${CMAKE_BINARY_DIR}/CMakeTmp" + "${CMAKE_BINARY_DIR}/CMakeTmp/avx512_test.cpp" +) + +if(COMPILER_SUPPORTS_AVX512) + message(STATUS "Performing Test COMPILER_SUPPORTS_AVX512 - Success") +else() + message(STATUS "Performing Test COMPILER_SUPPORTS_AVX512 - Failed") +endif() + +set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") +unset(_cmake_required_flags_orig) \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportF16C.cmake b/share/cmake/utils/CheckSupportF16C.cmake index 524b69a44d..90825ee4cd 100644 --- a/share/cmake/utils/CheckSupportF16C.cmake +++ b/share/cmake/utils/CheckSupportF16C.cmake @@ -19,7 +19,20 @@ set(F16C_CODE " return 0; } ") -check_cxx_source_compiles("${F16C_CODE}" COMPILER_SUPPORTS_F16C) + +file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/sse42_test.cpp" "${F16C_CODE}") + +message(STATUS "Performing Test COMPILER_SUPPORTS_F16C") 
+try_compile(COMPILER_SUPPORTS_F16C + "${CMAKE_BINARY_DIR}/CMakeTmp" + "${CMAKE_BINARY_DIR}/CMakeTmp/sse42_test.cpp" +) + +if(COMPILER_SUPPORTS_F16C) + message(STATUS "Performing Test COMPILER_SUPPORTS_F16C - Success") +else() + message(STATUS "Performing Test COMPILER_SUPPORTS_F16C - Failed") +endif() set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") unset(_cmake_required_flags_orig) \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake index e8c6e181ff..099c705cb4 100644 --- a/share/cmake/utils/CheckSupportSSE2.cmake +++ b/share/cmake/utils/CheckSupportSSE2.cmake @@ -28,7 +28,7 @@ endif() if (APPLE AND __universal_build) # Force the test to build under x86_64 set(CMAKE_OSX_ARCHITECTURES "x86_64") - # Apple has an automatic translation layer from SSE/AVX to ARM Neon. + # Apple has an automatic translation layer from SSE to ARM Neon. endif() set(SSE2_CODE " @@ -44,13 +44,26 @@ set(SSE2_CODE " return (0); } ") -check_cxx_source_compiles("${SSE2_CODE}" COMPILER_SUPPORTS_SSE2) + +file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/sse2_test.cpp" "${SSE2_CODE}") + +message(STATUS "Performing Test COMPILER_SUPPORTS_SSE2") +try_compile(COMPILER_SUPPORTS_SSE2 + "${CMAKE_BINARY_DIR}/CMakeTmp" + "${CMAKE_BINARY_DIR}/CMakeTmp/sse2_test.cpp" +) + +if(COMPILER_SUPPORTS_SSE2) + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE2 - Success") +else() + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE2 - Failed") +endif() set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") unset(_cmake_required_flags_orig) if(__universal_build) - set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") unset(_cmake_osx_architectures_orig) unset(__universal_build) endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportSSE3.cmake b/share/cmake/utils/CheckSupportSSE3.cmake index 2ccb156553..a1c76b1546 100644 --- 
a/share/cmake/utils/CheckSupportSSE3.cmake +++ b/share/cmake/utils/CheckSupportSSE3.cmake @@ -19,7 +19,7 @@ endif() if (APPLE AND __universal_build) # Force the test to build under x86_64 set(CMAKE_OSX_ARCHITECTURES "x86_64") - # Apple has an automatic translation layer from SSE/AVX to ARM Neon. + # Apple has an automatic translation layer from SSE to ARM Neon. endif() set(SSE3_CODE " @@ -31,13 +31,26 @@ set(SSE3_CODE " return 0; } ") -check_cxx_source_compiles("${SSE3_CODE}" COMPILER_SUPPORTS_SSE3) + +file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/sse3_test.cpp" "${SSE3_CODE}") + +message(STATUS "Performing Test COMPILER_SUPPORTS_SSE3") +try_compile(COMPILER_SUPPORTS_SSE3 + "${CMAKE_BINARY_DIR}/CMakeTmp" + "${CMAKE_BINARY_DIR}/CMakeTmp/sse3_test.cpp" +) + +if(COMPILER_SUPPORTS_SSE3) + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE3 - Success") +else() + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE3 - Failed") +endif() set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") unset(_cmake_required_flags_orig) if(__universal_build) - set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") unset(_cmake_osx_architectures_orig) unset(__universal_build) endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportSSE4.cmake b/share/cmake/utils/CheckSupportSSE4.cmake index c45c9eab13..3bfc7095a7 100644 --- a/share/cmake/utils/CheckSupportSSE4.cmake +++ b/share/cmake/utils/CheckSupportSSE4.cmake @@ -19,7 +19,7 @@ endif() if (APPLE AND __universal_build) # Force the test to build under x86_64 set(CMAKE_OSX_ARCHITECTURES "x86_64") - # Apple has an automatic translation layer from SSE/AVX to ARM Neon. + # Apple has an automatic translation layer from SSE to ARM Neon. 
endif() set(SSE4_CODE " @@ -31,13 +31,26 @@ set(SSE4_CODE " return 0; } ") -check_cxx_source_compiles("${SSE4_CODE}" COMPILER_SUPPORTS_SSE4) + +file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/sse4_test.cpp" "${SSE4_CODE}") + +message(STATUS "Performing Test COMPILER_SUPPORTS_SSE4") +try_compile(COMPILER_SUPPORTS_SSE4 + "${CMAKE_BINARY_DIR}/CMakeTmp" + "${CMAKE_BINARY_DIR}/CMakeTmp/sse4_test.cpp" +) + +if(COMPILER_SUPPORTS_SSE4) + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE4 - Success") +else() + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE4 - Failed") +endif() set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") unset(_cmake_required_flags_orig) if(__universal_build) - set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") unset(_cmake_osx_architectures_orig) unset(__universal_build) endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportSSE42.cmake b/share/cmake/utils/CheckSupportSSE42.cmake index 6f7486cc1a..521885c89f 100644 --- a/share/cmake/utils/CheckSupportSSE42.cmake +++ b/share/cmake/utils/CheckSupportSSE42.cmake @@ -19,7 +19,7 @@ endif() if (APPLE AND __universal_build) # Force the test to build under x86_64 set(CMAKE_OSX_ARCHITECTURES "x86_64") - # Apple has an automatic translation layer from SSE/AVX to ARM Neon. + # Apple has an automatic translation layer from SSE to ARM Neon. 
endif() set(SSE42_CODE " @@ -31,14 +31,26 @@ set(SSE42_CODE " return 0; } ") -check_cxx_source_compiles("${SSE42_CODE}" COMPILER_SUPPORTS_SSE42) +file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/sse42_test.cpp" "${SSE42_CODE}") + +message(STATUS "Performing Test COMPILER_SUPPORTS_SSE42") +try_compile(COMPILER_SUPPORTS_SSE42 + "${CMAKE_BINARY_DIR}/CMakeTmp" + "${CMAKE_BINARY_DIR}/CMakeTmp/sse42_test.cpp" +) + +if(COMPILER_SUPPORTS_SSE42) + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE42 - Success") +else() + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE42 - Failed") +endif() set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") unset(_cmake_required_flags_orig) if(__universal_build) - set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") unset(_cmake_osx_architectures_orig) unset(__universal_build) endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportSSSE3.cmake b/share/cmake/utils/CheckSupportSSSE3.cmake index 5171e6f594..3749e3e90d 100644 --- a/share/cmake/utils/CheckSupportSSSE3.cmake +++ b/share/cmake/utils/CheckSupportSSSE3.cmake @@ -19,7 +19,7 @@ endif() if (APPLE AND __universal_build) # Force the test to build under x86_64 set(CMAKE_OSX_ARCHITECTURES "x86_64") - # Apple has an automatic translation layer from SSE/AVX to ARM Neon. + # Apple has an automatic translation layer from SSE to ARM Neon. 
endif() set(SSSE3_CODE " @@ -31,13 +31,26 @@ set(SSSE3_CODE " return 0; } ") -check_cxx_source_compiles("${SSSE3_CODE}" COMPILER_SUPPORTS_SSSE3) + +file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/ssse3_test.cpp" "${SSSE3_CODE}") + +message(STATUS "Performing Test COMPILER_SUPPORTS_SSSE3") +try_compile(COMPILER_SUPPORTS_SSSE3 + "${CMAKE_BINARY_DIR}/CMakeTmp" + "${CMAKE_BINARY_DIR}/CMakeTmp/ssse3_test.cpp" +) + +if(COMPILER_SUPPORTS_SSSE3) + message(STATUS "Performing Test COMPILER_SUPPORTS_SSSE3 - Success") +else() + message(STATUS "Performing Test COMPILER_SUPPORTS_SSSE3 - Failed") +endif() set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") unset(_cmake_required_flags_orig) if(__universal_build) - set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") unset(_cmake_osx_architectures_orig) unset(__universal_build) endif() \ No newline at end of file diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake index 364c6f2109..4bb775e30e 100644 --- a/share/cmake/utils/CompilerFlags.cmake +++ b/share/cmake/utils/CompilerFlags.cmake @@ -12,15 +12,16 @@ set(PLATFORM_LINK_OPTIONS "") # Verify SIMD compatibility if(OCIO_USE_SIMD) + if (OCIO_ARCH_X86 OR OCIO_APPLE_M1_ARCH_X86) + include(CheckSupportX86SIMD) + endif() + if (OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_ARM_NEON) include(CheckSupportSSEUsingSSE2NEON) - if(NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) set(OCIO_USE_SSE2NEON OFF) endif() endif() - - include(CheckSupportX86SIMD) else() set(OCIO_USE_SSE2 OFF) set(OCIO_USE_SSE3 OFF) @@ -35,11 +36,14 @@ else() set(OCIO_USE_SSE2NEON OFF) endif() -#TODOCED Does not make sense anymore as we have AVX and AVX2 support now. 
-# if(NOT COMPILER_SUPPORTS_SSE2 AND NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) -# message(STATUS "Disabling SSE optimizations, as the target doesn't support them") -# set(OCIO_USE_SIMD OFF) -# endif() +#if(NOT COMPILER_SUPPORTS_SSE2 AND NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) +if (NOT COMPILER_SUPPORTS_SSE2 AND NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON AND + NOT COMPILER_SUPPORTS_SSE3 AND NOT COMPILER_SUPPORTS_SSSE3 AND + NOT COMPILER_SUPPORTS_SSE4 AND NOT COMPILER_SUPPORTS_SSE42 AND + NOT COMPILER_SUPPORTS_AVX AND NOT COMPILER_SUPPORTS_AVX2 AND NOT COMPILER_SUPPORTS_AVX512) + message(STATUS "Disabling SIMD optimizations, as the target doesn't support them") + set(OCIO_USE_SIMD OFF) +endif() ############################################################################### # Compile flags diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt index c49f61a9b4..5e8291a8db 100755 --- a/src/OpenColorIO/CMakeLists.txt +++ b/src/OpenColorIO/CMakeLists.txt @@ -205,7 +205,9 @@ endif() configure_file(res/OpenColorIO.pc.in ${CMAKE_CURRENT_BINARY_DIR}/OpenColorIO.pc @ONLY) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/OpenColorIO.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) -if(OCIO_ARCH_X86) + +if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_APPLE_M1_ARCH_X86 OR OCIO_USE_SSE2NEON)) + # Note that these files are gated by preprocessors to remove them based on the OCIO_USE_* vars. 
set_property(SOURCE ops/lut1d/Lut1DOpCPU_SSE2.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_SSE2_ARGS}) set_property(SOURCE ops/lut1d/Lut1DOpCPU_AVX.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS}) set_property(SOURCE ops/lut1d/Lut1DOpCPU_AVX2.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS}) diff --git a/src/OpenColorIO/CPUInfo.cpp b/src/OpenColorIO/CPUInfo.cpp index 4333fc772e..4b3f05d861 100644 --- a/src/OpenColorIO/CPUInfo.cpp +++ b/src/OpenColorIO/CPUInfo.cpp @@ -17,7 +17,7 @@ typedef __int64 int64_t; namespace OCIO_NAMESPACE { -#ifdef OCIO_ARCH_X86 +#if defined(OCIO_ARCH_X86) namespace { @@ -182,6 +182,46 @@ CPUInfo& CPUInfo::instance() return singleton; } +#elif defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON +CPUInfo::CPUInfo() +{ + flags = 0; + memset(name, 0, sizeof(name)); + + // Hardcode name to Apple ARM. + snprintf(name, sizeof(name), "%s", "Apple ARM"); + + // Note that Rosetta does not support any AVX instructions. + // See https://developer.apple.com/documentation/apple-silicon/about-the-rosetta-translation-environment#What-Cant-Be-Translated + + // Also note that during testing on a Apple M1 MacBook, SSE 4.2 does not seems to be supported. + +#if !defined(__aarch64__) + // Enable SSE2 instructions support using Rosetta for the x86_64 architecture on Apple ARM cpu. + if (OCIO_USE_SSE2) + { + flags |= X86_CPU_FLAG_SSE2; + } + //TODO: Once the other SSE instructions are implemented into OCIO, these can be enabled here. +#elif defined(__aarch64__) + // ARM architecture A64 (ARM64) + // SSE2NEON library supports SSE, SSE2, SSE3, SSSE3, SSE4.1 and SSE4.2. + // It does not support any AVX instructions. + if (OCIO_USE_SSE2) + { + flags |= X86_CPU_FLAG_SSE2; + } + //TODO: Once the other SSE instructions are implemented into OCIO, these can be enabled here. 
+#endif + + +} + +CPUInfo& CPUInfo::instance() +{ + static CPUInfo singleton = CPUInfo(); + return singleton; +} #endif // ARCH_X86 } // namespace OCIO_NAMESPACE \ No newline at end of file diff --git a/src/OpenColorIO/CPUInfo.h b/src/OpenColorIO/CPUInfo.h index 79d5d1d4b6..f21956d191 100644 --- a/src/OpenColorIO/CPUInfo.h +++ b/src/OpenColorIO/CPUInfo.h @@ -11,8 +11,6 @@ namespace OCIO_NAMESPACE { -#ifdef OCIO_ARCH_X86 - #define X86_CPU_FLAG_SSE2 (1 << 0) // SSE2 functions #define X86_CPU_FLAG_SSE2_SLOW (1 << 1) // SSE2 supported, but usually not faster than regular MMX/SSE (e.g. Core1) @@ -38,6 +36,8 @@ namespace OCIO_NAMESPACE #define x86_check_flags(cpuext) \ (OCIO_USE_ ## cpuext && ((flags) & X86_CPU_FLAG_ ## cpuext)) +#if defined(OCIO_ARCH_X86) + struct CPUInfo { unsigned int flags; @@ -79,6 +79,45 @@ struct CPUInfo #undef x86_check_flags +#elif defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON + +#define check_flags(cpuext) \ + (OCIO_USE_ ## cpuext && ((flags) & X86_CPU_FLAG_ ## cpuext)) + +struct CPUInfo +{ + unsigned int flags; + char name[65]; + + CPUInfo(); + + static CPUInfo& instance(); + + bool hasSSE2() const { return x86_check_flags(SSE2); } + bool SSE2Slow() const { return false; } + + bool hasSSE3() const { return x86_check_flags(SSE3); } + bool SSE3Slow() const { return false; } + + bool hasSSSE3() const { return x86_check_flags(SSSE3); } + bool SSSE3Slow() const { return false; } + + bool hasSSE4() const { return x86_check_flags(SSE4); } + bool hasSSE42() const { return false; } + + // Apple M1 does not support AVX SIMD instructions through Rosetta. + // SSE2NEON library does not supports AVX SIMD instructions. 
+ bool hasAVX() const { return false; } + bool AVXSlow() const { return false; } + bool hasAVX2() const { return false; } + bool AVX2SlowGather() const { return false; } + bool hasAVX512() const { return false; } + bool hasF16C() const { return false; } + +}; + +#undef x86_check_flags + #endif // OCIO_ARCH_X86 } // namespace OCIO_NAMESPACE diff --git a/src/OpenColorIO/CPUInfoConfig.h.in b/src/OpenColorIO/CPUInfoConfig.h.in index 68fdb55831..f6330a5547 100644 --- a/src/OpenColorIO/CPUInfoConfig.h.in +++ b/src/OpenColorIO/CPUInfoConfig.h.in @@ -3,6 +3,7 @@ #cmakedefine OCIO_ARCH_X86 +#cmakedefine OCIO_APPLE_M1_ARCH_X86 #cmakedefine01 OCIO_USE_SSE2 #cmakedefine01 OCIO_USE_SSE3 diff --git a/src/OpenColorIO/SSE2.h b/src/OpenColorIO/SSE2.h index 15d76d8908..441703fd3f 100644 --- a/src/OpenColorIO/SSE2.h +++ b/src/OpenColorIO/SSE2.h @@ -6,7 +6,7 @@ #define INCLUDED_OCIO_SSE2_H #include "CPUInfo.h" -#if OCIO_USE_SSE2 +#if OCIO_USE_SSE2 || OCIO_USE_SSE2NEON // Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). #if !defined(__aarch64__) @@ -29,6 +29,33 @@ namespace OCIO_NAMESPACE { +// Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since +// it is redefining two of the functions from sse2neon. + +#if defined(__aarch64__) + #if defined(OCIO_USE_SSE2NEON) + // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to + // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior. + + // With the Intel intrinsics, if one value is a NaN, the second argument is output, as if it were + // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the + // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the + // filtering behavior. 
The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as + // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument + // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in + // the first argument continues to be filtered out. + static inline __m128 _mm_max_ps(__m128 a, __m128 b) + { + return vreinterpretq_m128_f32( + vmaxnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + } + static inline __m128 _mm_min_ps(__m128 a, __m128 b) + { + return vreinterpretq_m128_f32( + vminnmq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); + } + #endif +#endif inline __m128 sse2_clamp(__m128 value, const __m128& maxValue) { diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp index 81a7649db1..d542139ea2 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp @@ -2,7 +2,6 @@ #if OCIO_USE_SSE2 -#include #include #include "SSE2.h" diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp index 02e5fbf2c3..e98a0470d0 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_SSE2.cpp @@ -7,8 +7,6 @@ #include "SSE2.h" -#include - namespace OCIO_NAMESPACE { namespace { diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt index ebe3bf20c1..1c44bfc3d6 100755 --- a/tests/cpu/CMakeLists.txt +++ b/tests/cpu/CMakeLists.txt @@ -71,24 +71,25 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}" ) - if(OCIO_ARCH_X86) + if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_APPLE_M1_ARCH_X86 OR OCIO_USE_SSE2NEON)) add_test(NAME ${TEST_NAME} COMMAND ${TEST_BINARY}) add_test(NAME ${TEST_NAME}_no_accel COMMAND ${TEST_BINARY} --no_accel) - if(${OCIO_USE_SSE2}) + + if(OCIO_USE_SSE2) add_test(NAME ${TEST_NAME}_sse2 COMMAND ${TEST_BINARY} --sse2) - 
if(${OCIO_USE_F16C}) + if(OCIO_USE_F16C) add_test(NAME ${TEST_NAME}_sse2+f16c COMMAND ${TEST_BINARY} --sse2 --f16c) endif() endif() - if(${OCIO_USE_AVX}) + if(OCIO_USE_AVX) add_test(NAME ${TEST_NAME}_avx COMMAND ${TEST_BINARY} --avx) - if(${OCIO_USE_F16C}) + if(OCIO_USE_F16C) add_test(NAME ${TEST_NAME}_avx+f16c COMMAND ${TEST_BINARY} --avx --f16c) endif() endif() - if(${OCIO_USE_AVX2}) + if(OCIO_USE_AVX2) add_test(NAME ${TEST_NAME}_avx2 COMMAND ${TEST_BINARY} --avx2) endif() else() @@ -315,7 +316,8 @@ prepend(SOURCES "${PROJECT_SOURCE_DIR}/src/OpenColorIO/" ${SOURCES}) list(APPEND SOURCES ${TESTS}) -if(OCIO_ARCH_X86) +if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_APPLE_M1_ARCH_X86 OR OCIO_USE_SSE2NEON)) + # Note that these files are gated by preprocessors to remove them based on the OCIO_USE_* vars. set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_SSE2_ARGS}) set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS}) set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX2_ARGS}) diff --git a/tests/cpu/SSE2_tests.cpp b/tests/cpu/SSE2_tests.cpp index 3a0e2b7c36..2deb148cce 100644 --- a/tests/cpu/SSE2_tests.cpp +++ b/tests/cpu/SSE2_tests.cpp @@ -9,7 +9,6 @@ #include -#include #include "MathUtils.h" #include "BitDepthUtils.h" #include "SSE2.h" diff --git a/tests/cpu/UnitTestMain.cpp b/tests/cpu/UnitTestMain.cpp index cf5c1b1a9b..5a030b89e4 100644 --- a/tests/cpu/UnitTestMain.cpp +++ b/tests/cpu/UnitTestMain.cpp @@ -59,7 +59,7 @@ int main(int argc, const char ** argv) // Note that empty strings mean to run all the unit tests. 
std::string filter, utestGroupAllowed, utestNameAllowed; -#ifdef OCIO_ARCH_X86 +#if defined(OCIO_ARCH_X86) || defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON bool no_accel = false; bool sse2 = false; bool avx = false; @@ -70,7 +70,7 @@ int main(int argc, const char ** argv) ap.options("\nCommand line arguments:\n", "--help", &printHelp, "Print help message", "--stop_on_error", &stopOnFirstError, "Stop on the first error", -#ifdef OCIO_ARCH_X86 +#if defined(OCIO_ARCH_X86) || defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON "--no_accel", &no_accel, "Disable ALL Accelerated features", "--sse2", &sse2, "Enable SSE2 Accelerated features", "--avx", &avx, "Enable AVX Accelerated features", @@ -96,7 +96,7 @@ int main(int argc, const char ** argv) return 1; } -#ifdef OCIO_ARCH_X86 +#if defined(OCIO_ARCH_X86) || defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON OCIO::CPUInfo &cpu = OCIO::CPUInfo::instance(); if (no_accel || sse2 || avx || avx2 || f16c) { From 248dde761e74939da652465c63fb84a8e2ee35f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Thu, 24 Aug 2023 10:46:02 -0400 Subject: [PATCH 05/22] Comments clean up and refactor some comments and documentations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- CMakeLists.txt | 12 +++++++----- docs/quick_start/installation.rst | 17 ++++++++++++++++- .../utils/CheckSupportSSEUsingSSE2NEON.cmake | 4 ---- share/cmake/utils/CheckSupportX86SIMD.cmake | 9 ++------- share/cmake/utils/CompilerFlags.cmake | 4 ++-- tests/cpu/UnitTestMain.cpp | 9 ++++++--- 6 files changed, 33 insertions(+), 22 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 334886614b..c2ea3a25dc 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,7 +204,7 @@ if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|X86|x86_64|i386|i686) # to compile for intel-based Macbook or do a universal build with CMAKE_OSX_ARCHITECTURES="arm64;x86_64". 
# Also note that Rosetta does not support any AVXs or F16C instructions. - # Assumes an Intel-based computer or an Intel-based Apple MacBook. + # Assumes an Intel-based architecture. set(OCIO_ARCH_X86 1) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX ON) @@ -214,10 +214,11 @@ elseif(APPLE AND OCIO_APPLE_CHIP_PLATFORM MATCHES "M" AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64")) - # Assumes an Apple ARM MacBook. + + # The user wants to do a universal build (arm64 + x86_64) or a x86_64 only build. set(OCIO_APPLE_M1_ARCH_X86 1) - # OCIO translate SSE using SSE2NEON library. + # OCIO translates SSE using SSE2NEON library. set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) # OCIO does not have any process to translate from AVXs to ARM Neon. set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) @@ -225,9 +226,10 @@ elseif(APPLE AND OCIO_APPLE_CHIP_PLATFORM MATCHES "M" set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) elseif(APPLE AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64" AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64") - # Assumes an Apple ARM MacBook. + + # The user wants to do a arm64 only build. - # OCIO translate SSE using SSE2NEON library. + # OCIO translates SSE using SSE2NEON library. set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) # OCIO does not have any process to translate from AVXs to ARM Neon. set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst index fc58c5ec6f..f0b362bb1d 100644 --- a/docs/quick_start/installation.rst +++ b/docs/quick_start/installation.rst @@ -264,6 +264,7 @@ CMake Options +++++++++++++ There are many options available in `CMake. 
+ `_ Several of the most common ones are: @@ -277,8 +278,16 @@ Here are the most common OCIO-specific CMake options (the default values are sho - ``-DOCIO_USE_OIIO_FOR_APPS=OFF`` (Set ON to build tools with OpenImageIO rather than OpenEXR) - ``-DOCIO_BUILD_PYTHON=ON`` (Set to OFF to not build the Python binding) - ``-DOCIO_BUILD_OPENFX=OFF`` (Set to ON to build the OpenFX plug-ins) -- ``-DOCIO_USE_SSE=ON`` (Deprecated -- please use OCIO_USE_SIMD) - ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations, such as SSE and NEON) +- ``-DOCIO_USE_SSE2=ON`` (Set to OFF to turn off SSE2 CPU performance optimizations) +- ``-DOCIO_USE_SSE3=ON`` (Set to OFF to turn off SSE3 CPU performance optimizations) +- ``-DOCIO_USE_SSSE3=ON`` (Set to OFF to turn off SSSE3 CPU performance optimizations) +- ``-DOCIO_USE_SSE4=ON`` (Set to OFF to turn off SSE4 CPU performance optimizations) +- ``-DOCIO_USE_SSE42=ON`` (Set to OFF to turn off SSE4.2 CPU performance optimizations) +- ``-DOCIO_USE_AVX=ON`` (Set to OFF to turn off AVX CPU performance optimizations) +- ``-DOCIO_USE_AVX2=ON`` (Set to OFF to turn off AVX2 CPU performance optimizations) +- ``-DOCIO_USE_AVX512=ON`` (Set to OFF to turn off AVX512 CPU performance optimizations) +- ``-DOCIO_USE_F16C=ON`` (Set to OFF to turn off F16C CPU performance optimizations) - ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests) - ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests) - ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering) @@ -286,6 +295,12 @@ Here are the most common OCIO-specific CMake options (the default values are sho - ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation) - ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation) +Note that *OCIO_USE_AVX*, *OCIO_USE_AVX2*, *OCIO_USE_AVX512* and *OCIO_USE_F16C* default values are +set to OFF on Apple ARM chipset because of the following two reasons: + +- Rosetta does 
not support these instructions +- OCIO does not currently use a library to translate these instructions into ARM Neon. + On the MacOS under the ARM architecture, the default is to make a universal build (natively supporting both the Intel and ARM processors). The ``-DCMAKE_OSX_ARCHITECTURES`` option may be set to just arm64 or x86_64 to override the default value, which is ``arm64;x86_64``. diff --git a/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake b/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake index 2259ad2f0a..eb868dd8a3 100644 --- a/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake +++ b/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake @@ -7,10 +7,6 @@ set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}") set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") -# Compiling using CMAKE_OSX_ARCHITECTURES="arm64" will return SUCCESS. -# Compiling using CMAKE_OSX_ARCHITECTURES="x86_64" will return FAILED. -# Compiling using CMAKE_OSX_ARCHITECTURES="arm64;x86_64" will return FAILED. - if(APPLE AND COMPILER_SUPPORTS_ARM_NEON) set(CMAKE_REQUIRED_INCLUDES ${sse2neon_INCLUDE_DIR}) diff --git a/share/cmake/utils/CheckSupportX86SIMD.cmake b/share/cmake/utils/CheckSupportX86SIMD.cmake index 7de95aab1d..f3d91cd5b1 100644 --- a/share/cmake/utils/CheckSupportX86SIMD.cmake +++ b/share/cmake/utils/CheckSupportX86SIMD.cmake @@ -5,13 +5,8 @@ ############################################################################### # Check if compiler supports X86 SIMD extensions -# Please note that some compilers could ignore unknown compilers flags and -# return SUCCESS even if the options are not supported. -# We could test the SSE with small snippet of code. - -# Using a small code snippet to test each sets. It is more robust that using only compilers -# flags because some compilers might ignore the flags and check_cxx_compiler_flag could -# return a false positive. 
+# These checks use try_compile instead of check_cxx_source_compiles because the latter was causing +# false positives on Apple ARM architectures. include(CheckSupportAVX) include(CheckSupportAVX2) diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake index 4bb775e30e..0f7f3f3e2a 100644 --- a/share/cmake/utils/CompilerFlags.cmake +++ b/share/cmake/utils/CompilerFlags.cmake @@ -36,11 +36,11 @@ else() set(OCIO_USE_SSE2NEON OFF) endif() -#if(NOT COMPILER_SUPPORTS_SSE2 AND NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) if (NOT COMPILER_SUPPORTS_SSE2 AND NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON AND NOT COMPILER_SUPPORTS_SSE3 AND NOT COMPILER_SUPPORTS_SSSE3 AND NOT COMPILER_SUPPORTS_SSE4 AND NOT COMPILER_SUPPORTS_SSE42 AND - NOT COMPILER_SUPPORTS_AVX AND NOT COMPILER_SUPPORTS_AVX2 AND NOT COMPILER_SUPPORTS_AVX512) + NOT COMPILER_SUPPORTS_AVX AND NOT COMPILER_SUPPORTS_AVX2 AND NOT COMPILER_SUPPORTS_AVX512 AND + NOT COMPILER_SUPPORTS_F16C) message(STATUS "Disabling SIMD optimizations, as the target doesn't support them") set(OCIO_USE_SIMD OFF) endif() diff --git a/tests/cpu/UnitTestMain.cpp b/tests/cpu/UnitTestMain.cpp index 5a030b89e4..bcdd22a381 100644 --- a/tests/cpu/UnitTestMain.cpp +++ b/tests/cpu/UnitTestMain.cpp @@ -39,6 +39,9 @@ OCIO_ADD_TEST(UnitTest, windows_debug) #endif +#if defined(OCIO_ARCH_X86) || defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON + #define ENABLE_SIMD_USAGE +#endif int main(int argc, const char ** argv) { @@ -59,7 +62,7 @@ int main(int argc, const char ** argv) // Note that empty strings mean to run all the unit tests. 
std::string filter, utestGroupAllowed, utestNameAllowed; -#if defined(OCIO_ARCH_X86) || defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON +#if defined(ENABLE_SIMD_USAGE) bool no_accel = false; bool sse2 = false; bool avx = false; @@ -70,7 +73,7 @@ int main(int argc, const char ** argv) ap.options("\nCommand line arguments:\n", "--help", &printHelp, "Print help message", "--stop_on_error", &stopOnFirstError, "Stop on the first error", -#if defined(OCIO_ARCH_X86) || defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON +#if defined(ENABLE_SIMD_USAGE) "--no_accel", &no_accel, "Disable ALL Accelerated features", "--sse2", &sse2, "Enable SSE2 Accelerated features", "--avx", &avx, "Enable AVX Accelerated features", @@ -96,7 +99,7 @@ int main(int argc, const char ** argv) return 1; } -#if defined(OCIO_ARCH_X86) || defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON +#if defined(ENABLE_SIMD_USAGE) OCIO::CPUInfo &cpu = OCIO::CPUInfo::instance(); if (no_accel || sse2 || avx || avx2 || f16c) { From bb789006d8fbc79dbdae2d0cd726915874f3b6b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Thu, 24 Aug 2023 16:23:36 -0400 Subject: [PATCH 06/22] Added something in the documentation for Rosetta and small change in cmake MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- CMakeLists.txt | 20 +++----------------- docs/quick_start/installation.rst | 4 +++- 2 files changed, 6 insertions(+), 18 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c2ea3a25dc..e9f2b99f1e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,19 +184,6 @@ endif() option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ON) option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." 
OFF) -if (APPLE) - execute_process(COMMAND "system_profiler" "SPHardwareDataType" OUTPUT_VARIABLE SYSTEM_INFO) - if(SYSTEM_INFO MATCHES "^.*Chip: Apple.*") - set(OCIO_APPLE_CHIP_PLATFORM "M") - elseif(SYSTEM_INFO MATCHES "^.* Processor Name: Intel*") - set(OCIO_APPLE_CHIP_PLATFORM "INTEL") - endif() - - if (OCIO_APPLE_CHIP_PLATFORM MATCHES "M") - option(OCIO_USE_SSE2NEON "Specify whether to enable SSE CPU performance optimizations on Apple ARM using SSE2NEON" ON) - endif() -endif() - if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|X86|x86_64|i386|i686)") # Note that CMAKE_SYSTEM_PROCESSOR will return x86_64 on an Apple M1 MacBook using a Rosetta # terminal. We do not support the compilation of OCIO using a Rosetta terminal on an Apple M1 @@ -204,13 +191,12 @@ if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|X86|x86_64|i386|i686) # to compile for intel-based Macbook or do a universal build with CMAKE_OSX_ARCHITECTURES="arm64;x86_64". # Also note that Rosetta does not support any AVXs or F16C instructions. - # Assumes an Intel-based architecture. + # Assumes an Intel-based architecture or Apple ARM under Rosetta. set(OCIO_ARCH_X86 1) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX ON) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C ON) -elseif(APPLE AND OCIO_APPLE_CHIP_PLATFORM MATCHES "M" - AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64" +elseif(APPLE AND "${CMAKE_HOST_SYSTEM_PROCESSOR}" MATCHES "arm64" AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64")) @@ -224,7 +210,7 @@ elseif(APPLE AND OCIO_APPLE_CHIP_PLATFORM MATCHES "M" set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) # Not supported. 
set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) -elseif(APPLE AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64" +elseif(APPLE AND "${CMAKE_HOST_SYSTEM_PROCESSOR}" MATCHES "arm64" AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64") # The user wants to do a arm64 only build. diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst index f0b362bb1d..d5d83cfb02 100644 --- a/docs/quick_start/installation.rst +++ b/docs/quick_start/installation.rst @@ -303,7 +303,9 @@ set to OFF on Apple ARM chipset because of the following two reasons: On the MacOS under the ARM architecture, the default is to make a universal build (natively supporting both the Intel and ARM processors). The ``-DCMAKE_OSX_ARCHITECTURES`` option -may be set to just arm64 or x86_64 to override the default value, which is ``arm64;x86_64``. +may be set to just arm64 or x86_64 to override the default value, which is ``arm64;x86_64``. Universal +build under Rosetta is not supported at this time. Under Rosetta, only the x86_64 build is supported +by using ``-DCMAKE_OSX_ARCHITECTURES="x86_64"``. When doing a universal build, note that the OCIO dependencies must be built as universal libraries too. If you are running in OCIO_INSTALL_EXT_PACKAGES=MISSING or NONE mode, your build will fail if From f799a2716d2eeb68ba9cdd9a9e90d7a107a7fc5c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Fri, 25 Aug 2023 09:56:15 -0400 Subject: [PATCH 07/22] Fixing the build under Rosetta and fixing issue in the MacOS CI. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- CMakeLists.txt | 42 ++++++++++----------------- docs/quick_start/installation.rst | 14 ++++----- share/cmake/utils/CompilerFlags.cmake | 2 +- src/OpenColorIO/CMakeLists.txt | 2 +- src/OpenColorIO/CPUInfo.cpp | 32 +++++--------------- src/OpenColorIO/CPUInfo.h | 6 ++-- src/OpenColorIO/CPUInfoConfig.h.in | 1 - tests/cpu/CMakeLists.txt | 4 +-- tests/cpu/UnitTestMain.cpp | 2 +- 9 files changed, 36 insertions(+), 69 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e9f2b99f1e..5c7704ef85 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,42 +184,26 @@ endif() option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ON) option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF) -if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|X86|x86_64|i386|i686)") - # Note that CMAKE_SYSTEM_PROCESSOR will return x86_64 on an Apple M1 MacBook using a Rosetta - # terminal. We do not support the compilation of OCIO using a Rosetta terminal on an Apple M1 - # MacBook. The user should compile OCIO using a native terminal (ARM) and use CMAKE_OSX_ARCHITECTURES="x86_64" - # to compile for intel-based Macbook or do a universal build with CMAKE_OSX_ARCHITECTURES="arm64;x86_64". - # Also note that Rosetta does not support any AVXs or F16C instructions. +if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|X86|x86_64|i386|i686)" OR + (APPLE AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64" AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "(arm64;x86_64|x86_64;arm64|x86_64)")) - # Assumes an Intel-based architecture or Apple ARM under Rosetta. + # Enable OCIO_ARCH_X86 for any intel-based architecture or Apple Rosetta (x86_64 or universal build). 
set(OCIO_ARCH_X86 1) + + # For a Mac OS universal build, OCIO translates the SSE instructions into ARM Neon with + # the library SSE2NEON for the arm64 side of the binary. set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX ON) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C ON) -elseif(APPLE AND "${CMAKE_HOST_SYSTEM_PROCESSOR}" MATCHES "arm64" - AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64")) - - # The user wants to do a universal build (arm64 + x86_64) or a x86_64 only build. - set(OCIO_APPLE_M1_ARCH_X86 1) - - # OCIO translates SSE using SSE2NEON library. - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) - # OCIO does not have any process to translate from AVXs to ARM Neon. - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) - # Not supported. - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) -elseif(APPLE AND "${CMAKE_HOST_SYSTEM_PROCESSOR}" MATCHES "arm64" - AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64") +elseif(APPLE AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64" AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64") - # The user wants to do a arm64 only build. + # Apple ARM only build. - # OCIO translates SSE using SSE2NEON library. + # OCIO translates the SSE instructions into ARM Neon with the library SSE2NEON. set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) - # OCIO does not have any process to translate from AVXs to ARM Neon. + # OCIO does not translate AVX to ARM Neon. set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) - # Not supported. + # Turn F16C off since they are x86_64 instructions. 
set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) else() set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE OFF) @@ -238,6 +222,10 @@ option(OCIO_USE_AVX2 "Specify whether to enable AVX2 CPU performance optimizatio option(OCIO_USE_AVX512 "Specify whether to enable AVX512 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX}) option(OCIO_USE_F16C "Specify whether to enable F16C CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C}) +if (APPLE) + option(OCIO_USE_SSE2NEON "Specify whether to enable SSE CPU performance optimizations using SSE2NEON for Apple ARM architecture" ON) +endif() + ############################################################################### # GPU configuration message(STATUS "") diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst index d5d83cfb02..5949e3e86d 100644 --- a/docs/quick_start/installation.rst +++ b/docs/quick_start/installation.rst @@ -288,6 +288,7 @@ Here are the most common OCIO-specific CMake options (the default values are sho - ``-DOCIO_USE_AVX2=ON`` (Set to OFF to turn off AVX2 CPU performance optimizations) - ``-DOCIO_USE_AVX512=ON`` (Set to OFF to turn off AVX512 CPU performance optimizations) - ``-DOCIO_USE_F16C=ON`` (Set to OFF to turn off F16C CPU performance optimizations) +- ``-DOCIO_USE_SSE2NEON=ON`` (Apple Only; Set to OFF to turn off the SSE2NEON translation performance optimizations on Apple ARM) - ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests) - ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests) - ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering) @@ -295,17 +296,12 @@ Here are the most common OCIO-specific CMake options (the default values are sho - ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation) - ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation) -Note that *OCIO_USE_AVX*, *OCIO_USE_AVX2*, *OCIO_USE_AVX512* and *OCIO_USE_F16C* default values are -set to 
OFF on Apple ARM chipset because of the following two reasons: +Note that OCIO will turn off any specific SIMD CPU performance optimizations if they are not supported +by the build target architecture. -- Rosetta does not support these instructions -- OCIO does not currently use a library to translate these instructions into ARM Neon. - -On the MacOS under the ARM architecture, the default is to make a universal build +On the MacOS, the default is to build universal binaries (natively supporting both the Intel and ARM processors). The ``-DCMAKE_OSX_ARCHITECTURES`` option -may be set to just arm64 or x86_64 to override the default value, which is ``arm64;x86_64``. Universal -build under Rosetta is not supported at this time. Under Rosetta, only the x86_64 build is supported -by using ``-DCMAKE_OSX_ARCHITECTURES="x86_64"``. +may be set to just arm64 or x86_64 to override the default value, which is ``arm64;x86_64``. When doing a universal build, note that the OCIO dependencies must be built as universal libraries too. 
If you are running in OCIO_INSTALL_EXT_PACKAGES=MISSING or NONE mode, your build will fail if diff --git a/share/cmake/utils/CompilerFlags.cmake b/share/cmake/utils/CompilerFlags.cmake index 0f7f3f3e2a..fcee6d1df0 100644 --- a/share/cmake/utils/CompilerFlags.cmake +++ b/share/cmake/utils/CompilerFlags.cmake @@ -12,7 +12,7 @@ set(PLATFORM_LINK_OPTIONS "") # Verify SIMD compatibility if(OCIO_USE_SIMD) - if (OCIO_ARCH_X86 OR OCIO_APPLE_M1_ARCH_X86) + if (OCIO_ARCH_X86) include(CheckSupportX86SIMD) endif() diff --git a/src/OpenColorIO/CMakeLists.txt b/src/OpenColorIO/CMakeLists.txt index 5e8291a8db..7f7126ce0d 100755 --- a/src/OpenColorIO/CMakeLists.txt +++ b/src/OpenColorIO/CMakeLists.txt @@ -206,7 +206,7 @@ endif() configure_file(res/OpenColorIO.pc.in ${CMAKE_CURRENT_BINARY_DIR}/OpenColorIO.pc @ONLY) install(FILES ${CMAKE_CURRENT_BINARY_DIR}/OpenColorIO.pc DESTINATION ${CMAKE_INSTALL_LIBDIR}/pkgconfig) -if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_APPLE_M1_ARCH_X86 OR OCIO_USE_SSE2NEON)) +if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_USE_SSE2NEON)) # Note that these files are gated by preprocessors to remove them based on the OCIO_USE_* vars. set_property(SOURCE ops/lut1d/Lut1DOpCPU_SSE2.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_SSE2_ARGS}) set_property(SOURCE ops/lut1d/Lut1DOpCPU_AVX.cpp APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS}) diff --git a/src/OpenColorIO/CPUInfo.cpp b/src/OpenColorIO/CPUInfo.cpp index 4b3f05d861..0b582a345b 100644 --- a/src/OpenColorIO/CPUInfo.cpp +++ b/src/OpenColorIO/CPUInfo.cpp @@ -17,7 +17,7 @@ typedef __int64 int64_t; namespace OCIO_NAMESPACE { -#if defined(OCIO_ARCH_X86) +#if !defined(__aarch64__) && defined(OCIO_ARCH_X86) // Intel-based processor or Apple Rosetta x86_64. namespace { @@ -181,40 +181,24 @@ CPUInfo& CPUInfo::instance() static CPUInfo singleton = CPUInfo(); return singleton; } - -#elif defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON +#elif defined(__aarch64__) // ARM Processor or Apple ARM. 
CPUInfo::CPUInfo() { flags = 0; memset(name, 0, sizeof(name)); - // Hardcode name to Apple ARM. - snprintf(name, sizeof(name), "%s", "Apple ARM"); - - // Note that Rosetta does not support any AVX instructions. - // See https://developer.apple.com/documentation/apple-silicon/about-the-rosetta-translation-environment#What-Cant-Be-Translated - - // Also note that during testing on a Apple M1 MacBook, SSE 4.2 does not seems to be supported. + snprintf(name, sizeof(name), "%s", "ARM"); -#if !defined(__aarch64__) - // Enable SSE2 instructions support using Rosetta for the x86_64 architecture on Apple ARM cpu. - if (OCIO_USE_SSE2) - { - flags |= X86_CPU_FLAG_SSE2; - } - //TODO: Once the other SSE instructions are implemented into OCIO, these can be enabled here. -#elif defined(__aarch64__) - // ARM architecture A64 (ARM64) // SSE2NEON library supports SSE, SSE2, SSE3, SSSE3, SSE4.1 and SSE4.2. // It does not support any AVX instructions. if (OCIO_USE_SSE2) { flags |= X86_CPU_FLAG_SSE2; + flags |= X86_CPU_FLAG_SSE3; + flags |= X86_CPU_FLAG_SSSE3; + flags |= X86_CPU_FLAG_SSE4; + flags |= X86_CPU_FLAG_SSE42; } - //TODO: Once the other SSE instructions are implemented into OCIO, these can be enabled here. -#endif - - } CPUInfo& CPUInfo::instance() @@ -222,6 +206,6 @@ CPUInfo& CPUInfo::instance() static CPUInfo singleton = CPUInfo(); return singleton; } -#endif // ARCH_X86 +#endif } // namespace OCIO_NAMESPACE \ No newline at end of file diff --git a/src/OpenColorIO/CPUInfo.h b/src/OpenColorIO/CPUInfo.h index f21956d191..ad493c9b02 100644 --- a/src/OpenColorIO/CPUInfo.h +++ b/src/OpenColorIO/CPUInfo.h @@ -36,7 +36,7 @@ namespace OCIO_NAMESPACE #define x86_check_flags(cpuext) \ (OCIO_USE_ ## cpuext && ((flags) & X86_CPU_FLAG_ ## cpuext)) -#if defined(OCIO_ARCH_X86) +#if !defined(__aarch64__) && defined(OCIO_ARCH_X86) // Intel-based processor or Apple Rosetta x86_64. 
struct CPUInfo { @@ -79,7 +79,7 @@ struct CPUInfo #undef x86_check_flags -#elif defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON +#elif defined(__aarch64__) // ARM Processor or Apple ARM. #define check_flags(cpuext) \ (OCIO_USE_ ## cpuext && ((flags) & X86_CPU_FLAG_ ## cpuext)) @@ -118,7 +118,7 @@ struct CPUInfo #undef x86_check_flags -#endif // OCIO_ARCH_X86 +#endif } // namespace OCIO_NAMESPACE diff --git a/src/OpenColorIO/CPUInfoConfig.h.in b/src/OpenColorIO/CPUInfoConfig.h.in index f6330a5547..68fdb55831 100644 --- a/src/OpenColorIO/CPUInfoConfig.h.in +++ b/src/OpenColorIO/CPUInfoConfig.h.in @@ -3,7 +3,6 @@ #cmakedefine OCIO_ARCH_X86 -#cmakedefine OCIO_APPLE_M1_ARCH_X86 #cmakedefine01 OCIO_USE_SSE2 #cmakedefine01 OCIO_USE_SSE3 diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt index 1c44bfc3d6..e44afadf93 100755 --- a/tests/cpu/CMakeLists.txt +++ b/tests/cpu/CMakeLists.txt @@ -71,7 +71,7 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) LINK_OPTIONS "${PLATFORM_LINK_OPTIONS}" ) - if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_APPLE_M1_ARCH_X86 OR OCIO_USE_SSE2NEON)) + if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_USE_SSE2NEON)) add_test(NAME ${TEST_NAME} COMMAND ${TEST_BINARY}) add_test(NAME ${TEST_NAME}_no_accel COMMAND ${TEST_BINARY} --no_accel) @@ -316,7 +316,7 @@ prepend(SOURCES "${PROJECT_SOURCE_DIR}/src/OpenColorIO/" ${SOURCES}) list(APPEND SOURCES ${TESTS}) -if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_APPLE_M1_ARCH_X86 OR OCIO_USE_SSE2NEON)) +if(OCIO_USE_SIMD AND (OCIO_ARCH_X86 OR OCIO_USE_SSE2NEON)) # Note that these files are gated by preprocessors to remove them based on the OCIO_USE_* vars. 
set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_SSE2_ARGS}) set_property(SOURCE "${CMAKE_SOURCE_DIR}/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp" APPEND PROPERTY COMPILE_OPTIONS ${OCIO_AVX_ARGS}) diff --git a/tests/cpu/UnitTestMain.cpp b/tests/cpu/UnitTestMain.cpp index bcdd22a381..61ae8adf89 100644 --- a/tests/cpu/UnitTestMain.cpp +++ b/tests/cpu/UnitTestMain.cpp @@ -39,7 +39,7 @@ OCIO_ADD_TEST(UnitTest, windows_debug) #endif -#if defined(OCIO_ARCH_X86) || defined(OCIO_APPLE_M1_ARCH_X86) || OCIO_USE_SSE2NEON +#if defined(OCIO_ARCH_X86) || OCIO_USE_SSE2NEON #define ENABLE_SIMD_USAGE #endif From 590be95090120535a32e556c07bb800f5c150928 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Mon, 28 Aug 2023 08:19:04 -0400 Subject: [PATCH 08/22] Using try_compile for CheckSupportSSEUsingSSE2NEON to standardize the code and fixed CheckSupport compiler flags. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- share/cmake/utils/CheckSupportAVX.cmake | 28 +++++++++++++++---- share/cmake/utils/CheckSupportAVX2.cmake | 28 +++++++++++++++---- share/cmake/utils/CheckSupportAVX512.cmake | 28 +++++++++++++++---- share/cmake/utils/CheckSupportF16C.cmake | 26 ++++++++++++++--- share/cmake/utils/CheckSupportSSE2.cmake | 8 +++--- share/cmake/utils/CheckSupportSSE3.cmake | 8 +++--- share/cmake/utils/CheckSupportSSE4.cmake | 8 +++--- share/cmake/utils/CheckSupportSSE42.cmake | 8 +++--- .../utils/CheckSupportSSEUsingSSE2NEON.cmake | 18 ++++++++++-- share/cmake/utils/CheckSupportSSSE3.cmake | 8 +++--- 10 files changed, 126 insertions(+), 42 deletions(-) diff --git a/share/cmake/utils/CheckSupportAVX.cmake b/share/cmake/utils/CheckSupportAVX.cmake index 89b7688997..c4a6015895 100644 --- a/share/cmake/utils/CheckSupportAVX.cmake +++ b/share/cmake/utils/CheckSupportAVX.cmake @@ -1,12 +1,24 @@ # SPDX-License-Identifier: 
BSD-3-Clause # Copyright Contributors to the OpenColorIO Project. -set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") + +if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") + set(__universal_build 1) + set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") +endif() if(MSVC) - set(CMAKE_REQUIRED_FLAGS "/w /arch:AVX") + set(CMAKE_CXX_FLAGS "/w /arch:AVX") elseif(USE_GCC OR USE_CLANG) - set(CMAKE_REQUIRED_FLAGS "-w -mavx") + set(CMAKE_CXX_FLAGS "-w -mavx") +endif() + +if (APPLE AND __universal_build) + # Force the test to build under x86_64 + set(CMAKE_OSX_ARCHITECTURES "x86_64") + # Apple has an automatic translation layer from SSE to ARM Neon. endif() set(AVX_CODE " @@ -38,5 +50,11 @@ else() message(STATUS "Performing Test COMPILER_SUPPORTS_AVX - Failed") endif() -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -unset(_cmake_required_flags_orig) +set(CMAKE_CXX_FLAGS "${_cmake_cxx_flags_orig}") +unset(_cmake_cxx_flags_orig) + +if(__universal_build) + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + unset(_cmake_osx_architectures_orig) + unset(__universal_build) +endif() diff --git a/share/cmake/utils/CheckSupportAVX2.cmake b/share/cmake/utils/CheckSupportAVX2.cmake index 611394f314..c81673cd6e 100644 --- a/share/cmake/utils/CheckSupportAVX2.cmake +++ b/share/cmake/utils/CheckSupportAVX2.cmake @@ -3,12 +3,24 @@ include(CheckCXXSourceCompiles) -set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") + +if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") + set(__universal_build 1) + set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") +endif() if(MSVC) - set(CMAKE_REQUIRED_FLAGS "/w /arch:AVX2") + set(CMAKE_CXX_FLAGS "/w /arch:AVX2") elseif(USE_GCC OR USE_CLANG) - 
set(CMAKE_REQUIRED_FLAGS "-w -mavx2 -mfma -mf16c") + set(CMAKE_CXX_FLAGS "-w -mavx2 -mfma -mf16c") +endif() + +if (APPLE AND __universal_build) + # Force the test to build under x86_64 + set(CMAKE_OSX_ARCHITECTURES "x86_64") + # Apple has an automatic translation layer from SSE to ARM Neon. endif() set(AVX2_CODE " @@ -46,5 +58,11 @@ else() message(STATUS "Performing Test COMPILER_SUPPORTS_AVX2 - Failed") endif() -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -unset(_cmake_required_flags_orig) \ No newline at end of file +set(CMAKE_CXX_FLAGS "${_cmake_cxx_flags_orig}") +unset(_cmake_cxx_flags_orig) + +if(__universal_build) + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + unset(_cmake_osx_architectures_orig) + unset(__universal_build) +endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportAVX512.cmake b/share/cmake/utils/CheckSupportAVX512.cmake index c3989e0fab..5ef3ea038c 100644 --- a/share/cmake/utils/CheckSupportAVX512.cmake +++ b/share/cmake/utils/CheckSupportAVX512.cmake @@ -3,12 +3,24 @@ include(CheckCXXSourceCompiles) -set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") + +if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") + set(__universal_build 1) + set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") +endif() if(MSVC) - set(CMAKE_REQUIRED_FLAGS "/w /arch:AVX512") + set(CMAKE_CXX_FLAGS "/w /arch:AVX512") elseif(USE_GCC OR USE_CLANG) - set(CMAKE_REQUIRED_FLAGS "-w -mavx512f") + set(CMAKE_CXX_FLAGS "-w -mavx512f") +endif() + +if (APPLE AND __universal_build) + # Force the test to build under x86_64 + set(CMAKE_OSX_ARCHITECTURES "x86_64") + # Apple has an automatic translation layer from SSE to ARM Neon. 
endif() set(AVX512_CODE " @@ -34,5 +46,11 @@ else() message(STATUS "Performing Test COMPILER_SUPPORTS_AVX512 - Failed") endif() -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -unset(_cmake_required_flags_orig) \ No newline at end of file +set(CMAKE_CXX_FLAGS "${_cmake_cxx_flags_orig}") +unset(_cmake_cxx_flags_orig) + +if(__universal_build) + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + unset(_cmake_osx_architectures_orig) + unset(__universal_build) +endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportF16C.cmake b/share/cmake/utils/CheckSupportF16C.cmake index 90825ee4cd..400d065b0b 100644 --- a/share/cmake/utils/CheckSupportF16C.cmake +++ b/share/cmake/utils/CheckSupportF16C.cmake @@ -3,11 +3,23 @@ include(CheckCXXSourceCompiles) -set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") + +if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") + set(__universal_build 1) + set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") +endif() # MSVC doesn't have flags if(USE_GCC OR USE_CLANG) - set(CMAKE_REQUIRED_FLAGS "-w -mf16c") + set(CMAKE_CXX_FLAGS "-w -mf16c") +endif() + +if (APPLE AND __universal_build) + # Force the test to build under x86_64 + set(CMAKE_OSX_ARCHITECTURES "x86_64") + # Apple has an automatic translation layer from SSE to ARM Neon. 
endif() set(F16C_CODE " @@ -34,5 +46,11 @@ else() message(STATUS "Performing Test COMPILER_SUPPORTS_F16C - Failed") endif() -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -unset(_cmake_required_flags_orig) \ No newline at end of file +set(CMAKE_CXX_FLAGS "${_cmake_cxx_flags_orig}") +unset(_cmake_cxx_flags_orig) + +if(__universal_build) + set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") + unset(_cmake_osx_architectures_orig) + unset(__universal_build) +endif() \ No newline at end of file diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake index 099c705cb4..67dd89828a 100644 --- a/share/cmake/utils/CheckSupportSSE2.cmake +++ b/share/cmake/utils/CheckSupportSSE2.cmake @@ -3,7 +3,7 @@ include(CheckCXXSourceCompiles) -set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") @@ -22,7 +22,7 @@ if(MSVC) check_cxx_compiler_flag("/arch:SSE2" COMPILER_SUPPORTS_SSE2) endif() elseif(USE_GCC OR USE_CLANG) - set(CMAKE_REQUIRED_FLAGS "-w -msse2") + set(CMAKE_CXX_FLAGS "-w -msse2") endif() if (APPLE AND __universal_build) @@ -59,8 +59,8 @@ else() message(STATUS "Performing Test COMPILER_SUPPORTS_SSE2 - Failed") endif() -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -unset(_cmake_required_flags_orig) +set(CMAKE_REQUIRED_FLAGS "${_cmake_cxx_flags_orig}") +unset(_cmake_cxx_flags_orig) if(__universal_build) set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") diff --git a/share/cmake/utils/CheckSupportSSE3.cmake b/share/cmake/utils/CheckSupportSSE3.cmake index a1c76b1546..d87daab1c6 100644 --- a/share/cmake/utils/CheckSupportSSE3.cmake +++ b/share/cmake/utils/CheckSupportSSE3.cmake @@ -3,7 +3,7 @@ include(CheckCXXSourceCompiles) -set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_cxx_flags_orig 
"${CMAKE_CXX_FLAGS}") if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") @@ -13,7 +13,7 @@ endif() # MSVC doesn't have flags if(USE_GCC OR USE_CLANG) - set(CMAKE_REQUIRED_FLAGS "-w -msse3") + set(CMAKE_CXX_FLAGS "-w -msse3") endif() if (APPLE AND __universal_build) @@ -46,8 +46,8 @@ else() message(STATUS "Performing Test COMPILER_SUPPORTS_SSE3 - Failed") endif() -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -unset(_cmake_required_flags_orig) +set(CMAKE_CXX_FLAGS "${_cmake_cxx_flags_orig}") +unset(_cmake_cxx_flags_orig) if(__universal_build) set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") diff --git a/share/cmake/utils/CheckSupportSSE4.cmake b/share/cmake/utils/CheckSupportSSE4.cmake index 3bfc7095a7..4e3e815834 100644 --- a/share/cmake/utils/CheckSupportSSE4.cmake +++ b/share/cmake/utils/CheckSupportSSE4.cmake @@ -3,7 +3,7 @@ include(CheckCXXSourceCompiles) -set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") @@ -13,7 +13,7 @@ endif() # MSVC doesn't have flags if(USE_GCC OR USE_CLANG) - set(CMAKE_REQUIRED_FLAGS "-w -msse4") + set(CMAKE_CXX_FLAGS "-w -msse4") endif() if (APPLE AND __universal_build) @@ -46,8 +46,8 @@ else() message(STATUS "Performing Test COMPILER_SUPPORTS_SSE4 - Failed") endif() -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -unset(_cmake_required_flags_orig) +set(CMAKE_CXX_FLAGS "${_cmake_cxx_flags_orig}") +unset(_cmake_cxx_flags_orig) if(__universal_build) set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") diff --git a/share/cmake/utils/CheckSupportSSE42.cmake b/share/cmake/utils/CheckSupportSSE42.cmake index 521885c89f..4a002b87b9 100644 --- a/share/cmake/utils/CheckSupportSSE42.cmake +++ b/share/cmake/utils/CheckSupportSSE42.cmake @@ -3,7 +3,7 @@ 
include(CheckCXXSourceCompiles) -set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") @@ -13,7 +13,7 @@ endif() # MSVC doesn't have flags if(USE_GCC OR USE_CLANG) - set(CMAKE_REQUIRED_FLAGS "-w -msse4.2") + set(CMAKE_CXX_FLAGS "-w -msse4.2") endif() if (APPLE AND __universal_build) @@ -46,8 +46,8 @@ else() message(STATUS "Performing Test COMPILER_SUPPORTS_SSE42 - Failed") endif() -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -unset(_cmake_required_flags_orig) +set(CMAKE_CXX_FLAGS "${_cmake_cxx_flags_orig}") +unset(_cmake_cxx_flags_orig) if(__universal_build) set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") diff --git a/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake b/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake index eb868dd8a3..c47c8be701 100644 --- a/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake +++ b/share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake @@ -8,8 +8,6 @@ set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}") set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") if(APPLE AND COMPILER_SUPPORTS_ARM_NEON) - - set(CMAKE_REQUIRED_INCLUDES ${sse2neon_INCLUDE_DIR}) if("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") @@ -45,7 +43,21 @@ if(APPLE AND COMPILER_SUPPORTS_ARM_NEON) return (0); } ") - check_cxx_source_compiles("${SSE2NEON_CODE}" COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) + + file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/sse2neon_test.cpp" "${SSE2NEON_CODE}") + + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE_WITH_SSE2NEON") + try_compile(COMPILER_SUPPORTS_SSE_WITH_SSE2NEON + "${CMAKE_BINARY_DIR}/CMakeTmp" + "${CMAKE_BINARY_DIR}/CMakeTmp/sse2neon_test.cpp" + COMPILE_DEFINITIONS "-I${sse2neon_INCLUDE_DIR}" + ) + + if(COMPILER_SUPPORTS_SSE_WITH_SSE2NEON) + 
message(STATUS "Performing Test COMPILER_SUPPORTS_SSE_WITH_SSE2NEON - Success") + else() + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE_WITH_SSE2NEON - Failed") + endif() endif() set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") diff --git a/share/cmake/utils/CheckSupportSSSE3.cmake b/share/cmake/utils/CheckSupportSSSE3.cmake index 3749e3e90d..a0ead45445 100644 --- a/share/cmake/utils/CheckSupportSSSE3.cmake +++ b/share/cmake/utils/CheckSupportSSSE3.cmake @@ -3,7 +3,7 @@ include(CheckCXXSourceCompiles) -set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}") +set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") @@ -13,7 +13,7 @@ endif() # MSVC doesn't have flags if(USE_GCC OR USE_CLANG) - set(CMAKE_REQUIRED_FLAGS "-w -msse3") + set(CMAKE_CXX_FLAGS "-w -msse3") endif() if (APPLE AND __universal_build) @@ -46,8 +46,8 @@ else() message(STATUS "Performing Test COMPILER_SUPPORTS_SSSE3 - Failed") endif() -set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}") -unset(_cmake_required_flags_orig) +set(CMAKE_CXX_FLAGS "${_cmake_cxx_flags_orig}") +unset(_cmake_cxx_flags_orig) if(__universal_build) set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}") From c31c07dcc52f68f6a1d45e800b5e849624a455fd Mon Sep 17 00:00:00 2001 From: Mark Reid Date: Sat, 26 Aug 2023 19:03:13 -0700 Subject: [PATCH 09/22] use emmintrin.h for only sse2 features MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mark Reid Signed-off-by: Cédrik Fuoco From 8a864dd2872b7ef9b4b015bd397bb0c00a24d012 Mon Sep 17 00:00:00 2001 From: Mark Reid Date: Sat, 26 Aug 2023 19:01:14 -0700 Subject: [PATCH 10/22] Allow F16C to be completely turned off MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Mark Reid Signed-off-by: Cédrik Fuoco --- src/OpenColorIO/AVX2.h | 4 ++ 
src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp | 6 ++- tests/cpu/AVX2_tests.cpp | 51 ++++++++++++++----- tests/cpu/AVX_tests.cpp | 4 ++ tests/cpu/CMakeLists.txt | 3 ++ tests/cpu/SSE2_tests.cpp | 4 ++ 6 files changed, 57 insertions(+), 15 deletions(-) diff --git a/src/OpenColorIO/AVX2.h b/src/OpenColorIO/AVX2.h index dfbfa5dbc7..404fc046c1 100644 --- a/src/OpenColorIO/AVX2.h +++ b/src/OpenColorIO/AVX2.h @@ -234,6 +234,8 @@ struct AVX2RGBAPack } }; +#if OCIO_USE_F16C + template <> struct AVX2RGBAPack { @@ -271,6 +273,8 @@ struct AVX2RGBAPack } }; +#endif + template <> struct AVX2RGBAPack { diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp index 47155b2720..a4565a0335 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp @@ -127,7 +127,11 @@ inline Lut1DOpCPUApplyFunc * GetConvertInBitDepth(BitDepth outBD) case BIT_DEPTH_UINT16: return linear1D; case BIT_DEPTH_F16: - return linear1D; +#if OCIO_USE_F16C + if (CPUInfo::instance().hasF16C()) + return linear1D; + break; +#endif case BIT_DEPTH_F32: return linear1D; case BIT_DEPTH_UINT14: diff --git a/tests/cpu/AVX2_tests.cpp b/tests/cpu/AVX2_tests.cpp index cd78182130..421621eed0 100644 --- a/tests/cpu/AVX2_tests.cpp +++ b/tests/cpu/AVX2_tests.cpp @@ -20,6 +20,9 @@ namespace OCIO = OCIO_NAMESPACE; #define AVX2_CHECK() \ if (!OCIO::CPUInfo::instance().hasAVX2()) throw SkipException() +#define HAS_F16C() \ + OCIO::CPUInfo::instance().hasF16C() + namespace { @@ -68,12 +71,16 @@ float scale_unsigned(unsigned i) return static_cast(i) * 1.0f/65535.0f; } +#if OCIO_USE_F16C + template <> half scale_unsigned(unsigned i) { return static_cast(1.0f/65535.0f * static_cast(i)); } +#endif + template void testConvert_OutBitDepth() { @@ -126,17 +133,21 @@ void testConvert_InBitDepth(OCIO::BitDepth outBD) switch(outBD) { case OCIO::BIT_DEPTH_UINT8: - return testConvert_OutBitDepth(); return; + return testConvert_OutBitDepth(); 
case OCIO::BIT_DEPTH_UINT10: - return testConvert_OutBitDepth(); return; + return testConvert_OutBitDepth(); case OCIO::BIT_DEPTH_UINT12: - return testConvert_OutBitDepth(); return; + return testConvert_OutBitDepth(); case OCIO::BIT_DEPTH_UINT16: - return testConvert_OutBitDepth(); return; + return testConvert_OutBitDepth(); case OCIO::BIT_DEPTH_F16: - return testConvert_OutBitDepth(); return; +#if OCIO_USE_F16C + if (HAS_F16C()) + return testConvert_OutBitDepth(); +#endif + break; case OCIO::BIT_DEPTH_F32: - return testConvert_OutBitDepth(); return; + return testConvert_OutBitDepth(); case OCIO::BIT_DEPTH_UINT14: case OCIO::BIT_DEPTH_UINT32: @@ -255,6 +266,8 @@ OCIO_ADD_TEST(AVX2, packed_uint16_to_f32_test) } } +#if OCIO_USE_F16C + OCIO_ADD_TEST(AVX2, packed_f16_to_f32_test) { AVX2_CHECK(); @@ -283,6 +296,7 @@ OCIO_ADD_TEST(AVX2, packed_f16_to_f32_test) } } +#endif OCIO_ADD_TEST(AVX2, packed_nan_inf_test) { @@ -305,16 +319,22 @@ OCIO_ADD_TEST(AVX2, packed_nan_inf_test) 100000.0f, 200000.0f, -10.0f, -2000.0f, 65535.0f, 65537.0f, -65536.0f, -65537.0f }; - OCIO::AVX2RGBAPack::Load(&pixels[0], r, g, b, a); - OCIO::AVX2RGBAPack::Store(&outImageHalf[0], r, g, b, a); - - for (unsigned i = 0; i < outImageHalf.size(); i++) +#if OCIO_USE_F16C + if(HAS_F16C()) { - OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer((half)pixels[i], (float)outImageHalf[i], 0, false), - GetErrorMessage((half)pixels[i], (float)outImageHalf[i], - OCIO::BIT_DEPTH_F32, OCIO::BIT_DEPTH_F16)); + OCIO::AVX2RGBAPack::Load(&pixels[0], r, g, b, a); + OCIO::AVX2RGBAPack::Store(&outImageHalf[0], r, g, b, a); + + for (unsigned i = 0; i < outImageHalf.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer((half)pixels[i], (float)outImageHalf[i], 0, false), + GetErrorMessage((half)pixels[i], (float)outImageHalf[i], + OCIO::BIT_DEPTH_F32, OCIO::BIT_DEPTH_F16)); + } } +#endif + const uint8_t resultU8[32] = { 0, 0, 0, 0, 255, 0, 3, 0, 255, 255, 255, 255, @@ -425,7 +445,10 @@ OCIO_ADD_TEST(AVX2, 
packed_all_test) testConvert_InBitDepth(outBD); break; case OCIO::BIT_DEPTH_F16: - testConvert_InBitDepth(outBD); +#if OCIO_USE_F16C + if(HAS_F16C()) + testConvert_InBitDepth(outBD); +#endif break; case OCIO::BIT_DEPTH_F32: testConvert_InBitDepth(outBD); diff --git a/tests/cpu/AVX_tests.cpp b/tests/cpu/AVX_tests.cpp index 29c384b0ad..1836d6b15c 100644 --- a/tests/cpu/AVX_tests.cpp +++ b/tests/cpu/AVX_tests.cpp @@ -71,12 +71,16 @@ float scale_unsigned(unsigned i) return static_cast(i) * 1.0f/65535.0f; } +#if OCIO_USE_F16C + template <> half scale_unsigned(unsigned i) { return static_cast(1.0f/65535.0f * static_cast(i)); } +#endif + template void testConvert_OutBitDepth() { diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt index e44afadf93..9450d93fb9 100755 --- a/tests/cpu/CMakeLists.txt +++ b/tests/cpu/CMakeLists.txt @@ -91,6 +91,9 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) if(OCIO_USE_AVX2) add_test(NAME ${TEST_NAME}_avx2 COMMAND ${TEST_BINARY} --avx2) + if(${OCIO_USE_F16C}) + add_test(NAME ${TEST_NAME}_avx2+f16c COMMAND ${TEST_BINARY} --avx2 --f16c) + endif() endif() else() add_test(NAME ${TEST_NAME} COMMAND ${TEST_BINARY}) diff --git a/tests/cpu/SSE2_tests.cpp b/tests/cpu/SSE2_tests.cpp index 2deb148cce..468aa7260a 100644 --- a/tests/cpu/SSE2_tests.cpp +++ b/tests/cpu/SSE2_tests.cpp @@ -70,12 +70,16 @@ float scale_unsigned(unsigned i) return static_cast(i) * 1.0f/65535.0f; } +#if OCIO_USE_F16C + template <> half scale_unsigned(unsigned i) { return static_cast(1.0f/65535.0f * static_cast(i)); } +#endif + template void testConvert_OutBitDepth() { From 0065242deca5367bb40e6a66a87c47b43e4b6e0b Mon Sep 17 00:00:00 2001 From: Mark Reid Date: Sun, 27 Aug 2023 16:31:09 -0700 Subject: [PATCH 11/22] Seperate SIMD test code from code that adds unit tests. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This isolates the compulation units and avoids executing illegal hardware instructions Signed-off-by: Mark Reid Signed-off-by: Cédrik Fuoco --- tests/cpu/AVX2_tests.cpp | 17 +++++---- tests/cpu/AVX_tests.cpp | 17 +++++---- tests/cpu/CMakeLists.txt | 1 + tests/cpu/SIMD_tests.cpp | 79 ++++++++++++++++++++++++++++++++++++++++ tests/cpu/SSE2_tests.cpp | 17 +++++---- 5 files changed, 110 insertions(+), 21 deletions(-) create mode 100644 tests/cpu/SIMD_tests.cpp diff --git a/tests/cpu/AVX2_tests.cpp b/tests/cpu/AVX2_tests.cpp index 421621eed0..56f76174e9 100644 --- a/tests/cpu/AVX2_tests.cpp +++ b/tests/cpu/AVX2_tests.cpp @@ -23,6 +23,9 @@ namespace OCIO = OCIO_NAMESPACE; #define HAS_F16C() \ OCIO::CPUInfo::instance().hasF16C() +#define DEFINE_SIMD_TEST(name) \ +void avx2_test_##name() + namespace { @@ -159,7 +162,7 @@ void testConvert_InBitDepth(OCIO::BitDepth outBD) } -OCIO_ADD_TEST(AVX2, packed_uint8_to_float_test) +DEFINE_SIMD_TEST(packed_uint8_to_float_test) { AVX2_CHECK(); std::vector inImage(256); @@ -185,7 +188,7 @@ OCIO_ADD_TEST(AVX2, packed_uint8_to_float_test) } } -OCIO_ADD_TEST(AVX2, packed_uint10_to_f32_test) +DEFINE_SIMD_TEST(packed_uint10_to_f32_test) { AVX2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; @@ -212,7 +215,7 @@ OCIO_ADD_TEST(AVX2, packed_uint10_to_f32_test) } } -OCIO_ADD_TEST(AVX2, packed_uint12_to_f32_test) +DEFINE_SIMD_TEST(packed_uint12_to_f32_test) { AVX2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; @@ -239,7 +242,7 @@ OCIO_ADD_TEST(AVX2, packed_uint12_to_f32_test) } } -OCIO_ADD_TEST(AVX2, packed_uint16_to_f32_test) +DEFINE_SIMD_TEST(packed_uint16_to_f32_test) { AVX2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; @@ -268,7 +271,7 @@ OCIO_ADD_TEST(AVX2, packed_uint16_to_f32_test) #if OCIO_USE_F16C -OCIO_ADD_TEST(AVX2, packed_f16_to_f32_test) +DEFINE_SIMD_TEST(packed_f16_to_f32_test) { AVX2_CHECK(); size_t 
maxValue = OCIO::BitDepthInfo::maxValue + 1; @@ -298,7 +301,7 @@ OCIO_ADD_TEST(AVX2, packed_f16_to_f32_test) #endif -OCIO_ADD_TEST(AVX2, packed_nan_inf_test) +DEFINE_SIMD_TEST(packed_nan_inf_test) { AVX2_CHECK(); const float qnan = std::numeric_limits::quiet_NaN(); @@ -412,7 +415,7 @@ OCIO_ADD_TEST(AVX2, packed_nan_inf_test) } } -OCIO_ADD_TEST(AVX2, packed_all_test) +DEFINE_SIMD_TEST(packed_all_test) { AVX2_CHECK(); const std::vector< OCIO::BitDepth> formats = { diff --git a/tests/cpu/AVX_tests.cpp b/tests/cpu/AVX_tests.cpp index 1836d6b15c..dec835945a 100644 --- a/tests/cpu/AVX_tests.cpp +++ b/tests/cpu/AVX_tests.cpp @@ -23,6 +23,9 @@ namespace OCIO = OCIO_NAMESPACE; #define HAS_F16C() \ OCIO::CPUInfo::instance().hasF16C() +#define DEFINE_SIMD_TEST(name) \ +void avx_test_##name() + namespace { @@ -159,7 +162,7 @@ void testConvert_InBitDepth(OCIO::BitDepth outBD) } -OCIO_ADD_TEST(AVX, packed_uint8_to_float_test) +DEFINE_SIMD_TEST(packed_uint8_to_float_test) { AVX_CHECK(); std::vector inImage(256); @@ -185,7 +188,7 @@ OCIO_ADD_TEST(AVX, packed_uint8_to_float_test) } } -OCIO_ADD_TEST(AVX, packed_uint10_to_f32_test) +DEFINE_SIMD_TEST(packed_uint10_to_f32_test) { AVX_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; @@ -212,7 +215,7 @@ OCIO_ADD_TEST(AVX, packed_uint10_to_f32_test) } } -OCIO_ADD_TEST(AVX, packed_uint12_to_f32_test) +DEFINE_SIMD_TEST(packed_uint12_to_f32_test) { AVX_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; @@ -239,7 +242,7 @@ OCIO_ADD_TEST(AVX, packed_uint12_to_f32_test) } } -OCIO_ADD_TEST(AVX, packed_uint16_to_f32_test) +DEFINE_SIMD_TEST(packed_uint16_to_f32_test) { AVX_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; @@ -268,7 +271,7 @@ OCIO_ADD_TEST(AVX, packed_uint16_to_f32_test) #if OCIO_USE_F16C -OCIO_ADD_TEST(AVX, packed_f16_to_f32_test) +DEFINE_SIMD_TEST(packed_f16_to_f32_test) { AVX_CHECK(); if(!HAS_F16C()) throw SkipException(); @@ -300,7 +303,7 @@ OCIO_ADD_TEST(AVX, packed_f16_to_f32_test) #endif 
-OCIO_ADD_TEST(AVX, packed_nan_inf_test) +DEFINE_SIMD_TEST(packed_nan_inf_test) { AVX_CHECK(); const float qnan = std::numeric_limits::quiet_NaN(); @@ -413,7 +416,7 @@ OCIO_ADD_TEST(AVX, packed_nan_inf_test) } } -OCIO_ADD_TEST(AVX, packed_all_test) +DEFINE_SIMD_TEST(packed_all_test) { AVX_CHECK(); const std::vector< OCIO::BitDepth> formats = { diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt index 9450d93fb9..8a793f3541 100755 --- a/tests/cpu/CMakeLists.txt +++ b/tests/cpu/CMakeLists.txt @@ -272,6 +272,7 @@ set(TESTS PathUtils_tests.cpp Platform_tests.cpp Processor_tests.cpp + SIMD_tests.cpp SSE_tests.cpp SSE2_tests.cpp AVX_tests.cpp diff --git a/tests/cpu/SIMD_tests.cpp b/tests/cpu/SIMD_tests.cpp new file mode 100644 index 0000000000..ad44abf6b5 --- /dev/null +++ b/tests/cpu/SIMD_tests.cpp @@ -0,0 +1,79 @@ +#include "testutils/UnitTest.h" +#include "CPUInfo.h" + +namespace OCIO = OCIO_NAMESPACE; + +#if OCIO_USE_SSE2 + +#define SSE2_CHECK() \ + if (!OCIO::CPUInfo::instance().hasSSE2()) throw SkipException() + +#define OCIO_ADD_TEST_SSE2(name) \ +void sse2_test_##name(); \ +OCIO_ADD_TEST(SSE2, name) \ +{ \ + SSE2_CHECK(); \ + sse2_test_##name(); \ +} + +OCIO_ADD_TEST_SSE2(packed_uint8_to_float_test) +OCIO_ADD_TEST_SSE2(packed_uint10_to_f32_test) +OCIO_ADD_TEST_SSE2(packed_uint12_to_f32_test) +OCIO_ADD_TEST_SSE2(packed_uint16_to_f32_test) +#if OCIO_USE_F16C + OCIO_ADD_TEST_SSE2(packed_f16_to_f32_test) +#endif +OCIO_ADD_TEST_SSE2(packed_nan_inf_test) +OCIO_ADD_TEST_SSE2(packed_all_test) + +#endif + +#if OCIO_USE_AVX + +#define AVX_CHECK() \ + if (!OCIO::CPUInfo::instance().hasAVX()) throw SkipException() + +#define OCIO_ADD_TEST_AVX(name) \ +void avx_test_##name(); \ +OCIO_ADD_TEST(AVX, name) \ +{ \ + AVX_CHECK(); \ + avx_test_##name(); \ +} + +OCIO_ADD_TEST_AVX(packed_uint8_to_float_test) +OCIO_ADD_TEST_AVX(packed_uint10_to_f32_test) +OCIO_ADD_TEST_AVX(packed_uint12_to_f32_test) +OCIO_ADD_TEST_AVX(packed_uint16_to_f32_test) +#if OCIO_USE_F16C + 
OCIO_ADD_TEST_AVX(packed_f16_to_f32_test) +#endif +OCIO_ADD_TEST_AVX(packed_nan_inf_test) +OCIO_ADD_TEST_AVX(packed_all_test) + +#endif + +#if OCIO_USE_AVX2 + +#define AVX2_CHECK() \ + if (!OCIO::CPUInfo::instance().hasAVX2()) throw SkipException() + +#define OCIO_ADD_TEST_AVX2(name) \ +void avx2_test_##name(); \ +OCIO_ADD_TEST(AVX2, name) \ +{ \ + AVX2_CHECK(); \ + avx2_test_##name(); \ +} + +OCIO_ADD_TEST_AVX2(packed_uint8_to_float_test) +OCIO_ADD_TEST_AVX2(packed_uint10_to_f32_test) +OCIO_ADD_TEST_AVX2(packed_uint12_to_f32_test) +OCIO_ADD_TEST_AVX2(packed_uint16_to_f32_test) +#if OCIO_USE_F16C + OCIO_ADD_TEST_AVX2(packed_f16_to_f32_test) +#endif +OCIO_ADD_TEST_AVX2(packed_nan_inf_test) +OCIO_ADD_TEST_AVX2(packed_all_test) + +#endif \ No newline at end of file diff --git a/tests/cpu/SSE2_tests.cpp b/tests/cpu/SSE2_tests.cpp index 468aa7260a..85ad4d4cf8 100644 --- a/tests/cpu/SSE2_tests.cpp +++ b/tests/cpu/SSE2_tests.cpp @@ -22,6 +22,9 @@ namespace OCIO = OCIO_NAMESPACE; #define HAS_F16C() \ OCIO::CPUInfo::instance().hasF16C() +#define DEFINE_SIMD_TEST(name) \ +void sse2_test_##name() + namespace { @@ -158,7 +161,7 @@ void testConvert_InBitDepth(OCIO::BitDepth outBD) } -OCIO_ADD_TEST(SSE2, packed_uint8_to_float_test) +DEFINE_SIMD_TEST(packed_uint8_to_float_test) { SSE2_CHECK(); std::vector inImage(256); @@ -187,7 +190,7 @@ OCIO_ADD_TEST(SSE2, packed_uint8_to_float_test) } -OCIO_ADD_TEST(SSE2, packed_uint10_to_f32_test) +DEFINE_SIMD_TEST(packed_uint10_to_f32_test) { SSE2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; @@ -214,7 +217,7 @@ OCIO_ADD_TEST(SSE2, packed_uint10_to_f32_test) } } -OCIO_ADD_TEST(SSE2, packed_uint12_to_f32_test) +DEFINE_SIMD_TEST(packed_uint12_to_f32_test) { SSE2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; @@ -241,7 +244,7 @@ OCIO_ADD_TEST(SSE2, packed_uint12_to_f32_test) } } -OCIO_ADD_TEST(SSE2, packed_uint16_to_f32_test) +DEFINE_SIMD_TEST(packed_uint16_to_f32_test) { SSE2_CHECK(); size_t maxValue = 
OCIO::BitDepthInfo::maxValue + 1; @@ -270,7 +273,7 @@ OCIO_ADD_TEST(SSE2, packed_uint16_to_f32_test) #if OCIO_USE_F16C -OCIO_ADD_TEST(SSE2, packed_f16_to_f32_test) +DEFINE_SIMD_TEST(packed_f16_to_f32_test) { SSE2_CHECK(); if(!HAS_F16C()) throw SkipException(); @@ -303,7 +306,7 @@ OCIO_ADD_TEST(SSE2, packed_f16_to_f32_test) #endif -OCIO_ADD_TEST(SSE2, packed_nan_inf_test) +DEFINE_SIMD_TEST(packed_nan_inf_test) { SSE2_CHECK(); const float qnan = std::numeric_limits::quiet_NaN(); @@ -431,7 +434,7 @@ OCIO_ADD_TEST(SSE2, packed_nan_inf_test) } -OCIO_ADD_TEST(SSE2, packed_all_test) +DEFINE_SIMD_TEST(packed_all_test) { SSE2_CHECK(); const std::vector< OCIO::BitDepth> formats = { From 7534eea8816704521840d76a54dc27d72975bc82 Mon Sep 17 00:00:00 2001 From: Mark Reid Date: Sun, 27 Aug 2023 22:40:03 -0700 Subject: [PATCH 12/22] remove uneeded checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- tests/cpu/AVX2_tests.cpp | 10 ---------- tests/cpu/AVX_tests.cpp | 10 ---------- tests/cpu/SSE2_tests.cpp | 10 ---------- 3 files changed, 30 deletions(-) diff --git a/tests/cpu/AVX2_tests.cpp b/tests/cpu/AVX2_tests.cpp index 56f76174e9..e711661841 100644 --- a/tests/cpu/AVX2_tests.cpp +++ b/tests/cpu/AVX2_tests.cpp @@ -17,9 +17,6 @@ namespace OCIO = OCIO_NAMESPACE; -#define AVX2_CHECK() \ - if (!OCIO::CPUInfo::instance().hasAVX2()) throw SkipException() - #define HAS_F16C() \ OCIO::CPUInfo::instance().hasF16C() @@ -164,7 +161,6 @@ void testConvert_InBitDepth(OCIO::BitDepth outBD) DEFINE_SIMD_TEST(packed_uint8_to_float_test) { - AVX2_CHECK(); std::vector inImage(256); std::vector outImage(256); @@ -190,7 +186,6 @@ DEFINE_SIMD_TEST(packed_uint8_to_float_test) DEFINE_SIMD_TEST(packed_uint10_to_f32_test) { - AVX2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; std::vector inImage(maxValue); std::vector outImage(maxValue); @@ -217,7 +212,6 @@ DEFINE_SIMD_TEST(packed_uint10_to_f32_test) 
DEFINE_SIMD_TEST(packed_uint12_to_f32_test) { - AVX2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; std::vector inImage(maxValue); std::vector outImage(maxValue); @@ -244,7 +238,6 @@ DEFINE_SIMD_TEST(packed_uint12_to_f32_test) DEFINE_SIMD_TEST(packed_uint16_to_f32_test) { - AVX2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; std::vector inImage(maxValue); std::vector outImage(maxValue); @@ -273,7 +266,6 @@ DEFINE_SIMD_TEST(packed_uint16_to_f32_test) DEFINE_SIMD_TEST(packed_f16_to_f32_test) { - AVX2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; std::vector inImage(maxValue); std::vector outImage(maxValue); @@ -303,7 +295,6 @@ DEFINE_SIMD_TEST(packed_f16_to_f32_test) DEFINE_SIMD_TEST(packed_nan_inf_test) { - AVX2_CHECK(); const float qnan = std::numeric_limits::quiet_NaN(); const float inf = std::numeric_limits::infinity(); const float maxf = std::numeric_limits::max(); @@ -417,7 +408,6 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) DEFINE_SIMD_TEST(packed_all_test) { - AVX2_CHECK(); const std::vector< OCIO::BitDepth> formats = { OCIO::BIT_DEPTH_UINT8, OCIO::BIT_DEPTH_UINT10, diff --git a/tests/cpu/AVX_tests.cpp b/tests/cpu/AVX_tests.cpp index dec835945a..60bd008373 100644 --- a/tests/cpu/AVX_tests.cpp +++ b/tests/cpu/AVX_tests.cpp @@ -17,9 +17,6 @@ namespace OCIO = OCIO_NAMESPACE; -#define AVX_CHECK() \ - if (!OCIO::CPUInfo::instance().hasAVX()) throw SkipException() - #define HAS_F16C() \ OCIO::CPUInfo::instance().hasF16C() @@ -164,7 +161,6 @@ void testConvert_InBitDepth(OCIO::BitDepth outBD) DEFINE_SIMD_TEST(packed_uint8_to_float_test) { - AVX_CHECK(); std::vector inImage(256); std::vector outImage(256); @@ -190,7 +186,6 @@ DEFINE_SIMD_TEST(packed_uint8_to_float_test) DEFINE_SIMD_TEST(packed_uint10_to_f32_test) { - AVX_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; std::vector inImage(maxValue); std::vector outImage(maxValue); @@ -217,7 +212,6 @@ DEFINE_SIMD_TEST(packed_uint10_to_f32_test) 
DEFINE_SIMD_TEST(packed_uint12_to_f32_test) { - AVX_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; std::vector inImage(maxValue); std::vector outImage(maxValue); @@ -244,7 +238,6 @@ DEFINE_SIMD_TEST(packed_uint12_to_f32_test) DEFINE_SIMD_TEST(packed_uint16_to_f32_test) { - AVX_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; std::vector inImage(maxValue); std::vector outImage(maxValue); @@ -273,7 +266,6 @@ DEFINE_SIMD_TEST(packed_uint16_to_f32_test) DEFINE_SIMD_TEST(packed_f16_to_f32_test) { - AVX_CHECK(); if(!HAS_F16C()) throw SkipException(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; @@ -305,7 +297,6 @@ DEFINE_SIMD_TEST(packed_f16_to_f32_test) DEFINE_SIMD_TEST(packed_nan_inf_test) { - AVX_CHECK(); const float qnan = std::numeric_limits::quiet_NaN(); const float inf = std::numeric_limits::infinity(); const float maxf = std::numeric_limits::max(); @@ -418,7 +409,6 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) DEFINE_SIMD_TEST(packed_all_test) { - AVX_CHECK(); const std::vector< OCIO::BitDepth> formats = { OCIO::BIT_DEPTH_UINT8, OCIO::BIT_DEPTH_UINT10, diff --git a/tests/cpu/SSE2_tests.cpp b/tests/cpu/SSE2_tests.cpp index 85ad4d4cf8..7847c2f1a2 100644 --- a/tests/cpu/SSE2_tests.cpp +++ b/tests/cpu/SSE2_tests.cpp @@ -16,9 +16,6 @@ namespace OCIO = OCIO_NAMESPACE; -#define SSE2_CHECK() \ - if (!OCIO::CPUInfo::instance().hasSSE2()) throw SkipException() - #define HAS_F16C() \ OCIO::CPUInfo::instance().hasF16C() @@ -163,7 +160,6 @@ void testConvert_InBitDepth(OCIO::BitDepth outBD) DEFINE_SIMD_TEST(packed_uint8_to_float_test) { - SSE2_CHECK(); std::vector inImage(256); std::vector outImage(256); @@ -192,7 +188,6 @@ DEFINE_SIMD_TEST(packed_uint8_to_float_test) DEFINE_SIMD_TEST(packed_uint10_to_f32_test) { - SSE2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; std::vector inImage(maxValue); std::vector outImage(maxValue); @@ -219,7 +214,6 @@ DEFINE_SIMD_TEST(packed_uint10_to_f32_test) DEFINE_SIMD_TEST(packed_uint12_to_f32_test) { 
- SSE2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; std::vector inImage(maxValue); std::vector outImage(maxValue); @@ -246,7 +240,6 @@ DEFINE_SIMD_TEST(packed_uint12_to_f32_test) DEFINE_SIMD_TEST(packed_uint16_to_f32_test) { - SSE2_CHECK(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; std::vector inImage(maxValue); std::vector outImage(maxValue); @@ -275,7 +268,6 @@ DEFINE_SIMD_TEST(packed_uint16_to_f32_test) DEFINE_SIMD_TEST(packed_f16_to_f32_test) { - SSE2_CHECK(); if(!HAS_F16C()) throw SkipException(); size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; @@ -308,7 +300,6 @@ DEFINE_SIMD_TEST(packed_f16_to_f32_test) DEFINE_SIMD_TEST(packed_nan_inf_test) { - SSE2_CHECK(); const float qnan = std::numeric_limits::quiet_NaN(); const float inf = std::numeric_limits::infinity(); const float maxf = std::numeric_limits::max(); @@ -436,7 +427,6 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) DEFINE_SIMD_TEST(packed_all_test) { - SSE2_CHECK(); const std::vector< OCIO::BitDepth> formats = { OCIO::BIT_DEPTH_UINT8, OCIO::BIT_DEPTH_UINT10, From e3c5ef096425bc23acc849b748e5f7d4114c399a Mon Sep 17 00:00:00 2001 From: Mark Reid Date: Sun, 27 Aug 2023 23:02:11 -0700 Subject: [PATCH 13/22] use software implementations of f16c intrinsics for SSE2 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- share/cmake/utils/CheckSupportX86SIMD.cmake | 1 - src/OpenColorIO/SSE2.h | 108 ++++++++++++++++-- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp | 6 +- tests/cpu/CMakeLists.txt | 3 - tests/cpu/SIMD_tests.cpp | 4 +- tests/cpu/SSE2_tests.cpp | 47 ++------ 6 files changed, 110 insertions(+), 59 deletions(-) diff --git a/share/cmake/utils/CheckSupportX86SIMD.cmake b/share/cmake/utils/CheckSupportX86SIMD.cmake index f3d91cd5b1..3d4b4f19ba 100644 --- a/share/cmake/utils/CheckSupportX86SIMD.cmake +++ b/share/cmake/utils/CheckSupportX86SIMD.cmake @@ -92,7 +92,6 @@ endif() if(${OCIO_USE_F16C}) if(NOT MSVC) - 
list(APPEND OCIO_SSE2_ARGS -mf16c) list(APPEND OCIO_AVX_ARGS -mf16c) list(APPEND OCIO_AVX2_ARGS -mf16c) endif() diff --git a/src/OpenColorIO/SSE2.h b/src/OpenColorIO/SSE2.h index 441703fd3f..1ee21c7f7b 100644 --- a/src/OpenColorIO/SSE2.h +++ b/src/OpenColorIO/SSE2.h @@ -76,6 +76,94 @@ static inline void sse2RGBATranspose_4x4(__m128 row0, __m128 row1, __m128 row2, out_a = _mm_movehl_ps(tmp3, tmp1); } +static inline __m128i sse2_blendv(__m128i a, __m128i b, __m128i mask) +{ + return _mm_xor_si128(_mm_and_si128(_mm_xor_si128(a, b), mask), a); +} + +static inline __m128i sse2_cvtps_ph(__m128 a) +{ + __m128i x = _mm_castps_si128(a); + + __m128i x_sgn = _mm_and_si128(x, _mm_set1_epi32(0x80000000u)); + __m128i x_exp = _mm_and_si128(x, _mm_set1_epi32(0x7f800000u)); + + __m128 magic1 = _mm_castsi128_ps(_mm_set1_epi32(0x77800000u)); // 0x1.0p+112f + __m128 magic2 = _mm_castsi128_ps(_mm_set1_epi32(0x08800000u)); // 0x1.0p-110f + + // sse2 doesn't have _mm_max_epu32, but _mm_max_ps works + __m128i exp_max = _mm_set1_epi32(0x38800000u); + x_exp = _mm_castps_si128(_mm_max_ps(_mm_castsi128_ps(x_exp), _mm_castsi128_ps(exp_max))); // max(e, -14) + x_exp = _mm_add_epi32(x_exp, _mm_set1_epi32(15u << 23)); // e += 15 + x = _mm_and_si128(x, _mm_set1_epi32(0x7fffffffu)); // Discard sign + + __m128 f = _mm_castsi128_ps(x); + __m128 magicf = _mm_castsi128_ps(x_exp); + + // If 15 < e then inf, otherwise e += 2 + f = _mm_mul_ps(_mm_mul_ps(f, magic1), magic2); + f = _mm_add_ps(f, magicf); + + __m128i u = _mm_castps_si128(f); + + __m128i h_exp = _mm_and_si128(_mm_srli_epi32(u, 13), _mm_set1_epi32(0x7c00u)); + __m128i h_sig = _mm_and_si128(u, _mm_set1_epi32(0x0fffu)); + + // blend in nan values only if present + __m128i nan_mask = _mm_cmpgt_epi32(x, _mm_set1_epi32(0x7f800000u)); + if (_mm_movemask_epi8(nan_mask)) { + __m128i nan = _mm_and_si128(_mm_srli_epi32(x, 13), _mm_set1_epi32(0x03FFu)); + nan = _mm_or_si128(_mm_set1_epi32(0x0200u), nan); + h_sig = sse2_blendv(h_sig, nan, nan_mask); + } + 
+ __m128i ph = _mm_add_epi32(_mm_srli_epi32(x_sgn, 16),_mm_add_epi32(h_exp, h_sig)); + + // pack u16 values into lower 64 bits + ph = _mm_shufflehi_epi16(ph, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)); + ph = _mm_shufflelo_epi16(ph, (1 << 6 | 1 << 4 | 2 << 2 | 0 << 0)); + return _mm_shuffle_epi32(ph, (3 << 6 | 3 << 4 | 2 << 2 | 0 << 0)); +} + +static inline __m128 sse2_cvtph_ps(__m128i a) +{ + __m128 magic = _mm_castsi128_ps(_mm_set1_epi32((254 - 15) << 23)); + __m128 was_infnan = _mm_castsi128_ps(_mm_set1_epi32((127 + 16) << 23)); + __m128 sign; + __m128 o; + + // the values to unpack are in the lower 64 bits + // | 0 1 | 2 3 | 4 5 | 6 7 | 8 9 | 10 11 | 12 13 | 14 15 + // | 0 1 | 0 1 | 2 3 | 2 3 | 4 5 | 4 5 | 6 7 | 6 7 + a = _mm_unpacklo_epi16(a, a); + + // extract sign + sign = _mm_castsi128_ps(_mm_slli_epi32(_mm_and_si128(a, _mm_set1_epi32(0x8000)), 16)); + + // extract exponent/mantissa bits + o = _mm_castsi128_ps(_mm_slli_epi32(_mm_and_si128(a, _mm_set1_epi32(0x7fff)), 13)); + + // magic multiply + o = _mm_mul_ps(o, magic); + + // blend in inf/nan values only if present + __m128i mask = _mm_castps_si128(_mm_cmpge_ps(o, was_infnan)); + if (_mm_movemask_epi8(mask)) { + __m128i ou = _mm_castps_si128(o); + __m128i ou_nan = _mm_or_si128(ou, _mm_set1_epi32( 0x01FF << 22)); + __m128i ou_inf = _mm_or_si128(ou, _mm_set1_epi32( 0x00FF << 23)); + + // blend in nans + ou = sse2_blendv(ou, ou_nan, mask); + + // blend in infinities + mask = _mm_cmpeq_epi32( _mm_castps_si128(o), _mm_castps_si128(was_infnan)); + o = _mm_castsi128_ps(sse2_blendv(ou, ou_inf, mask)); + } + + return _mm_or_ps(o, sign); +} + // Note Packing functions perform no 0.0 - 1.0 normalization // but perform 0 - max value clamping for integer formats template struct SSE2RGBAPack {}; @@ -194,8 +282,6 @@ struct SSE2RGBAPack } }; -#if OCIO_USE_F16C - template <> struct SSE2RGBAPack { @@ -204,10 +290,10 @@ struct SSE2RGBAPack __m128i rgba_00_01 = _mm_loadu_si128((const __m128i*)(in + 0)); __m128i rgba_02_03 = 
_mm_loadu_si128((const __m128i*)(in + 8)); - __m128 rgba0 = _mm_cvtph_ps(rgba_00_01); - __m128 rgba1 = _mm_cvtph_ps(_mm_shuffle_epi32(rgba_00_01, _MM_SHUFFLE(1,0,3,2))); - __m128 rgba2 = _mm_cvtph_ps(rgba_02_03); - __m128 rgba3 = _mm_cvtph_ps(_mm_shuffle_epi32(rgba_02_03, _MM_SHUFFLE(1,0,3,2))); + __m128 rgba0 = sse2_cvtph_ps(rgba_00_01); + __m128 rgba1 = sse2_cvtph_ps(_mm_shuffle_epi32(rgba_00_01, _MM_SHUFFLE(1,0,3,2))); + __m128 rgba2 = sse2_cvtph_ps(rgba_02_03); + __m128 rgba3 = sse2_cvtph_ps(_mm_shuffle_epi32(rgba_02_03, _MM_SHUFFLE(1,0,3,2))); sse2RGBATranspose_4x4(rgba0, rgba1, rgba2, rgba3, r, g, b, a); } @@ -219,10 +305,10 @@ struct SSE2RGBAPack sse2RGBATranspose_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3); - __m128i rgba00_01 = _mm_cvtps_ph(rgba0, 0); - __m128i rgba02_03 = _mm_cvtps_ph(rgba1, 0); - __m128i rgba04_05 = _mm_cvtps_ph(rgba2, 0); - __m128i rgba06_07 = _mm_cvtps_ph(rgba3, 0); + __m128i rgba00_01 = sse2_cvtps_ph(rgba0); + __m128i rgba02_03 = sse2_cvtps_ph(rgba1); + __m128i rgba04_05 = sse2_cvtps_ph(rgba2); + __m128i rgba06_07 = sse2_cvtps_ph(rgba3); rgba = _mm_xor_si128(rgba00_01, _mm_shuffle_epi32(rgba02_03, _MM_SHUFFLE(1,0,3,2))); _mm_storeu_si128((__m128i*)(out+0), rgba); @@ -232,8 +318,6 @@ struct SSE2RGBAPack } }; -#endif - template <> struct SSE2RGBAPack { diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp index d542139ea2..a144649b7c 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp @@ -151,11 +151,7 @@ inline Lut1DOpCPUApplyFunc * GetConvertInBitDepth(BitDepth outBD) case BIT_DEPTH_UINT16: return linear1D; case BIT_DEPTH_F16: -#if OCIO_USE_F16C - if (CPUInfo::instance().hasF16C()) - return linear1D; -#endif - break; + return linear1D; case BIT_DEPTH_F32: return linear1D; case BIT_DEPTH_UINT14: diff --git a/tests/cpu/CMakeLists.txt b/tests/cpu/CMakeLists.txt index 8a793f3541..6e3dc522e7 100755 --- a/tests/cpu/CMakeLists.txt 
+++ b/tests/cpu/CMakeLists.txt @@ -77,9 +77,6 @@ function(add_ocio_test NAME SOURCES PRIVATE_INCLUDES) if(OCIO_USE_SSE2) add_test(NAME ${TEST_NAME}_sse2 COMMAND ${TEST_BINARY} --sse2) - if(OCIO_USE_F16C) - add_test(NAME ${TEST_NAME}_sse2+f16c COMMAND ${TEST_BINARY} --sse2 --f16c) - endif() endif() if(OCIO_USE_AVX) diff --git a/tests/cpu/SIMD_tests.cpp b/tests/cpu/SIMD_tests.cpp index ad44abf6b5..3cd5b76bc4 100644 --- a/tests/cpu/SIMD_tests.cpp +++ b/tests/cpu/SIMD_tests.cpp @@ -20,9 +20,7 @@ OCIO_ADD_TEST_SSE2(packed_uint8_to_float_test) OCIO_ADD_TEST_SSE2(packed_uint10_to_f32_test) OCIO_ADD_TEST_SSE2(packed_uint12_to_f32_test) OCIO_ADD_TEST_SSE2(packed_uint16_to_f32_test) -#if OCIO_USE_F16C - OCIO_ADD_TEST_SSE2(packed_f16_to_f32_test) -#endif +OCIO_ADD_TEST_SSE2(packed_f16_to_f32_test) OCIO_ADD_TEST_SSE2(packed_nan_inf_test) OCIO_ADD_TEST_SSE2(packed_all_test) diff --git a/tests/cpu/SSE2_tests.cpp b/tests/cpu/SSE2_tests.cpp index 7847c2f1a2..3431d04b35 100644 --- a/tests/cpu/SSE2_tests.cpp +++ b/tests/cpu/SSE2_tests.cpp @@ -16,9 +16,6 @@ namespace OCIO = OCIO_NAMESPACE; -#define HAS_F16C() \ - OCIO::CPUInfo::instance().hasF16C() - #define DEFINE_SIMD_TEST(name) \ void sse2_test_##name() @@ -70,16 +67,12 @@ float scale_unsigned(unsigned i) return static_cast(i) * 1.0f/65535.0f; } -#if OCIO_USE_F16C - template <> half scale_unsigned(unsigned i) { return static_cast(1.0f/65535.0f * static_cast(i)); } -#endif - template void testConvert_OutBitDepth() { @@ -140,10 +133,7 @@ void testConvert_InBitDepth(OCIO::BitDepth outBD) case OCIO::BIT_DEPTH_UINT16: return testConvert_OutBitDepth(); case OCIO::BIT_DEPTH_F16: -#if OCIO_USE_F16C - if (HAS_F16C()) - return testConvert_OutBitDepth(); -#endif + return testConvert_OutBitDepth(); break; case OCIO::BIT_DEPTH_F32: return testConvert_OutBitDepth(); @@ -264,12 +254,8 @@ DEFINE_SIMD_TEST(packed_uint16_to_f32_test) } } -#if OCIO_USE_F16C - DEFINE_SIMD_TEST(packed_f16_to_f32_test) { - if(!HAS_F16C()) throw SkipException(); - 
size_t maxValue = OCIO::BitDepthInfo::maxValue + 1; std::vector inImage(maxValue); std::vector outImage(maxValue); @@ -295,8 +281,6 @@ DEFINE_SIMD_TEST(packed_f16_to_f32_test) } } -#endif - DEFINE_SIMD_TEST(packed_nan_inf_test) { @@ -317,24 +301,20 @@ DEFINE_SIMD_TEST(packed_nan_inf_test) -0.0f, -1.0f, - 2.0f, -5.0f, 100000.0f, 200000.0f, -10.0f, -2000.0f, 65535.0f, 65537.0f, -65536.0f, -65537.0f }; -#if OCIO_USE_F16C - if(HAS_F16C()) + + for (unsigned i = 0; i < 32; i+= 16) { - for (unsigned i = 0; i < 32; i+= 16) - { - OCIO::SSE2RGBAPack::Load(&pixels[i], r, g, b, a); - OCIO::SSE2RGBAPack::Store(&outImageHalf[i], r, g, b, a); - } + OCIO::SSE2RGBAPack::Load(&pixels[i], r, g, b, a); + OCIO::SSE2RGBAPack::Store(&outImageHalf[i], r, g, b, a); + } - for (unsigned i = 0; i < outImageHalf.size(); i++) - { - OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer((half)pixels[i], (float)outImageHalf[i], 0, false), - GetErrorMessage((half)pixels[i], (float)outImageHalf[i], - OCIO::BIT_DEPTH_F32, OCIO::BIT_DEPTH_F16)); - } + for (unsigned i = 0; i < outImageHalf.size(); i++) + { + OCIO_CHECK_ASSERT_MESSAGE(!OCIO::FloatsDiffer((half)pixels[i], (float)outImageHalf[i], 0, false), + GetErrorMessage((half)pixels[i], (float)outImageHalf[i], + OCIO::BIT_DEPTH_F32, OCIO::BIT_DEPTH_F16)); } -#endif const uint8_t resultU8[32] = { 0, 0, 0, 0, 255, 0, 3, 0, @@ -457,10 +437,7 @@ DEFINE_SIMD_TEST(packed_all_test) testConvert_InBitDepth(outBD); break; case OCIO::BIT_DEPTH_F16: -#if OCIO_USE_F16C - if(HAS_F16C()) - testConvert_InBitDepth(outBD); -#endif + testConvert_InBitDepth(outBD); break; case OCIO::BIT_DEPTH_F32: testConvert_InBitDepth(outBD); From dfcde1881652e7f30194b2d6f2a247e9bd6c36de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Mon, 28 Aug 2023 12:54:19 -0400 Subject: [PATCH 14/22] Added preprocessor checks for ARM as it is needed for universal build on APPLE platform MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Signed-off-by: Cédrik Fuoco --- src/OpenColorIO/AVX.h | 3 ++- src/OpenColorIO/AVX2.h | 4 +++- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp | 7 +++++-- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp | 7 +++++-- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp | 3 +++ src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp | 4 ++-- src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp | 4 ++-- tests/cpu/AVX2_tests.cpp | 3 ++- tests/cpu/AVX_tests.cpp | 3 ++- 9 files changed, 26 insertions(+), 12 deletions(-) diff --git a/src/OpenColorIO/AVX.h b/src/OpenColorIO/AVX.h index 16809bc2d0..74f651be4f 100644 --- a/src/OpenColorIO/AVX.h +++ b/src/OpenColorIO/AVX.h @@ -6,7 +6,8 @@ #define INCLUDED_OCIO_AVX_H #include "CPUInfo.h" -#ifdef OCIO_USE_AVX +// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. +#if OCIO_USE_AVX && !defined(__aarch64__) #include #include diff --git a/src/OpenColorIO/AVX2.h b/src/OpenColorIO/AVX2.h index 404fc046c1..4652f2debf 100644 --- a/src/OpenColorIO/AVX2.h +++ b/src/OpenColorIO/AVX2.h @@ -6,7 +6,8 @@ #define INCLUDED_OCIO_AVX2_H #include "CPUInfo.h" -#ifdef OCIO_USE_AVX2 +// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. +#if OCIO_USE_AVX2 && !defined(__aarch64__) #include #include @@ -39,6 +40,7 @@ inline __m256 avx2_clamp(__m256 value, const __m256& maxValue) } inline void avx2RGBATranspose_4x4_4x4(__m256 row0, __m256 row1, __m256 row2, __m256 row3, + __m256 &out_r, __m256 &out_g, __m256 &out_b, __m256 &out_a ) { // the rgba transpose result will look this diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp index 49caeb6dbc..143d8fcae2 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp @@ -1,6 +1,9 @@ -#include "Lut1DOpCPU_AVX.h" +// SPDX-License-Identifier: BSD-3-Clause +// Copyright Contributors to the OpenColorIO Project. 
-#if OCIO_USE_AVX +#include "Lut1DOpCPU_AVX.h" +// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. +#if OCIO_USE_AVX && !defined(__aarch64__) #include #include diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp index a4565a0335..17ca33f833 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp @@ -1,6 +1,9 @@ -#include "Lut1DOpCPU_AVX2.h" +// SPDX-License-Identifier: BSD-3-Clause +// Copyright Contributors to the OpenColorIO Project. -#if OCIO_USE_AVX2 +#include "Lut1DOpCPU_AVX2.h" +// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. +#if OCIO_USE_AVX2 && !defined(__aarch64__) #include #include diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp index a144649b7c..d7ecfe49c3 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_SSE2.cpp @@ -1,3 +1,6 @@ +// SPDX-License-Identifier: BSD-3-Clause +// Copyright Contributors to the OpenColorIO Project. + #include "Lut1DOpCPU_SSE2.h" #if OCIO_USE_SSE2 diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp index 9a7ad75e6c..1b69b689a9 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp @@ -2,8 +2,8 @@ // Copyright Contributors to the OpenColorIO Project. #include "Lut3DOpCPU_AVX.h" - -#if OCIO_USE_AVX +// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. 
+#if OCIO_USE_AVX && !defined(__aarch64__) #include "AVX.h" diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp index db77f26814..5fc829ebc8 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp @@ -2,8 +2,8 @@ // Copyright Contributors to the OpenColorIO Project. #include "Lut3DOpCPU_AVX2.h" - -#if OCIO_USE_AVX2 +// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. +#if OCIO_USE_AVX2 && !defined(__aarch64__) #include #include diff --git a/tests/cpu/AVX2_tests.cpp b/tests/cpu/AVX2_tests.cpp index e711661841..36f01c59a7 100644 --- a/tests/cpu/AVX2_tests.cpp +++ b/tests/cpu/AVX2_tests.cpp @@ -3,7 +3,8 @@ #include "CPUInfo.h" -#if OCIO_USE_AVX2 +// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. +#if OCIO_USE_AVX2 && !defined(__aarch64__) #include diff --git a/tests/cpu/AVX_tests.cpp b/tests/cpu/AVX_tests.cpp index 60bd008373..b605d38958 100644 --- a/tests/cpu/AVX_tests.cpp +++ b/tests/cpu/AVX_tests.cpp @@ -3,7 +3,8 @@ #include "CPUInfo.h" -#if OCIO_USE_AVX +// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. 
+#if OCIO_USE_AVX && !defined(__aarch64__) #include From 5bb952a535d5a9022bc5c4aea4b4d4de90fcf183 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Mon, 28 Aug 2023 14:32:46 -0400 Subject: [PATCH 15/22] Adding missing checks for "not arm64" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp | 8 ++++---- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.h | 2 +- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.h | 2 +- src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp | 4 ++-- src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.h | 2 +- src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.h | 2 +- tests/cpu/SIMD_tests.cpp | 4 ++-- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp index 7dfaab8daf..4261530c10 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp @@ -285,14 +285,14 @@ BaseLut1DRenderer::BaseLut1DRenderer(ConstLut1DOpDataRcPtr & lut) } #endif -#if OCIO_USE_AVX +#if OCIO_USE_AVX && !defined(__aarch64__) if (CPUInfo::instance().hasAVX()) { m_applyLutFunc = AVXGetLut1DApplyFunc(inBD, outBD); } #endif -#if OCIO_USE_AVX2 +#if OCIO_USE_AVX2 && !defined(__aarch64__) if (CPUInfo::instance().hasAVX2() && !CPUInfo::instance().AVX2SlowGather()) { m_applyLutFunc = AVX2GetLut1DApplyFunc(inBD, outBD); @@ -316,14 +316,14 @@ BaseLut1DRenderer::BaseLut1DRenderer(ConstLut1DOpDataRcPtr & lut, B } #endif -#if OCIO_USE_AVX +#if OCIO_USE_AVX && !defined(__aarch64__) if (CPUInfo::instance().hasAVX() && !CPUInfo::instance().AVXSlow()) { m_applyLutFunc = AVXGetLut1DApplyFunc(inBD, m_outBitDepth); } #endif -#if OCIO_USE_AVX2 +#if OCIO_USE_AVX2 && !defined(__aarch64__) if (CPUInfo::instance().hasAVX2() && !CPUInfo::instance().AVX2SlowGather()) { m_applyLutFunc = AVX2GetLut1DApplyFunc(inBD, m_outBitDepth); diff --git 
a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.h b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.h index c828169fd2..917d323b51 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.h +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.h @@ -10,7 +10,7 @@ typedef void (Lut1DOpCPUApplyFunc)(const float *, const float *, const float *, int, const void *, void *, long); -#if OCIO_USE_AVX +#if OCIO_USE_AVX && !defined(__aarch64__) namespace OCIO_NAMESPACE { diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.h b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.h index a3e63d67eb..1c8ace1b3e 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.h +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.h @@ -10,7 +10,7 @@ typedef void (Lut1DOpCPUApplyFunc)(const float *, const float *, const float *, int, const void *, void *, long); -#if OCIO_USE_AVX2 +#if OCIO_USE_AVX2 && !defined(__aarch64__) namespace OCIO_NAMESPACE { diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp index 80fe753c0c..5c29747456 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp @@ -392,14 +392,14 @@ Lut3DTetrahedralRenderer::Lut3DTetrahedralRenderer(ConstLut3DOpDataRcPtr & lut) } #endif - #if OCIO_USE_AVX + #if OCIO_USE_AVX && !defined(__aarch64__) if (CPUInfo::instance().hasAVX() && !CPUInfo::instance().AVXSlow()) { m_applyLutFunc = applyTetrahedralAVX; } #endif - #if OCIO_USE_AVX2 + #if OCIO_USE_AVX2 && !defined(__aarch64__) if (CPUInfo::instance().hasAVX2() && !CPUInfo::instance().AVX2SlowGather()) { m_applyLutFunc = applyTetrahedralAVX2; diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.h b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.h index abed452ae4..a7a1498917 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.h +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.h @@ -8,7 +8,7 @@ #include "CPUInfo.h" -#if OCIO_USE_AVX +#if OCIO_USE_AVX && !defined(__aarch64__) namespace OCIO_NAMESPACE { diff --git 
a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.h b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.h index 6a8f72395f..6d5d18647e 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.h +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.h @@ -8,7 +8,7 @@ #include "CPUInfo.h" -#if OCIO_USE_AVX2 +#if OCIO_USE_AVX2 && !defined(__aarch64__) namespace OCIO_NAMESPACE { diff --git a/tests/cpu/SIMD_tests.cpp b/tests/cpu/SIMD_tests.cpp index 3cd5b76bc4..eb6508d3f5 100644 --- a/tests/cpu/SIMD_tests.cpp +++ b/tests/cpu/SIMD_tests.cpp @@ -26,7 +26,7 @@ OCIO_ADD_TEST_SSE2(packed_all_test) #endif -#if OCIO_USE_AVX +#if OCIO_USE_AVX && !defined(__aarch64__) #define AVX_CHECK() \ if (!OCIO::CPUInfo::instance().hasAVX()) throw SkipException() @@ -51,7 +51,7 @@ OCIO_ADD_TEST_AVX(packed_all_test) #endif -#if OCIO_USE_AVX2 +#if OCIO_USE_AVX2 && !defined(__aarch64__) #define AVX2_CHECK() \ if (!OCIO::CPUInfo::instance().hasAVX2()) throw SkipException() From 2415a98c84984a2ee6258db7519acb294d10e59d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Tue, 29 Aug 2023 12:57:10 -0400 Subject: [PATCH 16/22] Ease the future maintainability of the new OCIO_USE_xyz by moving the logic into CPUInfoConfig.h.in as well as fixing an issue on ARM when building with OCIO_USE_SSE2NEON=OFF.
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- CMakeLists.txt | 3 +- src/OpenColorIO/AVX.h | 3 +- src/OpenColorIO/AVX2.h | 3 +- src/OpenColorIO/CPUInfo.cpp | 2 +- src/OpenColorIO/CPUInfo.h | 2 +- src/OpenColorIO/CPUInfoConfig.h.in | 56 +++++++++++++++---- src/OpenColorIO/SSE.h | 6 +- src/OpenColorIO/SSE2.h | 6 +- src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp | 8 +-- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp | 3 +- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.h | 2 +- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp | 3 +- src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.h | 2 +- src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp | 4 +- src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp | 3 +- src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.h | 2 +- src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp | 3 +- src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.h | 2 +- tests/cpu/AVX2_tests.cpp | 3 +- tests/cpu/AVX_tests.cpp | 3 +- tests/cpu/SIMD_tests.cpp | 4 +- tests/cpu/UnitTestMain.cpp | 2 +- 22 files changed, 76 insertions(+), 49 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5c7704ef85..8bdfc0cbce 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -198,6 +198,7 @@ if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|X86|x86_64|i386|i686) elseif(APPLE AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64" AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64") # Apple ARM only build. + set(OCIO_ARCH_X86 0) # OCIO translates the SSE instructions into ARM Neon with the library SSE2NEON. set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) @@ -206,10 +207,10 @@ elseif(APPLE AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64" AND "${CMAKE_OSX_AR # Turn F16C off since they are x86_64 instructions. 
set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) else() + set(OCIO_ARCH_X86 0) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE OFF) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) - set(OCIO_ARCH_X86 0) endif() option(OCIO_USE_SSE2 "Specify whether to enable SSE2 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE}) diff --git a/src/OpenColorIO/AVX.h b/src/OpenColorIO/AVX.h index 74f651be4f..6cb2ea5888 100644 --- a/src/OpenColorIO/AVX.h +++ b/src/OpenColorIO/AVX.h @@ -6,8 +6,7 @@ #define INCLUDED_OCIO_AVX_H #include "CPUInfo.h" -// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. -#if OCIO_USE_AVX && !defined(__aarch64__) +#if OCIO_USE_AVX #include #include diff --git a/src/OpenColorIO/AVX2.h b/src/OpenColorIO/AVX2.h index 4652f2debf..3237533bc6 100644 --- a/src/OpenColorIO/AVX2.h +++ b/src/OpenColorIO/AVX2.h @@ -6,8 +6,7 @@ #define INCLUDED_OCIO_AVX2_H #include "CPUInfo.h" -// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. -#if OCIO_USE_AVX2 && !defined(__aarch64__) +#if OCIO_USE_AVX2 #include #include diff --git a/src/OpenColorIO/CPUInfo.cpp b/src/OpenColorIO/CPUInfo.cpp index 0b582a345b..7aae56ad97 100644 --- a/src/OpenColorIO/CPUInfo.cpp +++ b/src/OpenColorIO/CPUInfo.cpp @@ -17,7 +17,7 @@ typedef __int64 int64_t; namespace OCIO_NAMESPACE { -#if !defined(__aarch64__) && defined(OCIO_ARCH_X86) // Intel-based processor or Apple Rosetta x86_64. +#if !defined(__aarch64__) && OCIO_ARCH_X86 // Intel-based processor or Apple Rosetta x86_64. 
namespace { diff --git a/src/OpenColorIO/CPUInfo.h b/src/OpenColorIO/CPUInfo.h index ad493c9b02..288360d7fd 100644 --- a/src/OpenColorIO/CPUInfo.h +++ b/src/OpenColorIO/CPUInfo.h @@ -36,7 +36,7 @@ namespace OCIO_NAMESPACE #define x86_check_flags(cpuext) \ (OCIO_USE_ ## cpuext && ((flags) & X86_CPU_FLAG_ ## cpuext)) -#if !defined(__aarch64__) && defined(OCIO_ARCH_X86) // Intel-based processor or Apple Rosetta x86_64. +#if !defined(__aarch64__) && OCIO_ARCH_X86 // Intel-based processor or Apple Rosetta x86_64. struct CPUInfo { diff --git a/src/OpenColorIO/CPUInfoConfig.h.in b/src/OpenColorIO/CPUInfoConfig.h.in index 68fdb55831..a94a7bb849 100644 --- a/src/OpenColorIO/CPUInfoConfig.h.in +++ b/src/OpenColorIO/CPUInfoConfig.h.in @@ -2,16 +2,50 @@ // Copyright Contributors to the OpenColorIO Project. -#cmakedefine OCIO_ARCH_X86 +#cmakedefine01 OCIO_ARCH_X86 -#cmakedefine01 OCIO_USE_SSE2 -#cmakedefine01 OCIO_USE_SSE3 -#cmakedefine01 OCIO_USE_SSSE3 -#cmakedefine01 OCIO_USE_SSE4 -#cmakedefine01 OCIO_USE_SSE42 -#cmakedefine01 OCIO_USE_AVX -#cmakedefine01 OCIO_USE_AVX2 -#cmakedefine01 OCIO_USE_AVX512 -#cmakedefine01 OCIO_USE_F16C +// Relevant only for arm64 architecture. +#if defined(__aarch64__) + #cmakedefine01 OCIO_USE_SSE2NEON +#endif -#cmakedefine01 OCIO_USE_SSE2NEON \ No newline at end of file +// On the Apple platform, a universal build is created for both x86_64 and arm64 architectures. +// CMake will run only one "configure" step, build for x86_64 and arm64, and then create a +// single binary that includes both architectures. + +// This means that for a universal build, the OCIO_USE_SSEx, OCIO_USE_AVXx, OCIO_USE_F16C, etc. +// flags can be enabled simultaneously. Therefore, we need to check whether we are currently +// building the x86_64 side or the arm64 side. This can be done by checking the OCIO_ARCH_X86 and +// aarch64 defines. + +// Building for x86_64 processor on a non-ARM host architecture +// OR Building on/for an ARM architecture and using SSE2NEON. 
+#if (OCIO_ARCH_X86 && !defined(__aarch64__)) || (defined(__aarch64__) && OCIO_USE_SSE2NEON) + #cmakedefine01 OCIO_USE_SSE2 + #cmakedefine01 OCIO_USE_SSE3 + #cmakedefine01 OCIO_USE_SSSE3 + #cmakedefine01 OCIO_USE_SSE4 + #cmakedefine01 OCIO_USE_SSE42 +#else // non-intel based architecture or ARM architecture without OCIO_USE_SSE2NEON=ON + // Overwrite the values from cmake as these will not be supported on a non-intel architecture + // or arm64 with sse2neon disabled. + #define OCIO_USE_SSE2 0 + #define OCIO_USE_SSE3 0 + #define OCIO_USE_SSSE3 0 + #define OCIO_USE_SSE4 0 + #define OCIO_USE_SSE42 0 +#endif + +// Building for x86_64 processor on a non-ARM host architecture +#if OCIO_ARCH_X86 && !defined(__aarch64__) + #cmakedefine01 OCIO_USE_AVX + #cmakedefine01 OCIO_USE_AVX2 + #cmakedefine01 OCIO_USE_AVX512 + #cmakedefine01 OCIO_USE_F16C +#else // non-intel based architecture + // Overwrite the values from cmake as these will not be supported on a non-intel architecture. + #define OCIO_USE_AVX 0 + #define OCIO_USE_AVX2 0 + #define OCIO_USE_AVX512 0 + #define OCIO_USE_F16C 0 +#endif \ No newline at end of file diff --git a/src/OpenColorIO/SSE.h b/src/OpenColorIO/SSE.h index 049c8ad20b..2494698c57 100644 --- a/src/OpenColorIO/SSE.h +++ b/src/OpenColorIO/SSE.h @@ -6,7 +6,7 @@ #define INCLUDED_OCIO_SSE_H #include "CPUInfoConfig.h" -#if OCIO_USE_SSE2 || OCIO_USE_SSE2NEON +#if OCIO_USE_SSE2 // Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). #if !defined(__aarch64__) @@ -31,7 +31,7 @@ namespace OCIO_NAMESPACE // it is redefining two of the functions from sse2neon. #if defined(__aarch64__) - #if defined(OCIO_USE_SSE2NEON) + #if OCIO_USE_SSE2NEON // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior. @@ -39,7 +39,7 @@ namespace OCIO_NAMESPACE // a simple (a>b) ? a:b. 
OCIO sometimes uses this behavior to filter out a possible NaN in the // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the // filtering behavior. The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as - // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument + // the Intel _mm_max_ps / _mm_min_ps since they always return the non-NaN argument // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in // the first argument continues to be filtered out. static inline __m128 _mm_max_ps(__m128 a, __m128 b) diff --git a/src/OpenColorIO/SSE2.h b/src/OpenColorIO/SSE2.h index 1ee21c7f7b..2527ff084d 100644 --- a/src/OpenColorIO/SSE2.h +++ b/src/OpenColorIO/SSE2.h @@ -6,7 +6,7 @@ #define INCLUDED_OCIO_SSE2_H #include "CPUInfo.h" -#if OCIO_USE_SSE2 || OCIO_USE_SSE2NEON +#if OCIO_USE_SSE2 // Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM). #if !defined(__aarch64__) @@ -33,7 +33,7 @@ namespace OCIO_NAMESPACE // it is redefining two of the functions from sse2neon. #if defined(__aarch64__) - #if defined(OCIO_USE_SSE2NEON) + #if OCIO_USE_SSE2NEON // Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to // NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior. @@ -41,7 +41,7 @@ namespace OCIO_NAMESPACE // a simple (a>b) ? a:b. OCIO sometimes uses this behavior to filter out a possible NaN in the // first argument. The vmaxq/vminq will return a NaN if either input is a NaN, which omits the // filtering behavior. 
The vmaxnmq/vminnmq (similar to std::fmax/fmin) are not quite the same as - // the Intel _mm_max_ps / _mm_min_ps since they always returns the non-NaN argument + // the Intel _mm_max_ps / _mm_min_ps since they always return the non-NaN argument // (for quiet NaNs, signaling NaNs always get returned), but that's fine for OCIO since a NaN in // the first argument continues to be filtered out. static inline __m128 _mm_max_ps(__m128 a, __m128 b) diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp index 4261530c10..7dfaab8daf 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU.cpp @@ -285,14 +285,14 @@ BaseLut1DRenderer::BaseLut1DRenderer(ConstLut1DOpDataRcPtr & lut) } #endif -#if OCIO_USE_AVX && !defined(__aarch64__) +#if OCIO_USE_AVX if (CPUInfo::instance().hasAVX()) { m_applyLutFunc = AVXGetLut1DApplyFunc(inBD, outBD); } #endif -#if OCIO_USE_AVX2 && !defined(__aarch64__) +#if OCIO_USE_AVX2 if (CPUInfo::instance().hasAVX2() && !CPUInfo::instance().AVX2SlowGather()) { m_applyLutFunc = AVX2GetLut1DApplyFunc(inBD, outBD); @@ -316,14 +316,14 @@ BaseLut1DRenderer::BaseLut1DRenderer(ConstLut1DOpDataRcPtr & lut, B } #endif -#if OCIO_USE_AVX && !defined(__aarch64__) +#if OCIO_USE_AVX if (CPUInfo::instance().hasAVX() && !CPUInfo::instance().AVXSlow()) { m_applyLutFunc = AVXGetLut1DApplyFunc(inBD, m_outBitDepth); } #endif -#if OCIO_USE_AVX2 && !defined(__aarch64__) +#if OCIO_USE_AVX2 if (CPUInfo::instance().hasAVX2() && !CPUInfo::instance().AVX2SlowGather()) { m_applyLutFunc = AVX2GetLut1DApplyFunc(inBD, m_outBitDepth); diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp index 143d8fcae2..e0d1648b22 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.cpp @@ -2,8 +2,7 @@ // Copyright Contributors to the OpenColorIO Project. 
#include "Lut1DOpCPU_AVX.h" -// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. -#if OCIO_USE_AVX && !defined(__aarch64__) +#if OCIO_USE_AVX #include #include diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.h b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.h index 917d323b51..c828169fd2 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.h +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX.h @@ -10,7 +10,7 @@ typedef void (Lut1DOpCPUApplyFunc)(const float *, const float *, const float *, int, const void *, void *, long); -#if OCIO_USE_AVX && !defined(__aarch64__) +#if OCIO_USE_AVX namespace OCIO_NAMESPACE { diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp index 17ca33f833..32e59ff679 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.cpp @@ -2,8 +2,7 @@ // Copyright Contributors to the OpenColorIO Project. #include "Lut1DOpCPU_AVX2.h" -// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. 
-#if OCIO_USE_AVX2 && !defined(__aarch64__) +#if OCIO_USE_AVX2 #include #include diff --git a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.h b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.h index 1c8ace1b3e..a3e63d67eb 100644 --- a/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.h +++ b/src/OpenColorIO/ops/lut1d/Lut1DOpCPU_AVX2.h @@ -10,7 +10,7 @@ typedef void (Lut1DOpCPUApplyFunc)(const float *, const float *, const float *, int, const void *, void *, long); -#if OCIO_USE_AVX2 && !defined(__aarch64__) +#if OCIO_USE_AVX2 namespace OCIO_NAMESPACE { diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp index 5c29747456..80fe753c0c 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU.cpp @@ -392,14 +392,14 @@ Lut3DTetrahedralRenderer::Lut3DTetrahedralRenderer(ConstLut3DOpDataRcPtr & lut) } #endif - #if OCIO_USE_AVX && !defined(__aarch64__) + #if OCIO_USE_AVX if (CPUInfo::instance().hasAVX() && !CPUInfo::instance().AVXSlow()) { m_applyLutFunc = applyTetrahedralAVX; } #endif - #if OCIO_USE_AVX2 && !defined(__aarch64__) + #if OCIO_USE_AVX2 if (CPUInfo::instance().hasAVX2() && !CPUInfo::instance().AVX2SlowGather()) { m_applyLutFunc = applyTetrahedralAVX2; diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp index 1b69b689a9..8bb7784f25 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.cpp @@ -2,8 +2,7 @@ // Copyright Contributors to the OpenColorIO Project. #include "Lut3DOpCPU_AVX.h" -// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. 
-#if OCIO_USE_AVX && !defined(__aarch64__) +#if OCIO_USE_AVX #include "AVX.h" diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.h b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.h index a7a1498917..abed452ae4 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.h +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX.h @@ -8,7 +8,7 @@ #include "CPUInfo.h" -#if OCIO_USE_AVX && !defined(__aarch64__) +#if OCIO_USE_AVX namespace OCIO_NAMESPACE { diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp index 5fc829ebc8..96a4ff6f67 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.cpp @@ -2,8 +2,7 @@ // Copyright Contributors to the OpenColorIO Project. #include "Lut3DOpCPU_AVX2.h" -// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. -#if OCIO_USE_AVX2 && !defined(__aarch64__) +#if OCIO_USE_AVX2 #include #include diff --git a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.h b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.h index 6d5d18647e..6a8f72395f 100644 --- a/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.h +++ b/src/OpenColorIO/ops/lut3d/Lut3DOpCPU_AVX2.h @@ -8,7 +8,7 @@ #include "CPUInfo.h" -#if OCIO_USE_AVX2 && !defined(__aarch64__) +#if OCIO_USE_AVX2 namespace OCIO_NAMESPACE { diff --git a/tests/cpu/AVX2_tests.cpp b/tests/cpu/AVX2_tests.cpp index 36f01c59a7..e711661841 100644 --- a/tests/cpu/AVX2_tests.cpp +++ b/tests/cpu/AVX2_tests.cpp @@ -3,8 +3,7 @@ #include "CPUInfo.h" -// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. -#if OCIO_USE_AVX2 && !defined(__aarch64__) +#if OCIO_USE_AVX2 #include diff --git a/tests/cpu/AVX_tests.cpp b/tests/cpu/AVX_tests.cpp index b605d38958..60bd008373 100644 --- a/tests/cpu/AVX_tests.cpp +++ b/tests/cpu/AVX_tests.cpp @@ -3,8 +3,7 @@ #include "CPUInfo.h" -// The check for arm64 is needed for universal build (x86_64 and arm64) on Apple platform. 
-#if OCIO_USE_AVX && !defined(__aarch64__) +#if OCIO_USE_AVX #include diff --git a/tests/cpu/SIMD_tests.cpp b/tests/cpu/SIMD_tests.cpp index eb6508d3f5..3cd5b76bc4 100644 --- a/tests/cpu/SIMD_tests.cpp +++ b/tests/cpu/SIMD_tests.cpp @@ -26,7 +26,7 @@ OCIO_ADD_TEST_SSE2(packed_all_test) #endif -#if OCIO_USE_AVX && !defined(__aarch64__) +#if OCIO_USE_AVX #define AVX_CHECK() \ if (!OCIO::CPUInfo::instance().hasAVX()) throw SkipException() @@ -51,7 +51,7 @@ OCIO_ADD_TEST_AVX(packed_all_test) #endif -#if OCIO_USE_AVX2 && !defined(__aarch64__) +#if OCIO_USE_AVX2 #define AVX2_CHECK() \ if (!OCIO::CPUInfo::instance().hasAVX2()) throw SkipException() diff --git a/tests/cpu/UnitTestMain.cpp b/tests/cpu/UnitTestMain.cpp index 61ae8adf89..a29dd3e085 100644 --- a/tests/cpu/UnitTestMain.cpp +++ b/tests/cpu/UnitTestMain.cpp @@ -39,7 +39,7 @@ OCIO_ADD_TEST(UnitTest, windows_debug) #endif -#if defined(OCIO_ARCH_X86) || OCIO_USE_SSE2NEON +#if OCIO_ARCH_X86 || OCIO_USE_SSE2NEON #define ENABLE_SIMD_USAGE #endif From 84ca7fe6d3a52caca2ddcd44aca6943e0df51112 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Tue, 29 Aug 2023 13:09:31 -0400 Subject: [PATCH 17/22] Fixing some spacing, documentations and making some cmake conditions clearer. 
MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- CMakeLists.txt | 2 +- docs/quick_start/installation.rst | 2 +- share/cmake/modules/install/Installsse2neon.cmake | 2 +- share/cmake/utils/CheckSupportAVX.cmake | 6 +++--- share/cmake/utils/CheckSupportAVX2.cmake | 6 +++--- share/cmake/utils/CheckSupportAVX512.cmake | 6 +++--- share/cmake/utils/CheckSupportF16C.cmake | 10 +++++----- share/cmake/utils/CheckSupportSSE2.cmake | 6 +++--- share/cmake/utils/CheckSupportSSE3.cmake | 6 +++--- share/cmake/utils/CheckSupportSSE4.cmake | 6 +++--- share/cmake/utils/CheckSupportSSE42.cmake | 6 +++--- share/cmake/utils/CheckSupportSSSE3.cmake | 6 +++--- 12 files changed, 32 insertions(+), 32 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8bdfc0cbce..cb726a596b 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -254,7 +254,7 @@ endif() if(OCIO_USE_SIMD AND OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_ARM_NEON) # Install sse2neon. Please note that sse2neon is downloaded during the configure step as it is - # needed for CompilerFlags.cmake and CheckSupportSSE2.cmake. + # needed for CompilerFlags.cmake and CheckSupportSSEUsingSSE2NEON.cmake. # Sse2neon is not treated like an imported target. The logic to find sse2neon is here because # a find module is not suitable for sse2neon's use case. 
diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst index 5949e3e86d..ac91c7fe48 100644 --- a/docs/quick_start/installation.rst +++ b/docs/quick_start/installation.rst @@ -281,7 +281,7 @@ Here are the most common OCIO-specific CMake options (the default values are sho - ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations, such as SSE and NEON) - ``-DOCIO_USE_SSE2=ON`` (Set to OFF to turn off SSE2 CPU performance optimizations) - ``-DOCIO_USE_SSE3=ON`` (Set to OFF to turn off SSE3 CPU performance optimizations) -- ``-DOCIO_USE_SSSE2=ON`` (Set to OFF to turn off SSSE3 CPU performance optimizations) +- ``-DOCIO_USE_SSSE3=ON`` (Set to OFF to turn off SSSE3 CPU performance optimizations) - ``-DOCIO_USE_SSE4=ON`` (Set to OFF to turn off SSE4 CPU performance optimizations) - ``-DOCIO_USE_SSE42=ON`` (Set to OFF to turn off SSE4.2 CPU performance optimizations) - ``-DOCIO_USE_AVX=ON`` (Set to OFF to turn off AVX CPU performance optimizations) diff --git a/share/cmake/modules/install/Installsse2neon.cmake b/share/cmake/modules/install/Installsse2neon.cmake index 5f0f810ca1..ab15a5c2f3 100644 --- a/share/cmake/modules/install/Installsse2neon.cmake +++ b/share/cmake/modules/install/Installsse2neon.cmake @@ -29,7 +29,7 @@ if(NOT sse2neon_POPULATED) set(_EXT_DIST_INCLUDE "${CMAKE_BINARY_DIR}/ext/dist/${CMAKE_INSTALL_INCLUDEDIR}") file(COPY "${sse2neon_SOURCE_DIR}/sse2neon.h" DESTINATION "${_EXT_DIST_INCLUDE}/sse2neon") - # sse2neon_INCLUDE_DIR is used internally for CheckSupportSSE2.cmake and to create sse2neon + # sse2neon_INCLUDE_DIR is used internally for CheckSupportSSEUsingSSE2NEON.cmake and to create sse2neon # target for OCIO. 
set(sse2neon_INCLUDE_DIR "${sse2neon_SOURCE_DIR}") diff --git a/share/cmake/utils/CheckSupportAVX.cmake b/share/cmake/utils/CheckSupportAVX.cmake index c4a6015895..fd040b0821 100644 --- a/share/cmake/utils/CheckSupportAVX.cmake +++ b/share/cmake/utils/CheckSupportAVX.cmake @@ -3,8 +3,8 @@ set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") +if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")) set(__universal_build 1) set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") endif() @@ -45,7 +45,7 @@ try_compile(COMPILER_SUPPORTS_AVX ) if(COMPILER_SUPPORTS_AVX) - message(STATUS "Performing Test COMPILER_SUPPORTS_AVX - Success") + message(STATUS "Performing Test COMPILER_SUPPORTS_AVX - Success") else() message(STATUS "Performing Test COMPILER_SUPPORTS_AVX - Failed") endif() diff --git a/share/cmake/utils/CheckSupportAVX2.cmake b/share/cmake/utils/CheckSupportAVX2.cmake index c81673cd6e..f5a7544272 100644 --- a/share/cmake/utils/CheckSupportAVX2.cmake +++ b/share/cmake/utils/CheckSupportAVX2.cmake @@ -5,8 +5,8 @@ include(CheckCXXSourceCompiles) set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") +if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")) set(__universal_build 1) set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") endif() @@ -53,7 +53,7 @@ try_compile(COMPILER_SUPPORTS_AVX2 ) if(COMPILER_SUPPORTS_AVX2) - message(STATUS "Performing Test COMPILER_SUPPORTS_AVX2 - Success") + message(STATUS "Performing Test COMPILER_SUPPORTS_AVX2 - Success") else() message(STATUS "Performing Test COMPILER_SUPPORTS_AVX2 - Failed") endif() diff --git 
a/share/cmake/utils/CheckSupportAVX512.cmake b/share/cmake/utils/CheckSupportAVX512.cmake index 5ef3ea038c..3d4f5bedd4 100644 --- a/share/cmake/utils/CheckSupportAVX512.cmake +++ b/share/cmake/utils/CheckSupportAVX512.cmake @@ -5,8 +5,8 @@ include(CheckCXXSourceCompiles) set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") +if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")) set(__universal_build 1) set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") endif() @@ -41,7 +41,7 @@ try_compile(COMPILER_SUPPORTS_AVX512 ) if(COMPILER_SUPPORTS_AVX512) - message(STATUS "Performing Test COMPILER_SUPPORTS_AVX512 - Success") + message(STATUS "Performing Test COMPILER_SUPPORTS_AVX512 - Success") else() message(STATUS "Performing Test COMPILER_SUPPORTS_AVX512 - Failed") endif() diff --git a/share/cmake/utils/CheckSupportF16C.cmake b/share/cmake/utils/CheckSupportF16C.cmake index 400d065b0b..45672d6722 100644 --- a/share/cmake/utils/CheckSupportF16C.cmake +++ b/share/cmake/utils/CheckSupportF16C.cmake @@ -5,8 +5,8 @@ include(CheckCXXSourceCompiles) set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") +if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")) set(__universal_build 1) set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") endif() @@ -32,16 +32,16 @@ set(F16C_CODE " } ") -file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/sse42_test.cpp" "${F16C_CODE}") +file(WRITE "${CMAKE_BINARY_DIR}/CMakeTmp/f16c_test.cpp" "${F16C_CODE}") message(STATUS "Performing Test COMPILER_SUPPORTS_F16C") try_compile(COMPILER_SUPPORTS_F16C "${CMAKE_BINARY_DIR}/CMakeTmp" - 
"${CMAKE_BINARY_DIR}/CMakeTmp/sse42_test.cpp" + "${CMAKE_BINARY_DIR}/CMakeTmp/f16c_test.cpp" ) if(COMPILER_SUPPORTS_F16C) - message(STATUS "Performing Test COMPILER_SUPPORTS_F16C - Success") + message(STATUS "Performing Test COMPILER_SUPPORTS_F16C - Success") else() message(STATUS "Performing Test COMPILER_SUPPORTS_F16C - Failed") endif() diff --git a/share/cmake/utils/CheckSupportSSE2.cmake b/share/cmake/utils/CheckSupportSSE2.cmake index 67dd89828a..8c929c7e9b 100644 --- a/share/cmake/utils/CheckSupportSSE2.cmake +++ b/share/cmake/utils/CheckSupportSSE2.cmake @@ -5,8 +5,8 @@ include(CheckCXXSourceCompiles) set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") +if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")) set(__universal_build 1) set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") endif() @@ -54,7 +54,7 @@ try_compile(COMPILER_SUPPORTS_SSE2 ) if(COMPILER_SUPPORTS_SSE2) - message(STATUS "Performing Test COMPILER_SUPPORTS_SSE2 - Success") + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE2 - Success") else() message(STATUS "Performing Test COMPILER_SUPPORTS_SSE2 - Failed") endif() diff --git a/share/cmake/utils/CheckSupportSSE3.cmake b/share/cmake/utils/CheckSupportSSE3.cmake index d87daab1c6..ef1dbd3140 100644 --- a/share/cmake/utils/CheckSupportSSE3.cmake +++ b/share/cmake/utils/CheckSupportSSE3.cmake @@ -5,8 +5,8 @@ include(CheckCXXSourceCompiles) set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") +if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")) set(__universal_build 1) set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") endif() @@ -41,7 +41,7 @@ 
try_compile(COMPILER_SUPPORTS_SSE3 ) if(COMPILER_SUPPORTS_SSE3) - message(STATUS "Performing Test COMPILER_SUPPORTS_SSE3 - Success") + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE3 - Success") else() message(STATUS "Performing Test COMPILER_SUPPORTS_SSE3 - Failed") endif() diff --git a/share/cmake/utils/CheckSupportSSE4.cmake b/share/cmake/utils/CheckSupportSSE4.cmake index 4e3e815834..dd41624d9c 100644 --- a/share/cmake/utils/CheckSupportSSE4.cmake +++ b/share/cmake/utils/CheckSupportSSE4.cmake @@ -5,8 +5,8 @@ include(CheckCXXSourceCompiles) set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") +if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")) set(__universal_build 1) set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") endif() @@ -41,7 +41,7 @@ try_compile(COMPILER_SUPPORTS_SSE4 ) if(COMPILER_SUPPORTS_SSE4) - message(STATUS "Performing Test COMPILER_SUPPORTS_SSE4 - Success") + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE4 - Success") else() message(STATUS "Performing Test COMPILER_SUPPORTS_SSE4 - Failed") endif() diff --git a/share/cmake/utils/CheckSupportSSE42.cmake b/share/cmake/utils/CheckSupportSSE42.cmake index 4a002b87b9..0e0afe2398 100644 --- a/share/cmake/utils/CheckSupportSSE42.cmake +++ b/share/cmake/utils/CheckSupportSSE42.cmake @@ -5,8 +5,8 @@ include(CheckCXXSourceCompiles) set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") +if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")) set(__universal_build 1) set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") endif() @@ -41,7 +41,7 @@ try_compile(COMPILER_SUPPORTS_SSE42 ) 
if(COMPILER_SUPPORTS_SSE42) - message(STATUS "Performing Test COMPILER_SUPPORTS_SSE42 - Success") + message(STATUS "Performing Test COMPILER_SUPPORTS_SSE42 - Success") else() message(STATUS "Performing Test COMPILER_SUPPORTS_SSE42 - Failed") endif() diff --git a/share/cmake/utils/CheckSupportSSSE3.cmake b/share/cmake/utils/CheckSupportSSSE3.cmake index a0ead45445..9efea27eea 100644 --- a/share/cmake/utils/CheckSupportSSSE3.cmake +++ b/share/cmake/utils/CheckSupportSSSE3.cmake @@ -5,8 +5,8 @@ include(CheckCXXSourceCompiles) set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}") -if(APPLE AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" - OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64") +if(APPLE AND ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" + OR "${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")) set(__universal_build 1) set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}") endif() @@ -41,7 +41,7 @@ try_compile(COMPILER_SUPPORTS_SSSE3 ) if(COMPILER_SUPPORTS_SSSE3) - message(STATUS "Performing Test COMPILER_SUPPORTS_SSSE3 - Success") + message(STATUS "Performing Test COMPILER_SUPPORTS_SSSE3 - Success") else() message(STATUS "Performing Test COMPILER_SUPPORTS_SSSE3 - Failed") endif() From fac52ff47e1569ffb5f83dfd45b157fcb4e69b00 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Tue, 29 Aug 2023 13:44:53 -0400 Subject: [PATCH 18/22] Adding a build in ci_workflow for macos USE_OCIO_SSE2NEON=OFF MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- .github/workflows/ci_workflow.yml | 79 ++++++++++++++++++++----------- 1 file changed, 51 insertions(+), 28 deletions(-) diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml index 076bf6e4ed..aed3a5d076 100644 --- a/.github/workflows/ci_workflow.yml +++ b/.github/workflows/ci_workflow.yml @@ -44,7 +44,7 @@ jobs: <${{ matrix.compiler-desc }} config=${{ matrix.build-type }}, 
shared=${{ matrix.build-shared }}, - sse=${{ matrix.use-sse }}, + simd=${{ matrix.use-simd }}, cxx=${{ matrix.cxx-standard }}, docs=${{ matrix.build-docs }}, oiio=${{ matrix.use-oiio }}>' @@ -70,7 +70,7 @@ jobs: build-shared: 'ON' build-docs: 'OFF' build-openfx: 'ON' - use-sse: 'ON' + use-simd: 'ON' use-oiio: 'ON' cxx-standard: 17 cxx-compiler: clang++ @@ -82,7 +82,7 @@ jobs: build-shared: 'ON' build-docs: 'ON' build-openfx: 'ON' - use-sse: 'OFF' + use-simd: 'OFF' use-oiio: 'OFF' cxx-standard: 17 cxx-compiler: g++ @@ -94,7 +94,7 @@ jobs: build-shared: 'OFF' build-docs: 'OFF' build-openfx: 'OFF' - use-sse: 'ON' + use-simd: 'ON' use-oiio: 'OFF' cxx-standard: 11 cxx-compiler: g++ @@ -109,7 +109,7 @@ jobs: build-shared: 'ON' build-docs: 'OFF' build-openfx: 'ON' - use-sse: 'ON' + use-simd: 'ON' use-oiio: 'ON' cxx-standard: 17 cxx-compiler: clang++ @@ -121,7 +121,7 @@ jobs: build-shared: 'ON' build-docs: 'ON' build-openfx: 'ON' - use-sse: 'OFF' + use-simd: 'OFF' use-oiio: 'OFF' cxx-standard: 17 cxx-compiler: g++ @@ -133,7 +133,7 @@ jobs: build-shared: 'OFF' build-docs: 'OFF' build-openfx: 'OFF' - use-sse: 'ON' + use-simd: 'ON' use-oiio: 'OFF' cxx-standard: 11 cxx-compiler: g++ @@ -148,7 +148,7 @@ jobs: build-shared: 'ON' build-docs: 'OFF' build-openfx: 'OFF' - use-sse: 'ON' + use-simd: 'ON' use-oiio: 'ON' cxx-standard: 17 cxx-compiler: clang++ @@ -160,7 +160,7 @@ jobs: build-shared: 'OFF' build-docs: 'OFF' build-openfx: 'ON' - use-sse: 'OFF' + use-simd: 'OFF' use-oiio: 'OFF' cxx-standard: 14 cxx-compiler: clang++ @@ -172,7 +172,7 @@ jobs: build-shared: 'ON' build-docs: 'OFF' build-openfx: 'OFF' - use-sse: 'ON' + use-simd: 'ON' use-oiio: 'OFF' cxx-standard: 11 cxx-compiler: g++ @@ -187,7 +187,7 @@ jobs: build-shared: 'ON' build-docs: 'OFF' build-openfx: 'ON' - use-sse: 'OFF' + use-simd: 'OFF' use-oiio: 'OFF' cxx-standard: 14 cxx-compiler: clang++ @@ -199,7 +199,7 @@ jobs: build-shared: 'OFF' build-docs: 'OFF' build-openfx: 'ON' - use-sse: 'ON' + use-simd: 'ON' use-oiio: 
'ON' cxx-standard: 14 cxx-compiler: g++ @@ -211,7 +211,7 @@ jobs: build-shared: 'ON' build-docs: 'OFF' build-openfx: 'OFF' - use-sse: 'ON' + use-simd: 'ON' use-oiio: 'OFF' cxx-standard: 11 cxx-compiler: g++ @@ -243,7 +243,7 @@ jobs: -DOCIO_BUILD_DOCS=${{ matrix.build-docs }} \ -DOCIO_BUILD_OPENFX=${{ matrix.build-openfx }} \ -DOCIO_BUILD_GPU_TESTS=OFF \ - -DOCIO_USE_SIMD=${{ matrix.use-sse }} \ + -DOCIO_USE_SIMD=${{ matrix.use-simd }} \ -DOCIO_USE_OIIO_FOR_APPS=${{ matrix.use-oiio }} \ -DOCIO_INSTALL_EXT_PACKAGES=ALL \ -DOCIO_WARNING_AS_ERROR=ON \ @@ -306,9 +306,11 @@ jobs: macos: name: 'macOS 11 Date: Tue, 29 Aug 2023 13:46:25 -0400 Subject: [PATCH 19/22] typo MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- .github/workflows/ci_workflow.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml index aed3a5d076..e6c3b1d12c 100644 --- a/.github/workflows/ci_workflow.yml +++ b/.github/workflows/ci_workflow.yml @@ -306,7 +306,7 @@ jobs: macos: name: 'macOS 11 Date: Tue, 29 Aug 2023 16:10:11 -0400 Subject: [PATCH 20/22] Changing back all the macOS (except one) builds to x86_64 only as it takes double the time to do the universal build. 
OCIO no longuer build a universal binary by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- .github/workflows/ci_workflow.yml | 9 +++++---- CMakeLists.txt | 8 -------- 2 files changed, 5 insertions(+), 12 deletions(-) diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml index e6c3b1d12c..452198e8ed 100644 --- a/.github/workflows/ci_workflow.yml +++ b/.github/workflows/ci_workflow.yml @@ -325,7 +325,7 @@ jobs: build: [1, 2, 3, 4] include: - build: 5 - arch-type: "x86_64;arm64" + arch-type: "x86_64" build-type: Release build-shared: 'ON' build-docs: 'OFF' @@ -335,6 +335,7 @@ jobs: use-oiio: 'ON' cxx-standard: 17 python-version: '3.11' + # Keeping one universal build - build: 4 arch-type: "x86_64;arm64" build-type: Release @@ -347,7 +348,7 @@ jobs: cxx-standard: 11 python-version: '3.10' - build: 3 - arch-type: "x86_64;arm64" + arch-type: "x86_64" build-type: Release build-shared: 'ON' build-docs: 'ON' @@ -358,7 +359,7 @@ jobs: cxx-standard: 11 python-version: '3.10' - build: 2 - arch-type: "x86_64;arm64" + arch-type: "x86_64" build-type: Debug build-shared: 'ON' build-docs: 'OFF' @@ -369,7 +370,7 @@ jobs: cxx-standard: 11 python-version: '3.9' - build: 1 - arch-type: "x86_64;arm64" + arch-type: "x86_64" build-type: Release build-shared: 'OFF' build-docs: 'OFF' diff --git a/CMakeLists.txt b/CMakeLists.txt index cb726a596b..5aac1144a3 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -25,14 +25,6 @@ if(APPLE AND NOT DEFINED CMAKE_OSX_DEPLOYMENT_TARGET) endif() -############################################################################### -# By default, build the library, tests, tools, and Python binding as universal binaries for macOS. 
- -if(APPLE AND (NOT DEFINED CMAKE_OSX_ARCHITECTURES OR CMAKE_OSX_ARCHITECTURES STREQUAL "")) - set(CMAKE_OSX_ARCHITECTURES "arm64;x86_64" CACHE STRING "Default OS X architectures" FORCE) -endif() - - ############################################################################### # Project definition. From 6765e8ad51860b2bbfb0bcf92e743d282cd00652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Thu, 31 Aug 2023 15:20:54 -0400 Subject: [PATCH 21/22] Update the CMakeLists.txt logic to accomodate all scenario and fixing documentations. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- CMakeLists.txt | 106 ++++++++++++++++++++++-------- docs/quick_start/installation.rst | 24 +++---- 2 files changed, 88 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fb8cb7a666..745fd1898e 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -178,33 +178,83 @@ endif() option(OCIO_USE_SIMD "Specify whether to enable SIMD CPU performance optimizations" ON) option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ocioconvert and ociodisplay), the default uses OpenEXR." OFF) -if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|X86|x86_64|i386|i686)" OR - (APPLE AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64" AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "(arm64;x86_64|x86_64;arm64|x86_64)")) - - # Enable OCIO_ARCH_X86 for any intel-based architecture or Apple Rosetta (x86_64 or universal build). - set(OCIO_ARCH_X86 1) - - # For a Mac OS universal build, OCIO translates the SSE instructions into ARM Neon with - # the library SSE2NEON for the arm64 side of the binary. - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX ON) - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C ON) -elseif(APPLE AND "${CMAKE_SYSTEM_PROCESSOR}" MATCHES "arm64" AND "${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64") - - # Apple ARM only build. 
- set(OCIO_ARCH_X86 0) - - # OCIO translates the SSE instructions into ARM Neon with the library SSE2NEON. - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) - # OCIO does not translate AVX to ARM Neon. - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) - # Turn F16C off since they are x86_64 instructions. - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) -else() - set(OCIO_ARCH_X86 0) - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE OFF) - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) - set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) + +if (NOT APPLE) + if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|X86|x86_64|i386|i686)") + # Intel-based architecture (not APPLE) + set(OCIO_ARCH_X86 1) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX ON) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C ON) + else() + set(OCIO_ARCH_X86 0) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE OFF) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) + endif() +elseif (APPLE) + # Multiple possible scenarios: + # A. Building on an Apple arm64 architecture for the native architecture (arm64). + # B. Building on an Apple arm64 architecture for x86_64. + # C. Building on an Apple arm64 architecture for universal binaries (x86_64 + arm64). + # + # D. Building on an Apple Intel architecture for the native architecture (x86_64). + # E. Building on an Apple Intel architecture for arm64. + # F. Building on an Apple Intel architecture for universal binaries (x86_64 + arm64). + + # Check if we are NOT cross-compiling to another Apple architecture. + if (NOT DEFINED CMAKE_OSX_ARCHITECTURES OR CMAKE_OSX_ARCHITECTURES STREQUAL "") + # Building for the native architectures. + if ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "x86_64") + # Apple Intel-based computer or Rosetta. (covers case D) + + set(OCIO_ARCH_X86 1) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) + # The AVX and F16C will be compiled, but they won't be used at runtime by Rosetta.
+ set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX ON) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C ON) + elseif ("${CMAKE_SYSTEM_PROCESSOR}" STREQUAL "arm64") + # Apple arm64. (covers case A) + + set(OCIO_ARCH_X86 0) + # OCIO translates the SSE instructions into ARM Neon with the library SSE2NEON. + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) + # OCIO does not translate AVX to ARM Neon. + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) + # Turn F16C off since they are x86_64 instructions. + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) + endif() + else() + # Cross-compiling to another Apple architecture (x86_64, arm64, or both (universal)) + if ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "x86_64") + # Compiling for x86_64 regardless of native architecture. + # Covers case B assuming CMAKE_SYSTEM_PROCESSOR is arm64. + # Covers case D if CMAKE_OSX_ARCHITECTURES is set to x86_64 on x86_64 architecture. + + set(OCIO_ARCH_X86 1) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) + # The AVX and F16C will be compiled, but they won't be used at runtime by Rosetta. + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX ON) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C ON) + elseif ("${CMAKE_OSX_ARCHITECTURES}" STREQUAL "arm64") + # Compiling for arm64 regardless of native architecture. + # Covers case E assuming CMAKE_SYSTEM_PROCESSOR is x86_64. + # Covers case A if CMAKE_OSX_ARCHITECTURES is set to arm64 on arm64 architecture. + + set(OCIO_ARCH_X86 0) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF) + elseif ("${CMAKE_OSX_ARCHITECTURES}" MATCHES "(arm64;x86_64|x86_64;arm64)") + # Universal build (covers case C and F). + + set(OCIO_ARCH_X86 1) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON) + # The AVX and F16C will be compiled, but they won't be used at runtime by Rosetta.
+ set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX ON) + set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C ON) + endif() + endif() endif() option(OCIO_USE_SSE2 "Specify whether to enable SSE2 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE}) @@ -218,7 +268,9 @@ option(OCIO_USE_AVX512 "Specify whether to enable AVX512 CPU performance optimiz option(OCIO_USE_F16C "Specify whether to enable F16C CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C}) if (APPLE) + # TODO: Revisit whether that option is necessary. option(OCIO_USE_SSE2NEON "Specify whether to enable SSE CPU performance optimizations using SSE2NEON for Apple ARM architecture" ON) + mark_as_advanced(OCIO_USE_SSE2NEON) endif() diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst index ac91c7fe48..a325edea0f 100644 --- a/docs/quick_start/installation.rst +++ b/docs/quick_start/installation.rst @@ -279,29 +279,23 @@ Here are the most common OCIO-specific CMake options (the default values are sho - ``-DOCIO_BUILD_PYTHON=ON`` (Set to OFF to not build the Python binding) - ``-DOCIO_BUILD_OPENFX=OFF`` (Set to ON to build the OpenFX plug-ins) - ``-DOCIO_USE_SIMD=ON`` (Set to OFF to turn off SIMD CPU performance optimizations, such as SSE and NEON) -- ``-DOCIO_USE_SSE2=ON`` (Set to OFF to turn off SSE2 CPU performance optimizations) -- ``-DOCIO_USE_SSE3=ON`` (Set to OFF to turn off SSE3 CPU performance optimizations) -- ``-DOCIO_USE_SSSE3=ON`` (Set to OFF to turn off SSSE3 CPU performance optimizations) -- ``-DOCIO_USE_SSE4=ON`` (Set to OFF to turn off SSE4 CPU performance optimizations) -- ``-DOCIO_USE_SSE42=ON`` (Set to OFF to turn off SSE4.2 CPU performance optimizations) -- ``-DOCIO_USE_AVX=ON`` (Set to OFF to turn off AVX CPU performance optimizations) -- ``-DOCIO_USE_AVX2=ON`` (Set to OFF to turn off AVX2 CPU performance optimizations) -- ``-DOCIO_USE_AVX512=ON`` (Set to OFF to turn off AVX512 CPU performance optimizations) -- ``-DOCIO_USE_F16C=ON`` (Set to OFF 
to turn off F16C CPU performance optimizations) -- ``-OCIO_USE_SSE2NEON=ON`` (Apple Only; Set to OFF to turn off the SSE2NEON translation performance optimizations on Apple ARM) +- ``-DOCIO_USE_SSE2`` (Set to OFF to turn off SSE2 CPU performance optimizations) +- ``-DOCIO_USE_AVX`` (Set to OFF to turn off AVX CPU performance optimizations) +- ``-DOCIO_USE_AVX2`` (Set to OFF to turn off AVX2 CPU performance optimizations) +- ``-DOCIO_USE_F16C`` (Set to OFF to turn off F16C CPU performance optimizations) - ``-DOCIO_BUILD_TESTS=ON`` (Set to OFF to not build the unit tests) - ``-DOCIO_BUILD_GPU_TESTS=ON`` (Set to OFF to not build the GPU unit tests) -- ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU reendering) +- ``-DOCIO_USE_HEADLESS=OFF`` (Set to ON to do headless GPU rendering) - ``-DOCIO_WARNING_AS_ERROR=ON`` (Set to OFF to turn off warnings as errors) - ``-DOCIO_BUILD_DOCS=OFF`` (Set to ON to build the documentation) - ``-DOCIO_BUILD_FROZEN_DOCS=OFF`` (Set to ON to update the Python documentation) Note that OCIO will turn off any specific SIMD CPU performance optimizations if they are not supported -by the build target architecture. +by the build target architecture. The default for ``OCIO_USE_SSE2``, ``OCIO_USE_AVX``, ``OCIO_USE_AVX2`` and +``OCIO_USE_F16C`` depends on the architecture, but will be ON where supported. -On the MacOS, the default is to build universal binaries -(natively supporting both the Intel and ARM processors). The ``-DCMAKE_OSX_ARCHITECTURES`` option -may be set to just arm64 or x86_64 to override the default value, which is ``arm64;x86_64``. +On MacOS, the default is to build for the native architecture. The ``-DCMAKE_OSX_ARCHITECTURES`` option +may be set to ``arm64;x86_64`` to build the universal binaries. When doing a universal build, note that the OCIO dependencies must be built as universal libraries too. 
If you are running in OCIO_INSTALL_EXT_PACKAGES=MISSING or NONE mode, your build will fail if From 3a91fb8f68516e13c8979bde5fd72469dbfdd3e4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?C=C3=A9drik=20Fuoco?= Date: Thu, 31 Aug 2023 21:05:34 -0400 Subject: [PATCH 22/22] Update documentation and remove ocio_use_sse2neon from the CI matrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Cédrik Fuoco --- .github/workflows/ci_workflow.yml | 7 ------- docs/quick_start/installation.rst | 6 ++++-- 2 files changed, 4 insertions(+), 9 deletions(-) diff --git a/.github/workflows/ci_workflow.yml b/.github/workflows/ci_workflow.yml index 452198e8ed..2bc34c3ed2 100644 --- a/.github/workflows/ci_workflow.yml +++ b/.github/workflows/ci_workflow.yml @@ -310,7 +310,6 @@ jobs: config=${{ matrix.build-type }}, shared=${{ matrix.build-shared }}, simd=${{ matrix.use-simd }}, - sse2neon=${{ matrix.use-sse2neon }}, cxx=${{ matrix.cxx-standard }}, python=${{ matrix.python-version }}, docs=${{ matrix.build-docs }}, @@ -331,7 +330,6 @@ jobs: build-docs: 'OFF' build-openfx: 'ON' use-simd: 'ON' - use-sse2neon: 'ON' use-oiio: 'ON' cxx-standard: 17 python-version: '3.11' @@ -343,7 +341,6 @@ jobs: build-docs: 'OFF' build-openfx: 'OFF' use-simd: 'ON' - use-sse2neon: 'OFF' use-oiio: 'OFF' cxx-standard: 11 python-version: '3.10' @@ -354,7 +351,6 @@ jobs: build-docs: 'ON' build-openfx: 'OFF' use-simd: 'OFF' - use-sse2neon: 'OFF' use-oiio: 'OFF' cxx-standard: 11 python-version: '3.10' @@ -365,7 +361,6 @@ jobs: build-docs: 'OFF' build-openfx: 'ON' use-simd: 'ON' - use-sse2neon: 'ON' use-oiio: 'OFF' cxx-standard: 11 python-version: '3.9' @@ -376,7 +371,6 @@ jobs: build-docs: 'OFF' build-openfx: 'ON' use-simd: 'ON' - use-sse2neon: 'ON' use-oiio: 'OFF' cxx-standard: 14 python-version: '3.7' @@ -411,7 +405,6 @@ jobs: -DOCIO_INSTALL_EXT_PACKAGES=ALL \ -DOCIO_WARNING_AS_ERROR=ON \ -DPython_EXECUTABLE=$(which python) \ - -DOCIO_USE_SSE2NEON=${{ 
matrix.use-sse2neon }} \ -DCMAKE_OSX_ARCHITECTURES="${{ matrix.arch-type }}" working-directory: _build - name: Build diff --git a/docs/quick_start/installation.rst b/docs/quick_start/installation.rst index a325edea0f..7ba97b6ad0 100644 --- a/docs/quick_start/installation.rst +++ b/docs/quick_start/installation.rst @@ -294,8 +294,10 @@ Note that OCIO will turn off any specific SIMD CPU performance optimizations if by the build target architecture. The default for ``OCIO_USE_SSE2``, ``OCIO_USE_AVX``, ``OCIO_USE_AVX2`` and ``OCIO_USE_F16C`` depends on the architecture, but will be ON where supported. -On MacOS, the default is to build for the native architecture. The ``-DCMAKE_OSX_ARCHITECTURES`` option -may be set to ``arm64;x86_64`` to build the universal binaries. +On MacOS, the default is to build for the native architecture that CMake is running under. +For example, if an x86_64 version of CMake is running under Rosetta, the native architecture will +be x86_64, rather than arm64. You can use the ``CMAKE_OSX_ARCHITECTURES`` option to override that. +To build universal binaries, use the following option: ``-DCMAKE_OSX_ARCHITECTURES="arm64;x86_64"``. When doing a universal build, note that the OCIO dependencies must be built as universal libraries too. If you are running in OCIO_INSTALL_EXT_PACKAGES=MISSING or NONE mode, your build will fail if