Fix unit tests environment variables #4607

Merged 2 commits on Nov 21, 2022
41 changes: 36 additions & 5 deletions CMakeLists.txt
@@ -111,6 +111,10 @@ if(CMAKE_CXX_COMPILER_ID MATCHES "Clang" AND NOT APPLE)
     "Build with memory sanitizer (experimental; requires a memory-sanitized Python interpreter)"
     OFF)
 endif()
+option(
+  ESPRESSO_ADD_OMPI_SINGLETON_WARNING
+  "Add a runtime warning in the pypresso script for NUMA architectures that aren't supported in singleton mode by Open MPI 4.x"
+  ON)
 option(WARNINGS_ARE_ERRORS "Treat warnings as errors during compilation" OFF)
 option(WITH_CCACHE "Use ccache compiler invocation." OFF)
 option(WITH_PROFILER "Enable profiler annotations." OFF)
@@ -320,11 +324,38 @@ find_package(MPI 3.0 REQUIRED)
 find_package(MpiexecBackend)

 # OpenMPI checks the number of processes against the number of CPUs
-if("${MPIEXEC_BACKEND_NAME}" STREQUAL "OpenMPI" AND "${MPIEXEC_BACKEND_VERSION}"
-                                                    VERSION_GREATER_EQUAL 2.0.0)
-  set(MPIEXEC_OVERSUBSCRIBE "-oversubscribe")
-else()
-  set(MPIEXEC_OVERSUBSCRIBE "")
-endif()
+set(MPIEXEC_OVERSUBSCRIBE "")
+# Open MPI 4.x has a bug on NUMA archs that prevents running in singleton mode
+set(ESPRESSO_MPIEXEC_GUARD_SINGLETON_NUMA OFF)
+set(ESPRESSO_CPU_MODEL_NAME_OMPI_SINGLETON_NUMA_PATTERN "AMD (EPYC|Ryzen)")
+
+if("${MPIEXEC_BACKEND_NAME}" STREQUAL "OpenMPI")
+  if("${MPIEXEC_BACKEND_VERSION}" VERSION_GREATER_EQUAL 2.0.0)
+    set(MPIEXEC_OVERSUBSCRIBE "-oversubscribe")
+  endif()
+  if("${MPIEXEC_BACKEND_VERSION}" VERSION_GREATER_EQUAL 4.0
+     AND "${MPIEXEC_BACKEND_VERSION}" VERSION_LESS 5.0)
+    if(NOT DEFINED ESPRESSO_CPU_MODEL_NAME)
+      if(CMAKE_SYSTEM_NAME STREQUAL Linux)
+        if(EXISTS /proc/cpuinfo)
+          file(READ /proc/cpuinfo ESPRESSO_CPU_INFO)
+          string(REGEX
+                 REPLACE ".*\n[Mm]odel name[ \t]*:[ \t]+([^\n]+).*" "\\1"
+                 ESPRESSO_CPU_MODEL_NAME_STRING "${ESPRESSO_CPU_INFO}")
+        else()
+          set(ESPRESSO_CPU_MODEL_NAME_STRING "__unreadable")
+        endif()
+      else()
+        set(ESPRESSO_CPU_MODEL_NAME_STRING "__unaffected")
+      endif()
+      set(ESPRESSO_CPU_MODEL_NAME "${ESPRESSO_CPU_MODEL_NAME_STRING}"
+          CACHE INTERNAL "")
+    endif()
+    if(ESPRESSO_CPU_MODEL_NAME MATCHES
+       "^${ESPRESSO_CPU_MODEL_NAME_OMPI_SINGLETON_NUMA_PATTERN}")
+      set(ESPRESSO_MPIEXEC_GUARD_SINGLETON_NUMA ON)
+    endif()
+  endif()
+endif()

 # OpenMPI cannot run two jobs in parallel in a Docker container, because the
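Note: to check by hand whether a host would trip this guard, the same pattern can be grepped out of /proc/cpuinfo. A sketch (the hard-coded "AMD (EPYC|Ryzen)" mirrors ESPRESSO_CPU_MODEL_NAME_OMPI_SINGLETON_NUMA_PATTERN above):

    # prints the matching line on affected CPUs, exits non-zero otherwise
    grep -P -m 1 "^[Mm]odel name[ \t]*:[ \t]+AMD (EPYC|Ryzen)( |$)" /proc/cpuinfo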
12 changes: 7 additions & 5 deletions cmake/unit_test.cmake
@@ -49,12 +49,14 @@ function(UNIT_TEST)
 else()
   set(SANITIZERS_HALT_ON_ERROR "halt_on_error=0")
 endif()
-set(UBSAN_OPTIONS "UBSAN_OPTIONS=suppressions=${CMAKE_SOURCE_DIR}/maintainer/CI/ubsan.supp:${SANITIZERS_HALT_ON_ERROR}:print_stacktrace=1")
-set(ASAN_OPTIONS "ASAN_OPTIONS=${SANITIZERS_HALT_ON_ERROR}:detect_leaks=0:allocator_may_return_null=1")
-set(MSAN_OPTIONS "MSAN_OPTIONS=${SANITIZERS_HALT_ON_ERROR}")
+list(APPEND TEST_ENV_VARIABLES "UBSAN_OPTIONS=suppressions=${CMAKE_SOURCE_DIR}/maintainer/CI/ubsan.supp:${SANITIZERS_HALT_ON_ERROR}:print_stacktrace=1")
+list(APPEND TEST_ENV_VARIABLES "ASAN_OPTIONS=${SANITIZERS_HALT_ON_ERROR}:detect_leaks=0:allocator_may_return_null=1")
+list(APPEND TEST_ENV_VARIABLES "MSAN_OPTIONS=${SANITIZERS_HALT_ON_ERROR}")
+if(NOT TEST_NUM_PROC AND ESPRESSO_MPIEXEC_GUARD_SINGLETON_NUMA AND "${TEST_DEPENDS}" MATCHES "(^|;)([Bb]oost::mpi|MPI::MPI_CXX)($|;)")
+  list(APPEND TEST_ENV_VARIABLES "OMPI_MCA_hwloc_base_binding_policy=none")
+endif()
 set_tests_properties(
-  ${TEST_NAME} PROPERTIES ENVIRONMENT
-  "${UBSAN_OPTIONS} ${ASAN_OPTIONS} ${MSAN_OPTIONS}")
+  ${TEST_NAME} PROPERTIES ENVIRONMENT "${TEST_ENV_VARIABLES}")

 add_dependencies(check_unit_tests ${TEST_NAME})
 endfunction(UNIT_TEST)
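Note: the test environment is now assembled in a single TEST_ENV_VARIABLES list instead of a space-joined string, so the binding-policy override composes cleanly with the sanitizer options. Roughly, an affected MPI test runs as if invoked like this (test name, paths, and option values are illustrative only):

    UBSAN_OPTIONS="suppressions=ubsan.supp:halt_on_error=0:print_stacktrace=1" \
    ASAN_OPTIONS="halt_on_error=0:detect_leaks=0:allocator_may_return_null=1" \
    OMPI_MCA_hwloc_base_binding_policy=none \
      ./ObjectList_test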
14 changes: 14 additions & 0 deletions doc/sphinx/installation.rst
@@ -67,6 +67,15 @@ are required to be able to compile and use |es|:
 Other MPI implementations like Intel MPI should also work, although
 they are not actively tested in |es| continuous integration.

+Open MPI version 4.x is known to not properly support the MCA binding
+policy "numa" in singleton mode on a few NUMA architectures.
+On affected systems, e.g. AMD Ryzen or AMD EPYC, Open MPI halts with
+a fatal error when setting the processor affinity in ``MPI_Init``.
+This issue can be resolved by setting the environment variable
+``OMPI_MCA_hwloc_base_binding_policy`` to a value other than "numa",
+such as "l3cache" to bind to a NUMA shared memory block, or to
+"none" to disable binding (can cause performance loss).
+
 Python
 |es|'s main user interface relies on Python 3.
@@ -743,6 +752,11 @@ The following options are available:
 * ``WITH_VALGRIND_INSTRUMENTATION``: Build with valgrind instrumentation
   markers

+* ``ESPRESSO_ADD_OMPI_SINGLETON_WARNING``: Add a runtime warning in the
+  pypresso and ipypresso scripts that is triggered in singleton mode
+  with Open MPI version 4.x on unsupported NUMA environments
+  (see :term:`MPI installation requirements <MPI>` for details).
+
 When the value in the :file:`CMakeLists.txt` file is set to ON, the
 corresponding option is created; if the value of the option is set to OFF,
 the corresponding option is not created. These options can also be modified
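Note: for users on an affected machine, the documented workaround boils down to a couple of shell commands (the simulation script name is hypothetical; option and variable names are as above):

    # bind to the L3 cache domain instead of the NUMA node
    export OMPI_MCA_hwloc_base_binding_policy=l3cache
    ./pypresso simulation.py

    # or silence the startup warning at configure time
    cmake -D ESPRESSO_ADD_OMPI_SINGLETON_WARNING=OFF ..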
3 changes: 1 addition & 2 deletions doc/sphinx/running.rst
@@ -292,8 +292,7 @@ Parallel computing

 Many algorithms in |es| are designed to work with multiple MPI ranks.
 However, not all algorithms benefit from MPI parallelization equally.
-Several algorithms only use MPI rank 0 (e.g. :ref:`Reaction methods`), while
-a small subset simply don't support MPI (e.g. :ref:`Dipolar direct sum`).
+Several algorithms only use MPI rank 0 (e.g. :ref:`Reaction methods`).
 |es| should work with most MPI implementations on the market;
 see the :term:`MPI installation requirements <MPI>` for details.
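Note: for algorithms that do benefit from MPI, a typical multi-rank run still looks like this (rank count and script name illustrative):

    mpiexec -n 4 ./pypresso simulation.py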
9 changes: 9 additions & 0 deletions src/python/pypresso.cmakein
@@ -14,6 +14,15 @@ else
 fi
 export PYTHONPATH

+# Open MPI 4.x cannot run in singleton mode on some NUMA systems
+if [ "@ESPRESSO_ADD_OMPI_SINGLETON_WARNING@" = "ON" ] && [ "@ESPRESSO_MPIEXEC_GUARD_SINGLETON_NUMA@" = "ON" ]; then
+  if [ -z "${OMPI_COMM_WORLD_SIZE}" ] && [ "${OMPI_MCA_hwloc_base_binding_policy}" = "numa" ]; then
+    if test -f /proc/cpuinfo && grep --quiet -P "^[Mm]odel name[ \t]*:[ \t]+@ESPRESSO_CPU_MODEL_NAME_OMPI_SINGLETON_NUMA_PATTERN@( |$)" /proc/cpuinfo; then
+      echo "warning: if Open MPI fails to set processor affinity, set environment variable OMPI_MCA_hwloc_base_binding_policy to \"none\" or \"l3cache\""
+    fi
+  fi
+fi
+
 if [ "@CMAKE_CXX_COMPILER_ID@" != "GNU" ] && [ "@WITH_ASAN@" = "ON" ]; then
   asan_lib=$("@CMAKE_CXX_COMPILER@" /dev/null -### -o /dev/null -fsanitize=address 2>&1 | grep -o '[" ][^" ]*libclang_rt.asan[^" ]*[^s][" ]' | sed 's/[" ]//g' | sed 's/\.a$/.so/g')
   export DYLD_INSERT_LIBRARIES="$asan_lib"
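Note: the warning only concerns singleton mode, which the script detects by the absence of Open MPI's launcher variables. The two invocation modes can be contrasted as follows (script name illustrative):

    # singleton mode: OMPI_COMM_WORLD_SIZE is unset, so the guard may print its warning
    ./pypresso simulation.py
    # launcher mode: mpiexec exports OMPI_COMM_WORLD_SIZE, so the guard stays silent
    mpiexec -n 2 ./pypresso simulation.py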
4 changes: 2 additions & 2 deletions src/script_interface/tests/CMakeLists.txt
@@ -42,9 +42,9 @@ unit_test(NAME ParallelExceptionHandler_test SRC
 unit_test(NAME packed_variant_test SRC packed_variant_test.cpp DEPENDS
           espresso::script_interface)
 unit_test(NAME ObjectList_test SRC ObjectList_test.cpp DEPENDS
-          espresso::script_interface espresso::core)
+          espresso::script_interface espresso::core Boost::mpi)
 unit_test(NAME ObjectMap_test SRC ObjectMap_test.cpp DEPENDS
-          espresso::script_interface espresso::core)
+          espresso::script_interface espresso::core Boost::mpi)
 unit_test(NAME serialization_mpi_guard_test SRC
           serialization_mpi_guard_test.cpp DEPENDS espresso::script_interface
           Boost::mpi MPI::MPI_CXX NUM_PROC 2)
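Note: adding Boost::mpi here is what makes the TEST_DEPENDS regex in cmake/unit_test.cmake match, so these two tests now inherit the binding-policy override. They can be rerun in isolation with a standard CTest filter, e.g.:

    ctest -R "ObjectList_test|ObjectMap_test" --output-on-failure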