diff --git a/src/framework/MOM_error_handler.F90 b/src/framework/MOM_error_handler.F90 index 6051fed08b..223423c7fc 100644 --- a/src/framework/MOM_error_handler.F90 +++ b/src/framework/MOM_error_handler.F90 @@ -3,7 +3,12 @@ module MOM_error_handler ! This file is part of MOM6. See LICENSE.md for the license. +use MOM_coms_infra, only : num_PEs use MOM_error_infra, only : MOM_err, is_root_pe, stdlog, stdout, NOTE, WARNING, FATAL +use posix, only : getpid, getppid, handler_interface +use posix, only : signal, kill, SIGUSR1 +use posix, only : sigjmp_buf, siglongjmp +use posix, only : sleep implicit none ; private @@ -15,6 +20,7 @@ module MOM_error_handler public :: is_root_pe, stdlog, stdout !> Integer parameters encoding the severity of an error message public :: NOTE, WARNING, FATAL +public :: disable_fatal_errors, enable_fatal_errors integer :: verbosity = 6 !< Verbosity level: @@ -40,6 +46,19 @@ module MOM_error_handler integer :: callTreeIndentLevel = 0 !< The level of calling within the call tree +! Error handling + +logical :: ignore_fatal = .false. + !< If true, ignore FATAL errors and jump to a prior state. +integer, parameter :: err_signal = SIGUSR1 + !< Signal used to trigger the error handler +integer :: err_pid + !< Process ID for the error handler (either self or MPI launcher) +procedure(handler_interface), pointer :: prior_handler + !< The default signal handler used before signal() setup (usually SIG_DFT) +type(sigjmp_buf) :: prior_env + !< Buffer containing the program state to be recovered by longjmp + contains !> This provides a convenient interface for writing an informative comment, depending @@ -61,6 +80,49 @@ subroutine MOM_mesg(message, verb, all_print) end subroutine MOM_mesg +!> Enable error handling, replacing FATALs in MOM_error with err_handler. +subroutine disable_fatal_errors(env) + type(sigjmp_buf), intent(in) :: env + !> Process recovery state after FATAL errors + + integer :: rc + integer :: sig + + ignore_fatal = .true. + + ! TODO: Only need to call this once; move to an init() function? + if (num_PEs() > 1) then + err_pid = getppid() + else + err_pid = getpid() + endif + + ! Store the program state + prior_env = env + + ! Setup the signal handler + ! NOTE: Passing parameters to signal() in GFortran causes a compiler error. + ! We avert this by copying err_signal to a variable. + sig = err_signal + ! TODO: Use sigaction() in place of signal() + prior_handler => signal(sig, err_handler) +end subroutine disable_fatal_errors + +!> Disable the error handler and abort on FATAL +subroutine enable_fatal_errors() + integer :: rc + integer :: sig + procedure(handler_interface), pointer :: dummy + + ignore_fatal = .false. + err_pid = -1 ! NOTE: 0 might be safer, since it's unusable. + + ! Restore the original signal handler (usually SIG_DFT). + sig = err_signal + ! NOTE: As above, we copy the err_signal to accommodate GFortran. + dummy => signal(sig, prior_handler) +end subroutine enable_fatal_errors + !> This provides a convenient interface for writing an error message !! with run-time filter based on a verbosity and the severity of the error. subroutine MOM_error(level, message, all_print) @@ -71,6 +133,7 @@ subroutine MOM_error(level, message, all_print) ! This provides a convenient interface for writing an error message ! with run-time filter based on a verbosity. logical :: write_msg + integer :: rc write_msg = is_root_pe() if (present(all_print)) write_msg = write_msg .or. all_print @@ -81,6 +144,15 @@ subroutine MOM_error(level, message, all_print) case (WARNING) if (write_msg.and.verbosity>=1) call MOM_err(WARNING, message) case (FATAL) + if (ignore_fatal) then + print *, "(FATAL): " // message + rc = kill(err_pid, err_signal) + ! NOTE: MPI launchers require, in their words, "a few seconds" to + ! propagate the signal to the nodes, so we wait here to avoid + ! anomalous FATAL calls. + ! In practice, the signal will take control before sleep() completes. + rc = sleep(3) + endif if (verbosity>=0) call MOM_err(FATAL, message) case default call MOM_err(level, message) @@ -180,7 +252,13 @@ subroutine assert(logical_arg, msg) if (.not. logical_arg) then call MOM_error(FATAL, msg) endif - end subroutine assert +!> Restore the process state via longjmp after receiving a signal. +subroutine err_handler(sig) + integer, intent(in) :: sig + !< Signal passed to the handler (unused) + call siglongjmp(prior_env, 1) +end subroutine + end module MOM_error_handler diff --git a/src/framework/posix.F90 b/src/framework/posix.F90 new file mode 100644 index 0000000000..0a534db6c0 --- /dev/null +++ b/src/framework/posix.F90 @@ -0,0 +1,347 @@ +!> Interface to the libc POSIX API +#include "posix.h" + +module posix + +use, intrinsic :: iso_c_binding, only : c_char +use, intrinsic :: iso_c_binding, only : c_int +use, intrinsic :: iso_c_binding, only : c_long +use, intrinsic :: iso_c_binding, only : c_null_char +use, intrinsic :: iso_c_binding, only : c_funptr +use, intrinsic :: iso_c_binding, only : c_funloc +use, intrinsic :: iso_c_binding, only : c_f_procpointer + +implicit none + +!> Container for the jump point buffer created by setjmp(). +!! +!! The buffer typically contains the current register values, stack pointers, +!! and any information required to restore the process state. +type, bind(c) :: jmp_buf + private + character(kind=c_char) :: state(JMP_BUF_SIZE) + !< Unstructured array of bytes used to store the process state +end type jmp_buf + +!> Container for the jump point buffer (with signals) created by sigsetjmp() +!! +!! In addition to the content stored by `jmp_buf`, it also stores signal state. +type, bind(c) :: sigjmp_buf + private + character(kind=c_char) :: state(SIGJMP_BUF_SIZE) + !< Unstructured array of bytes used to store the process state +end type sigjmp_buf + +! POSIX signals +integer, parameter :: SIGUSR1 = POSIX_SIGUSR1 + !< Signal number for SIGUSR1 (user-defined signal 1) + +interface + !> C interface to POSIX chmod() + !! Users should use the Fortran-defined chmod() function. + function chmod_posix(path, mode) result(rc) bind(c, name="chmod") + ! #include + ! int chmod(const char *path, mode_t mode); + import :: c_char, c_int + + character(kind=c_char), dimension(*), intent(in) :: path + !< Zero-delimited file path + integer(kind=c_int), value, intent(in) :: mode + !< File permission to be assigned to file. + integer(kind=c_int) :: rc + !< Function return code + end function chmod_posix + + !> C interface to POSIX signal() + !! Users should use the Fortran-defined signal() function. + function signal_posix(sig, func) result(handle) bind(c, name="signal") + ! #include + ! void (*signal(int sig, void (*func)(int)))(int); + import :: c_int, c_funptr + + integer(kind=c_int), value, intent(in) :: sig + !< Signal to be configured + type(c_funptr), value, intent(in) :: func + !< Function handle to be called when `sig` is raised + type(c_funptr) :: handle + !< Prior handle for sig to be replaced by `func` + end function signal_posix + + !> C interface to POSIX kill() + !! Users should use the Fortran-defined kill() function. + function kill_posix(pid, sig) result(rc) bind(c, name="kill") + ! #include + ! int kill(pid_t pid, int sig); + import :: c_int + + integer(kind=c_int), value, intent(in) :: pid + !< Process ID which is to receive the signal + integer(kind=c_int), value, intent(in) :: sig + !< Signal to be sent to the process + integer(kind=c_int) :: rc + !< Function return code + end function kill_posix + + !> C interface to POSIX getpid() + !! Users should use the Fortran-defined getpid() function. + function getpid_posix() result(pid) bind(c, name="getpid") + ! #include + ! pid_t getpid(void); + import :: c_long + + integer(kind=c_long) :: pid + !< Process ID of the current process. + end function getpid_posix + + !> C interface to POSIX getppid() + !! Users should use the Fortran-defined getppid() function. + function getppid_posix() result(pid) bind(c, name="getppid") + ! #include + ! pid_t getppid(void); + import :: c_long + + integer(kind=c_long) :: pid + !< Process ID of the parent process to the current process. + end function getppid_posix + + !> C interface to POSIX sleep() + !! Users should use the Fortran-defined sleep() function. + function sleep_posix(seconds) result(rc) bind(c, name="sleep") + ! #include + ! unsigned int sleep(unsigned int seconds); + import :: c_int + + integer(kind=c_int), value, intent(in) :: seconds + !< Number of real-time seconds which the thread should sleep + integer(kind=c_int) :: rc + !< Function return code + end function + + ! NOTE: The C setjmp and sigsetjmp functions *must* be called explicitly by + ! the Fortran code, rather than through a wrapper Fortran function. + ! + ! Otherwise, setjmp() will capture the stack inside the wrapper, rather than + ! the point where setjmp() is called. + ! + ! Hence, we remove the `_posix` suffix and call these explicitly. + ! (The integer kind <-> c_int conversion will need to be addressed.) + + ! NOTE: POSIX explicitly says setjmp/sigsetjmp may be either a function or a + ! macro, and thus bind() may point to a nonexistent function. + ! e.g. sigsetjmp is a macro to __sigsetjmp in glibc, so we use a macro. + + !> Save the current program execution state to `env`. + !! + !! This function creates a snapshot of the process state to `env`, which can + !! be restored by calling `longjmp`. When `setjmp` is called, the function + !! returns 0. When `longjmp` is later called, the program is restored to the + !! point where `setjmp` was called, except it now returns a value (rc) as + !! specified by `longjmp`. + function setjmp(env) result(rc) bind(c, name="setjmp") + ! #include + ! int setjmp(jmp_buf env); + import :: jmp_buf, c_int + + type(jmp_buf), intent(in) :: env + !< Current process state + integer(kind=c_int) :: rc + !< Function return code; set to 0 if setjmp() was called, otherwise + !! specified by the corresponding longjmp() call. + end function setjmp + + !> Save the current execution and ,optionally, the signal state to `env`. + !! + !! This function creates a snapshot of the process state to `env`, which can + !! be restored by calling `longjmp`. When `setjmp` is called, the function + !! returns 0. When `longjmp` is later called, the program is restored to the + !! point where `setjmp` was called, except it now returns a value (rc) as + !! specified by `longjmp`. + !! + !! If `savesigs` is set to a nonzero value, then the signal state is included + !! in the program state. + function sigsetjmp(env, savesigs) result(rc) bind(c, name=SIGSETJMP_NAME) + ! #include + ! int sigsetjmp(jmp_buf env, int savesigs); + import :: sigjmp_buf, c_int + + type(sigjmp_buf), intent(in) :: env + !< Current process state + integer(kind=c_int), value, intent(in) :: savesigs + !< Flag to enable signal state when set to a nonzero value + integer(kind=c_int) :: rc + !< Function return code; set to 0 if sigsetjmp() was called, otherwise + !! specified by the corresponding siglongjmp() call. + end function sigsetjmp + + !> C interface to POSIX longjmp() + !! Users should use the Fortran-defined longjmp() function. + subroutine longjmp_posix(env, val) bind(c, name="longjmp") + ! #include + ! int longjmp(jmp_buf env, int val); + import :: jmp_buf, c_int + + type(jmp_buf), intent(in) :: env + !< Process state to restore + integer(kind=c_int), value, intent(in) :: val + !< Return code sent to setjmp() + end subroutine longjmp_posix + + !> C interface to POSIX siglongjmp() + !! Users should use the Fortran-defined siglongjmp() function. + subroutine siglongjmp_posix(env, val) bind(c, name="longjmp") + ! #include + ! int siglongjmp(jmp_buf env, int val); + import :: sigjmp_buf, c_int + + type(sigjmp_buf), intent(in) :: env + !< Process state to restore + integer(kind=c_int), value, intent(in) :: val + !< Return code sent to sigsetjmp() + end subroutine siglongjmp_posix + + ! Note on types: + ! mode_t: + ! "According to POSIX, it shall be an integer type." + ! pid_t: + ! "According to POSIX, it shall be a signed integer type, and the + ! implementation shall support one or more programming environments where + ! the width of pid_t is no greater than the width of the type long. + ! jmp_buf: + ! This is a strongly platform-dependent variable, large enough to contain + ! a complete copy of the process execution state (registers, stack, etc). + ! sigjmp_buf: + ! A more comprehensive version of jmp_buf which contains signal state. +end interface + +abstract interface + !> Function interface for signal handlers + subroutine handler_interface(sig) + integer, intent(in) :: sig + !> Input signal to handler + end subroutine +end interface + +contains + +!> Change mode of a file +!! +!! This changes the file permission of file `path` to `mode` following POSIX +!! conventions. If successful, it returns zero. Otherwise, it returns -1. +function chmod(path, mode) result(rc) + character(len=*), intent(in) :: path + integer, intent(in) :: mode + integer :: rc + + integer(kind=c_int) :: mode_c + integer(kind=c_int) :: rc_c + + mode_c = int(mode, kind=c_int) + rc_c = chmod_posix(path//c_null_char, mode_c) + rc = int(rc_c) +end function chmod + +!> Create a signal handler `handle` to be called when `sig` is detected. +!! +!! If successful, the previous handler for `sig` is returned. Otherwise, +!! SIG_ERR is returned. +function signal(sig, func) result(handle) + integer, intent(in) :: sig + procedure(handler_interface) :: func + procedure(handler_interface), pointer :: handle + + integer(kind=c_int) :: sig_c + type(c_funptr) :: handle_c + + sig_c = int(sig, kind=c_int) + handle_c = signal_posix(sig_c, c_funloc(func)) + call c_f_procpointer(handle_c, handle) +end function signal + +!> Send signal `sig` to process `pid`. +!! +!! If successful, this function returns 0. Otherwise, it returns -1. +function kill(pid, sig) result(rc) + integer, intent(in) :: pid + integer, intent(in) :: sig + integer :: rc + + integer(kind=c_int) :: pid_c, sig_c, rc_c + + pid_c = int(pid, kind=c_int) + sig_c = int(sig, kind=c_int) + rc_c = kill_posix(pid_c, sig_c) + rc = int(rc_c) +end function kill + +!> Get the ID of the current process. +function getpid() result(pid) + integer :: pid + + integer(kind=c_long) :: pid_c + + pid_c = getpid_posix() + pid = int(pid_c) +end function getpid + +!> Get the ID of the parent process of the current process. +function getppid() result(pid) + integer :: pid + + integer(kind=c_long) :: pid_c + + pid_c = getppid_posix() + pid = int(pid_c) +end function getppid + +!> Force the process to a sleep state for `seconds` seconds. +!! +!! The sleep state may be interrupted by a signal. If it sleeps for the entire +!! duration, then it returns 0. Otherwise, it returns the number of seconds +!! remaining at the point of interruption. +function sleep(seconds) result(rc) + ! NOTE: This function may replace an existing compiler `sleep()` extension. + integer, intent(in) :: seconds + integer :: rc + + integer(kind=c_int) :: seconds_c + integer(kind=c_int) :: rc_c + + seconds_c = int(seconds, kind=c_int) + rc_c = sleep_posix(seconds_c) + rc = int(rc_c) +end function sleep + +!> Restore program to state saved by `env`, and return the value `val`. +!! +!! This "nonlocal goto" alters program execution to the state stored in `env` +!! produced by a prior execution of `setjmp`. Program execution is moved +!! back to this `setjmp`, except the function will now return `val`. +subroutine longjmp(env, val) + type(jmp_buf), intent(in) :: env + integer, intent(in) :: val + + integer(kind=c_int) :: val_c + + val_c = int(val, kind=c_int) + call longjmp_posix(env, val_c) +end subroutine longjmp + +!> Restore program to state saved by `env`, and return the value `val`. +!! +!! This "nonlocal goto" alters program execution to the state stored in `env` +!! produced by a prior execution of `setjmp`. Program execution is moved back +!! to this `setjmp`, except the function will now return `val`. +!! +!! `siglongjmp` behaves in the same manner as `longjmp`, but also provides +!! predictable handling of the signal state. +subroutine siglongjmp(env, val) + type(sigjmp_buf), intent(in) :: env + integer, intent(in) :: val + + integer(kind=c_int) :: val_c + + val_c = int(val, kind=c_int) + call siglongjmp_posix(env, val_c) +end subroutine siglongjmp + +end module posix diff --git a/src/framework/posix.h b/src/framework/posix.h new file mode 100644 index 0000000000..6d6012c5e7 --- /dev/null +++ b/src/framework/posix.h @@ -0,0 +1,28 @@ +#ifndef MOM6_POSIX_H_ +#define MOM6_POSIX_H_ + +! JMP_BUF_SIZE should be set to sizeof(jmp_buf). +! If unset, then use a typical glibc value (25 long ints) +#ifndef JMP_BUF_SIZE +#define JMP_BUF_SIZE 200 +#endif + +! If unset, assume jmp_buf and sigjmp_buf are equivalent (as in glibc). +#ifndef SIGJMP_BUF_SIZE +#define SIGJMP_BUF_SIZE JMP_BUF_SIZE +#endif + +! glibc defines sigsetjmp as __sigsetjmp via macro readable from . +! Perhaps autoconf can configure this one... +! TODO: Need a solution here! +#ifndef SIGSETJMP_NAME +#define SIGSETJMP_NAME "__sigsetjmp" +#endif + +! This should be defined by /usr/include/signal.h +! If unset, we use the most common (x86) value +#ifndef POSIX_SIGUSR1 +#define POSIX_SIGUSR1 10 +#endif + +#endif