Skip to content

Commit

Permalink
Revert part of c1bbbb5 to restore the usock component, thus fixing sh…
Browse files Browse the repository at this point in the history
…ow_help aggregation.

Fixes #1467

Restore debugger attach operations

Fixes #1225
  • Loading branch information
Ralph Castain committed Mar 19, 2016
1 parent e020566 commit c146c49
Show file tree
Hide file tree
Showing 22 changed files with 3,748 additions and 162 deletions.
2 changes: 1 addition & 1 deletion ompi/mca/rte/orte/rte_orte.h
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ OMPI_DECLSPEC void __opal_attribute_noreturn__
#define OMPI_ERROR_LOG ORTE_ERROR_LOG

/* Init and finalize objects and operations */
OMPI_DECLSPEC int ompi_rte_init(int *pargc, char ***pargv);
#define ompi_rte_init(a, b) orte_init(a, b, ORTE_PROC_MPI)
#define ompi_rte_finalize() orte_finalize()
OMPI_DECLSPEC void ompi_rte_wait_for_debugger(void);

Expand Down
101 changes: 21 additions & 80 deletions ompi/mca/rte/orte/rte_orte_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -52,79 +52,6 @@

extern ompi_rte_orte_component_t mca_rte_orte_component;

/*
 * Bookkeeping shared between ompi_rte_init() and the completion callback
 * it hands to opal_pmix.register_errhandler().
 */
typedef struct {
    volatile bool active;   /* set true by the requester, cleared by the callback */
    int status;             /* status value reported to the callback */
    int errhandler;         /* errhandler reference reported to the callback */
} errhandler_t;

/*
 * Registration-complete callback.
 *
 * status    - result of the registration request
 * errhndler - errhandler reference assigned to the registration
 * cbdata    - pointer to the caller's errhandler_t
 *
 * Stores both reported values in the tracker and only then clears its
 * `active` flag, so the flag is always the last field written.
 */
static void register_cbfunc(int status, int errhndler, void *cbdata)
{
    errhandler_t *tracker = cbdata;

    tracker->errhandler = errhndler;
    tracker->status = status;

    /* must stay last: the requester waits on this flag */
    tracker->active = false;
}

/* Starts out true and is cleared by notify_cbfunc();
 * ompi_rte_wait_for_debugger() waits on it in the non-standalone case. */
static volatile bool wait_for_release = true;
/* Errhandler reference saved by ompi_rte_init() once registration has
 * completed, and later passed to opal_pmix.deregister_errhandler().
 * Stays at -1 until then. */
static int errhandler = -1;

/*
 * Notification callback handed to opal_pmix.register_errhandler() by
 * ompi_rte_init().
 *
 * If a release callback was supplied it is invoked with cbdata first;
 * afterwards wait_for_release is cleared. The status, procs and info
 * arguments are not examined.
 */
static void notify_cbfunc(int status,
                          opal_list_t *procs,
                          opal_list_t *info,
                          opal_pmix_release_cbfunc_t cbfunc,
                          void *cbdata)
{
    const bool have_release_cb = (NULL != cbfunc);

    if (have_release_cb) {
        cbfunc(cbdata);
    }

    /* cleared only after the release callback has run */
    wait_for_release = false;
}


/*
 * Initialize the run-time layer for an MPI process.
 *
 * pargc, pargv - forwarded unchanged to orte_init()
 *
 * Returns the orte_init() error code if that call fails, OMPI_ERROR if
 * the errhandler registration reports a status other than OPAL_SUCCESS,
 * and OMPI_SUCCESS otherwise.
 *
 * Unless orte_standalone_operation is set, an errhandler keyed on
 * OPAL_ERR_DEBUGGER_RELEASE is registered and the resulting reference is
 * stored in the file-scope `errhandler`.
 */
int ompi_rte_init(int *pargc, char ***pargv)
{
    int rc;
    opal_list_t directives;
    opal_value_t release_code;
    errhandler_t reg;

    rc = orte_init(pargc, pargv, ORTE_PROC_MPI);
    if (ORTE_SUCCESS != rc) {
        return rc;
    }

    /* nothing to register when running standalone */
    if (orte_standalone_operation) {
        return OMPI_SUCCESS;
    }

    /* assume failure until register_cbfunc tells us otherwise */
    reg.active = true;
    reg.status = ORTE_ERROR;
    reg.errhandler = -1;

    /* build a one-element list naming the debugger-release code */
    OBJ_CONSTRUCT(&directives, opal_list_t);
    OBJ_CONSTRUCT(&release_code, opal_value_t);
    release_code.key = strdup(OPAL_PMIX_ERROR_NAME);
    release_code.type = OPAL_INT;
    release_code.data.integer = OPAL_ERR_DEBUGGER_RELEASE;
    opal_list_append(&directives, &release_code.super);

    opal_pmix.register_errhandler(&directives, notify_cbfunc, register_cbfunc, &reg);

    /* keep the MPI progress engine turning until register_cbfunc
     * clears the flag */
    OMPI_WAIT_FOR_COMPLETION(reg.active);

    /* take the stack-allocated value off the list before either
     * object is destructed */
    opal_list_remove_first(&directives);
    OBJ_DESTRUCT(&release_code);
    OBJ_DESTRUCT(&directives);

    if (OPAL_SUCCESS != reg.status) {
        /* registration failed - we cannot continue */
        ORTE_ERROR_LOG(reg.status);
        return OMPI_ERROR;
    }

    errhandler = reg.errhandler;
    return OMPI_SUCCESS;
}

void ompi_rte_abort(int error_code, char *fmt, ...)
{
va_list arglist;
Expand Down Expand Up @@ -173,10 +100,10 @@ void ompi_rte_abort(int error_code, char *fmt, ...)
* attaching debuggers -- see big comment in
* orte/tools/orterun/debuggers.c explaining the two scenarios.
*/

void ompi_rte_wait_for_debugger(void)
{
int debugger;
orte_rml_recv_cb_t xfer;

/* See lengthy comment in orte/tools/orterun/debuggers.c about
orte_in_parallel_debugger */
Expand All @@ -186,16 +113,16 @@ void ompi_rte_wait_for_debugger(void)
debugger = 1;
}

if (!debugger) {
if (!debugger && NULL == getenv("ORTE_TEST_DEBUGGER_ATTACH")) {
/* if not, just return */
return;
}

/* if we are being debugged, then we need to find
* the correct plug-ins
*/
ompi_debugger_setup_dlls();

/* wait for the debugger to attach */
if (orte_standalone_operation) {
/* spin until debugger attaches and releases us */
while (MPIR_debug_gate == 0) {
Expand All @@ -206,9 +133,23 @@ void ompi_rte_wait_for_debugger(void)
#endif
}
} else {
/* now wait for the notification to occur */
OMPI_WAIT_FOR_COMPLETION(wait_for_release);
/* deregister the errhandler */
opal_pmix.deregister_errhandler(errhandler, NULL, NULL);
/* only the rank=0 proc waits for either a message from the
* HNP or for the debugger to attach - everyone else will just
* spin in the grpcomm barrier in ompi_mpi_init until rank=0
* joins them.
*/
if (0 != ORTE_PROC_MY_NAME->vpid) {
return;
}

/* VPID 0 waits for a message from the HNP */
OBJ_CONSTRUCT(&xfer, orte_rml_recv_cb_t);
xfer.active = true;
orte_rml.recv_buffer_nb(OMPI_NAME_WILDCARD,
ORTE_RML_TAG_DEBUGGER_RELEASE,
ORTE_RML_NON_PERSISTENT,
orte_rml_recv_callback, &xfer);
/* let the MPI progress engine run while we wait */
OMPI_WAIT_FOR_COMPLETION(xfer.active);
}
}
87 changes: 84 additions & 3 deletions orte/mca/ess/base/ess_base_std_app.c
Original file line number Diff line number Diff line change
Expand Up @@ -47,9 +47,14 @@
#include "opal/runtime/opal.h"
#include "opal/runtime/opal_cr.h"

#include "orte/mca/rml/base/base.h"
#include "orte/mca/routed/base/base.h"
#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/dfs/base/base.h"
#include "orte/mca/grpcomm/base/base.h"
#include "orte/mca/oob/base/base.h"
#include "orte/mca/rml/rml.h"
#include "orte/mca/qos/base/base.h"
#include "orte/mca/odls/odls_types.h"
#include "orte/mca/filem/base/base.h"
#include "orte/mca/errmgr/base/base.h"
Expand Down Expand Up @@ -169,14 +174,84 @@ int orte_ess_base_app_setup(bool db_restrict_local)
}
OBJ_DESTRUCT(&kv);
}

/* Setup the communication infrastructure */
/*
* OOB Layer
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_oob_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_oob_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_oob_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_oob_base_select";
goto error;
}
/* Runtime Messaging Layer */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_rml_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_rml_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_rml_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_rml_base_select";
goto error;
}
/* Messaging QoS Layer */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_qos_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_qos_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_qos_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_qos_base_select";
goto error;
}
/* setup the errmgr */
if (ORTE_SUCCESS != (ret = orte_errmgr_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_errmgr_base_select";
goto error;
}

/* Routed system */
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_routed_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_routed_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_routed_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_routed_base_select";
goto error;
}
/*
* Group communications
*/
if (ORTE_SUCCESS != (ret = mca_base_framework_open(&orte_grpcomm_base_framework, 0))) {
ORTE_ERROR_LOG(ret);
error = "orte_grpcomm_base_open";
goto error;
}
if (ORTE_SUCCESS != (ret = orte_grpcomm_base_select())) {
ORTE_ERROR_LOG(ret);
error = "orte_grpcomm_base_select";
goto error;
}
/* enable communication via the rml */
if (ORTE_SUCCESS != (ret = orte_rml.enable_comm())) {
ORTE_ERROR_LOG(ret);
error = "orte_rml.enable_comm";
goto error;
}
/* setup the routed info */
if (ORTE_SUCCESS != (ret = orte_routed.init_routes(ORTE_PROC_MY_NAME->jobid, NULL))) {
ORTE_ERROR_LOG(ret);
error = "orte_routed.init_routes";
goto error;
}
#if OPAL_ENABLE_FT_CR == 1
/*
* Setup the SnapC
Expand Down Expand Up @@ -247,7 +322,13 @@ int orte_ess_base_app_finalize(void)
(void) mca_base_framework_close(&orte_filem_base_framework);
(void) mca_base_framework_close(&orte_errmgr_base_framework);

/* now can close the rml and its friendly group comm */
(void) mca_base_framework_close(&orte_grpcomm_base_framework);
(void) mca_base_framework_close(&orte_dfs_base_framework);
(void) mca_base_framework_close(&orte_routed_base_framework);

(void) mca_base_framework_close(&orte_rml_base_framework);
(void) mca_base_framework_close(&orte_oob_base_framework);
(void) mca_base_framework_close(&orte_state_base_framework);

orte_session_dir_finalize(ORTE_PROC_MY_NAME);
Expand Down Expand Up @@ -296,7 +377,7 @@ void orte_ess_base_app_abort(int status, bool report)
* the message if routing is enabled as this indicates we
* have someone to send to
*/
if (report && orte_create_session_dirs) {
if (report && orte_routing_is_enabled && orte_create_session_dirs) {
myfile = opal_os_path(false, orte_process_info.proc_session_dir, "aborted", NULL);
fd = open(myfile, O_CREAT, S_IRUSR);
close(fd);
Expand Down
13 changes: 13 additions & 0 deletions orte/mca/ess/pmi/ess_pmi_module.c
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,8 @@
#include "opal/mca/pmix/base/base.h"

#include "orte/mca/errmgr/errmgr.h"
#include "orte/mca/grpcomm/grpcomm.h"
#include "orte/mca/rml/rml.h"
#include "orte/util/proc_info.h"
#include "orte/util/show_help.h"
#include "orte/util/name_fns.h"
Expand Down Expand Up @@ -85,6 +87,7 @@ static int rte_init(void)
char *envar, *ev1, *ev2;
uint64_t unique_key[2];
char *string_key;
char *rmluri;
opal_value_t *kv;
char *val;
int u32, *u32ptr;
Expand Down Expand Up @@ -379,6 +382,16 @@ static int rte_init(void)

/*** PUSH DATA FOR OTHERS TO FIND ***/

/* push our RML URI in case others need to talk directly to us */
rmluri = orte_rml.get_contact_info();
/* push it out for others to use */
OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_PROC_URI, rmluri, OPAL_STRING);
if (ORTE_SUCCESS != ret) {
error = "pmix put uri";
goto error;
}
free(rmluri);

/* push our hostname so others can find us, if they need to */
OPAL_MODEX_SEND_VALUE(ret, OPAL_PMIX_GLOBAL, OPAL_PMIX_HOSTNAME, orte_process_info.nodename, OPAL_STRING);
if (ORTE_SUCCESS != ret) {
Expand Down
56 changes: 56 additions & 0 deletions orte/mca/oob/usock/Makefile.am
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#
# Copyright (c) 2004-2005 The Trustees of Indiana University and Indiana
# University Research and Technology
# Corporation. All rights reserved.
# Copyright (c) 2004-2005 The University of Tennessee and The University
# of Tennessee Research Foundation. All rights
# reserved.
# Copyright (c) 2004-2009 High Performance Computing Center Stuttgart,
# University of Stuttgart. All rights reserved.
# Copyright (c) 2004-2005 The Regents of the University of California.
# All rights reserved.
# Copyright (c) 2010 Cisco Systems, Inc. All rights reserved.
# Copyright (c) 2012-2013 Los Alamos National Security, LLC.
# All rights reserved
# Copyright (c) 2013-2015 Intel, Inc. All rights reserved.
# $COPYRIGHT$
#
# Additional copyrights may follow
#
# $HEADER$
#

# Headers and C sources that make up the usock OOB component. The same
# list feeds both the DSO and the static library targets below.
sources = \
oob_usock_component.h \
oob_usock.h \
oob_usock_component.c \
oob_usock_connection.h \
oob_usock_sendrecv.h \
oob_usock_hdr.h \
oob_usock_peer.h \
oob_usock_ping.h \
oob_usock.c \
oob_usock_connection.c \
oob_usock_sendrecv.c

# Make the output library in this directory, and name it either
# mca_<type>_<name>.la (for DSO builds) or libmca_<type>_<name>.la
# (for static builds).

# Exactly one of component_install / component_noinst is non-empty,
# depending on whether this component is being built as a DSO.
if MCA_BUILD_orte_oob_usock_DSO
component_noinst =
component_install = mca_oob_usock.la
else
component_noinst = libmca_oob_usock.la
component_install =
endif

# DSO build: the module listed in component_install is installed
# into $(ortelibdir).
mcacomponentdir = $(ortelibdir)
mcacomponent_LTLIBRARIES = $(component_install)
mca_oob_usock_la_SOURCES = $(sources)
mca_oob_usock_la_LDFLAGS = -module -avoid-version

# Static build: the library listed in component_noinst is built but
# not installed (noinst_ prefix).
noinst_LTLIBRARIES = $(component_noinst)
libmca_oob_usock_la_SOURCES = $(sources)
libmca_oob_usock_la_LDFLAGS = -module -avoid-version

Loading

0 comments on commit c146c49

Please sign in to comment.